fix: newline in virtual.yaml

Stupid one-line fix, but it will fix CI.

Signed-off-by: Dave <dave@gray101.com>
feat(swagger): update swagger (#2128)
2026-05-19 14:17:21 -04:00 · 2024-04-25 10:39:07 -04:00 · 2024-04-25 16:10:08 +02:00 · 2024-04-25 16:06:18 +02:00 · 2024-04-25 16:05:02 +02:00 · 2024-04-25 15:57:06 +02:00
205 changed files with 6748 additions and 11239 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,11 @@
 .idea
 .github
 .vscode
 models
 examples/chatbot-ui/models
 examples/rwkv/models
 examples/**/models
-Dockerfile*
+Dockerfile*
 # SonarQube
 .scannerwork
--- a/.env
+++ b/.env
@@ -1,33 +1,33 @@
 ## Set number of threads.
 ## Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably.
-# THREADS=14
+# LOCALAI_THREADS=14
 ## Specify a different bind address (defaults to ":8080")
-# ADDRESS=127.0.0.1:8080
+# LOCALAI_ADDRESS=127.0.0.1:8080
 ## Default models context size
-# CONTEXT_SIZE=512
+# LOCALAI_CONTEXT_SIZE=512
 #
 ## Define galleries.
 ## Models available to install will be visible in `/models/available`
-# GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
+# LOCALAI_GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
 ## CORS settings
-# CORS=true
+# LOCALAI_CORS=true
-# CORS_ALLOW_ORIGINS=*
+# LOCALAI_CORS_ALLOW_ORIGINS=*
 ## Default path for models
 #
-# MODELS_PATH=/models
+# LOCALAI_MODELS_PATH=/models
 ## Enable debug mode
-# DEBUG=true
+# LOCALAI_LOG_LEVEL=debug
 ## Disables COMPEL (Diffusers)
 # COMPEL=0
 ## Enable/Disable single backend (useful if only one GPU is available)
-# SINGLE_ACTIVE_BACKEND=true
+# LOCALAI_SINGLE_ACTIVE_BACKEND=true
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
@@ -46,13 +46,13 @@
 # GO_TAGS=stablediffusion
 ## Path where to store generated images
-# IMAGE_PATH=/tmp
+# LOCALAI_IMAGE_PATH=/tmp/generated/images
 ## Specify a default upload limit in MB (whisper)
-# UPLOAD_LIMIT
+# LOCALAI_UPLOAD_LIMIT=15
 ## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
-# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+# LOCALAI_EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
 ### Advanced settings ###
 ### These are not used by LocalAI itself, but by other components in the stack ###
@@ -72,18 +72,18 @@
 # LLAMACPP_PARALLEL=1
 ### Enable to run parallel requests
-# PARALLEL_REQUESTS=true
+# LOCALAI_PARALLEL_REQUESTS=true
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
-# WATCHDOG_IDLE=true
+# LOCALAI_WATCHDOG_IDLE=true
 #
 # Enables watchdog to kill backends that are busy for too much time
 # WATCHDOG_BUSY=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered idle
-# WATCHDOG_IDLE_TIMEOUT=5m
+# LOCALAI_WATCHDOG_IDLE_TIMEOUT=5m
 #
 # Enables watchdog to kill backends that are busy for too much time
 # LOCALAI_WATCHDOG_BUSY=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered busy
-# WATCHDOG_BUSY_TIMEOUT=5m
+# LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,25 @@
 # https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
 version: 2
 updates:
  - package-ecosystem: "gomod"
    directory: "/"
    schedule:
      interval: "weekly"
  - package-ecosystem: "github-actions"
    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
    directory: "/"
    schedule:
      # Check for updates to GitHub Actions once a week
      interval: "weekly"
  - package-ecosystem: "pip"
    # Location of the pip manifest files (e.g. requirements.txt), relative to the repository root.
    directory: "/"
    schedule:
      # Check for updates to pip dependencies once a week
      interval: "weekly"
  - package-ecosystem: "docker"
    # Location of the Dockerfile(s) to check, relative to the repository root.
    directory: "/"
    schedule:
      # Check for updates to Docker base images once a week
      interval: "weekly"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -49,7 +49,7 @@ jobs:
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -0,0 +1,43 @@
 name: Dependabot auto-merge
 on:
 - pull_request_target
 permissions:
  contents: write
  pull-requests: write
  packages: read
 jobs:
  dependabot:
    runs-on: ubuntu-latest
    if: ${{ github.actor == 'dependabot[bot]' }}
    steps:
      - name: Dependabot metadata
        id: metadata
        uses: dependabot/fetch-metadata@v2.0.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Approve a PR if not already approved
        run: |
          gh pr checkout "$PR_URL"
            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
          then
            gh pr review --approve "$PR_URL"
          else
            echo "PR already approved.";
          fi
        env:
          PR_URL: ${{github.event.pull_request.html_url}}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
      - name: Enable auto-merge for Dependabot PRs
        if: ${{ contains(github.event.pull_request.title, 'bump')}}
        run: gh pr merge --auto --squash "$PR_URL"
        env:
          PR_URL: ${{github.event.pull_request.html_url}}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -0,0 +1,90 @@
 name: 'generate and publish GRPC docker caches'
 on:
 - workflow_dispatch
 concurrency:
  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
  cancel-in-progress: true
 jobs:
  generate_caches:
    strategy:
      matrix:
        include:
          - grpc-base-image: ubuntu:22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
    steps:
      - name: Release space from worker
        if: matrix.runs-on == 'ubuntu-latest'
        run: |
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          df -h
          echo
          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
          sudo rm -rf /usr/local/lib/android
          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
          sudo rm -rf /usr/share/dotnet
          sudo apt-get remove -y '^mono-.*' || true
          sudo apt-get remove -y '^ghc-.*' || true
          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
          sudo apt-get remove -y 'php.*' || true
          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
          sudo apt-get remove -y '^google-.*' || true
          sudo apt-get remove -y azure-cli || true
          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
          sudo apt-get remove -y '^gfortran-.*' || true
          sudo apt-get remove -y microsoft-edge-stable || true
          sudo apt-get remove -y firefox || true
          sudo apt-get remove -y powershell || true
          sudo apt-get remove -y r-base-core || true
          sudo apt-get autoremove -y
          sudo apt-get clean
          echo
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          sudo rm -rfv build || true
          sudo rm -rf /usr/share/dotnet || true
          sudo rm -rf /opt/ghc || true
          sudo rm -rf "/usr/local/share/boost" || true
          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
          df -h
      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
        with:
          platforms: all
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@master
      - name: Checkout
        uses: actions/checkout@v4
      - name: Cache GRPC
        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
          # This means that even the MAKEFLAGS have to be an EXACT match.
          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
          build-args: |
            GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
            MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.58.0
          context: .
          file: ./Dockerfile
          cache-to: type=gha,ignore-error=true
          target: grpc
          platforms: ${{ matrix.platforms }}
          push: false
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -22,6 +22,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -61,12 +62,14 @@ jobs:
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: 'sycl-f16-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -85,6 +88,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -102,11 +106,12 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: 'sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -122,4 +127,4 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -26,6 +26,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
      grpc-base-image: ${{ matrix.grpc-base-image }}
      aio: ${{ matrix.aio }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
@@ -129,6 +130,7 @@ jobs:
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            grpc-base-image: "ubuntu:22.04"
            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
@@ -140,12 +142,14 @@ jobs:
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -158,6 +162,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -171,6 +176,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-core'
            ffmpeg: 'false'
            image-type: 'core'
@@ -180,6 +186,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-core'
            ffmpeg: 'false'
            image-type: 'core'
@@ -189,6 +196,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -198,6 +206,7 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
@@ -210,6 +219,7 @@ jobs:
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
@@ -219,6 +229,7 @@ jobs:
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
@@ -236,6 +247,7 @@ jobs:
      runs-on: ${{ matrix.runs-on }}
      aio: ${{ matrix.aio }}
      base-image: ${{ matrix.base-image }}
      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
@@ -258,7 +270,7 @@ jobs:
            aio: "-aio-cpu"
            latest-image: 'latest-cpu'
            latest-image-aio: 'latest-aio-cpu'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -269,7 +281,7 @@ jobs:
            image-type: 'core'
            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -280,7 +292,7 @@ jobs:
            image-type: 'core'
            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -291,7 +303,7 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -302,4 +314,4 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=5 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -6,6 +6,10 @@ on:
    inputs:
      base-image:
        description: 'Base image'
        required: true
        type: string
      grpc-base-image:
        description: 'GRPC Base image, must be a compatible image with base-image'
        required: false
        default: ''
        type: string
@@ -57,7 +61,7 @@ on:
      makeflags:
        description: 'Make Flags'
        required: false
-        default: '--jobs=3 --output-sync=target'
+        default: '--jobs=4 --output-sync=target'
        type: string
      aio:
        description: 'AIO Image Name'
@@ -201,15 +205,16 @@ jobs:
        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
          # This means that even the MAKEFLAGS have to be an EXACT match.
          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
          build-args: |
-            IMAGE_TYPE=${{ inputs.image-type }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
-            BASE_IMAGE=${{ inputs.base-image }}
+            MAKEFLAGS=--jobs=4 --output-sync=target
            MAKEFLAGS=${{ inputs.makeflags }}
            GRPC_VERSION=v1.58.0
          context: .
          file: ./Dockerfile
          cache-from: type=gha
          cache-to: type=gha,ignore-error=true
          target: grpc
          platforms: ${{ inputs.platforms }}
          push: false
@@ -280,6 +285,7 @@ jobs:
        run: |
          docker pull localai/localai:${{ steps.meta.outputs.version }}
          docker tag localai/localai:${{ steps.meta.outputs.version }} localai/localai:${{ inputs.latest-image }}
          docker push localai/localai:${{ inputs.latest-image }}
          docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
          docker tag quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
          docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
@@ -289,6 +295,7 @@ jobs:
        run: |
          docker pull localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }}
          docker tag localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }} localai/localai:${{ inputs.latest-image-aio }}
          docker push localai/localai:${{ inputs.latest-image-aio }}
          docker pull quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }}
          docker tag quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
          docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
--- a/.github/workflows/localaibot_automerge.yml
+++ b/.github/workflows/localaibot_automerge.yml
@@ -0,0 +1,35 @@
 name: LocalAI-bot auto-merge
 on:
 - pull_request_target
 permissions:
  contents: write
  pull-requests: write
  packages: read
 jobs:
  dependabot:
    runs-on: ubuntu-latest
    if: ${{ github.actor == 'localai-bot' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Approve a PR if not already approved
        run: |
          gh pr checkout "$PR_URL"
            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
          then
            gh pr review --approve "$PR_URL"
          else
            echo "PR already approved.";
          fi
        env:
          PR_URL: ${{github.event.pull_request.html_url}}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
      - name: Enable auto-merge for LocalAIBot PRs
        run: gh pr merge --auto --squash "$PR_URL"
        env:
          PR_URL: ${{github.event.pull_request.html_url}}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,6 +1,8 @@
 name: Build and Release
-on: push
+on: 
 - push
 - pull_request
 env:
  GRPC_VERSION: v1.58.0
@@ -33,14 +35,14 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install build-essential ffmpeg protobuf-compiler
      - name: Install CUDA Dependencies
        if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
        run: |
@@ -55,7 +57,7 @@ jobs:
          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
      - name: Cache grpc
        id: cache-grpc
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: grpc
          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
@@ -75,6 +77,9 @@ jobs:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
          export PATH=$PATH:$GOPATH/bin
          if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
            export BUILD_TYPE=cublas
            export PATH=/usr/local/cuda/bin:$PATH
@@ -82,12 +87,12 @@ jobs:
          else
            STATIC=true make dist
          fi
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.build }}
+          name: LocalAI-linux-${{ matrix.build }}
          path: release/
      - name: Release
-        uses: softprops/action-gh-release@v1
+        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
@@ -100,27 +105,24 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
-          sudo apt-get install -y --no-install-recommends libopencv-dev
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build stablediffusion
        run: |
          export PATH=$PATH:$GOPATH/bin
          make backend-assets/grpc/stablediffusion
          mkdir -p release && cp backend-assets/grpc/stablediffusion release
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
          name: stablediffusion
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v1
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
            release/*
  build-macOS:
    strategy:
@@ -138,13 +140,15 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - uses: actions/setup-go@v4
+      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          brew install protobuf grpc
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build
        id: build
        env:
@@ -153,13 +157,61 @@ jobs:
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
          make dist
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.build }}
+          name: LocalAI-MacOS-${{ matrix.build }}
          path: release/
      - name: Release
-        uses: softprops/action-gh-release@v1
+        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
            release/*
  build-macOS-arm64:
    strategy:
      matrix:
        include:
          - build: 'avx2'
            defines: ''
          - build: 'avx'
            defines: '-DLLAMA_AVX2=OFF'
          - build: 'avx512'
            defines: '-DLLAMA_AVX512=ON'
    runs-on: macos-14
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          brew install protobuf grpc
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build
        id: build
        env:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
          make dist
      - uses: actions/upload-artifact@v4
        with:
          name: LocalAI-MacOS-arm64-${{ matrix.build }}
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -14,14 +14,17 @@ jobs:
      GO111MODULE: on
    steps:
      - name: Checkout Source
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
        uses: securego/gosec@master
        with:
          # We pass -no-fail and let the uploaded report content trigger a failure through the GitHub Security features instead.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
      - name: Upload SARIF file
-        uses: github/codeql-action/upload-sarif@v2
+        if: ${{ github.actor != 'dependabot[bot]' }}
        uses: github/codeql-action/upload-sarif@v3
        with:
          # Path to SARIF file relative to the root of the repository
-          sarif_file: results.sarif
+          sarif_file: results.sarif
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -32,8 +32,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
@@ -61,8 +62,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
@@ -72,6 +74,37 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
  tests-rerankers:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
      - name: Test rerankers
        run: |
           export PATH=$PATH:/opt/conda/bin
           make --jobs=5 --output-sync=target -C backend/python/rerankers
           make --jobs=5 --output-sync=target -C backend/python/rerankers test
  tests-diffusers:
    runs-on: ubuntu-latest
    steps:
@@ -90,8 +123,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
@@ -101,6 +135,35 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/diffusers
           make --jobs=5 --output-sync=target -C backend/python/diffusers test
  # Builds the parler-tts Python backend and runs its `test` make target.
  tests-parler-tts:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      # Installs the build toolchain, registers the Anaconda apt repository and
      # installs conda plus the packages needed to generate the gRPC stubs.
      - name: Dependencies
        run: |
          sudo apt-get update
          # -y keeps the install non-interactive, matching the other installs below.
          sudo apt-get install -y build-essential ffmpeg
          # Import the Anaconda signing key, check its fingerprint, then register
          # the repository exactly once (a second identical entry makes apt warn
          # that the target is configured multiple times).
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools
          # Drop /usr/bin/conda if present; the test step adds /opt/conda/bin to PATH.
          sudo rm -rfv /usr/bin/conda || true
      - name: Test parler-tts
        run: |
           export PATH=$PATH:/opt/conda/bin
           make --jobs=5 --output-sync=target -C backend/python/parler-tts
           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
  tests-transformers-musicgen:
    runs-on: ubuntu-latest
@@ -120,8 +183,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
@@ -151,8 +215,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
  #         pip install --user grpcio-tools
  #         sudo rm -rfv /usr/bin/conda || true
@@ -222,8 +287,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
  #         pip install --user grpcio-tools
  #         sudo rm -rfv /usr/bin/conda || true
@@ -254,8 +320,9 @@ jobs:
  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
  #            sudo apt-get update && \
  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
  #         pip install --user grpcio-tools
  #         sudo rm -rfv /usr/bin/conda || true
  #     - name: Test vllm
  #       run: |
@@ -280,8 +347,9 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
-          sudo apt-get install -y libopencv-dev    
+          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
      - name: Test vall-e-x
        run: |
@@ -307,7 +375,8 @@ jobs:
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng          
+          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
      - name: Test coqui
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -60,7 +60,7 @@ jobs:
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
@@ -70,17 +70,27 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install build-essential curl ffmpeg
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
             sudo apt-get update && \
             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
          sudo apt-get install -y libopencv-dev
-          
+
          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          # The python3-grpc-tools package in 22.04 is too old
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
@@ -89,10 +99,10 @@ jobs:
          GO_TAGS="tts" make -C sources/go-piper piper.o && \
          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
-          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
      - name: Cache grpc
        id: cache-grpc
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: grpc
          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
@@ -108,11 +118,14 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
-          GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
+        uses: mxschmitt/action-tmate@v3.18
-        timeout-minutes: 5
+        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
  tests-aio-container:
    runs-on: ubuntu-latest
@@ -163,8 +176,11 @@ jobs:
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
+        uses: mxschmitt/action-tmate@v3.18
-        timeout-minutes: 5
+        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
  tests-apple:
    runs-on: macOS-14
@@ -177,7 +193,7 @@ jobs:
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
@@ -186,7 +202,8 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc make
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
          pip install --user grpcio-tools
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
@@ -196,5 +213,8 @@ jobs:
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3
+        uses: mxschmitt/action-tmate@v3.18
-        timeout-minutes: 5
+        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -0,0 +1,31 @@
 # Workflow: runs `make swagger` and proposes the resulting changes as a pull request.
 name: Update swagger
 on:
  # Runs on the cron schedule below (minute 0, hour 20, every day) ...
  schedule:
    - cron: 0 20 * * *
  # ... and can also be started manually.
  workflow_dispatch:
 jobs:
  swagger:
    strategy:
      fail-fast: false
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version: 'stable'
      # Install the swag CLI; presumably used by the `make swagger` target - verify in the Makefile.
      - run: |
          go install github.com/swaggo/swag/cmd/swag@latest
      - name: Bump swagger 🔧
        run: |
          make swagger
      # Commits whatever changed (signed off) to the update/swagger branch, pushed
      # to the ci-forks/LocalAI fork, and opens a PR using the bot token.
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: 'feat(swagger): update swagger'
          title: 'feat(swagger): update swagger'
          branch: "update/swagger"
          body:  Update swagger
          signoff: true
--- a/.github/workflows/yaml-check.yml
+++ b/.github/workflows/yaml-check.yml
@@ -0,0 +1,18 @@
 # Workflow: lints the YAML files under `gallery/` on every pull request.
 name: 'Yamllint GitHub Actions'
 on:
  - pull_request
 jobs:
  yamllint:
    name: 'Yamllint'
    runs-on: ubuntu-latest
    steps:
      # Pinned to a release tag (as in the other workflows) instead of the
      # floating `master` ref, so upstream changes cannot silently alter CI.
      - name: 'Checkout'
        uses: actions/checkout@v4
      # NOTE(review): this action still tracks `master`; consider pinning it to
      # a tag or commit SHA once a suitable release has been confirmed.
      - name: 'Yamllint'
        uses: karancode/yamllint-github-action@master
        with:
          # Only the gallery directory is linted; non-strict mode, with the
          # action's PR-comment option turned on.
          yamllint_file_or_dir: 'gallery'
          yamllint_strict: false
          yamllint_comment: true
        env:
          GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,11 @@ backend-assets/*
 !backend-assets/.keep
 prepare
 /ggml-metal.metal
 # Protobuf generated files
 *.pb.go
 *pb2.py
 *pb2_grpc.py
 # SonarQube
 .scannerwork
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-# Contributing to localAI
+# Contributing to LocalAI
 Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.
@@ -29,8 +29,9 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
 1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
 2. Navigate to the project directory: `cd LocalAI`
-3. Install the required dependencies: `make prepare`
+3. Install the required dependencies ( see https://localai.io/basics/build/#build-localai-locally )
-4. Run LocalAI: `make run`
+4. Build LocalAI: `make build`
 5. Run LocalAI: `./local-ai`
 ## Contributing
@@ -59,14 +60,29 @@ If you find a bug, have a feature request, or encounter any issues, please check
 `make test` cannot handle all the models yet. Please be sure to add a test case for new features or for any part that was changed.
 ### Running AIO tests
 All-In-One images have a set of tests that automatically verify that most of the endpoints work correctly. A typical flow is:
 ```bash
 # Build the LocalAI docker image
 make DOCKER_IMAGE=local-ai docker
 # Build the corresponding AIO image
 BASE_IMAGE=local-ai DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
 # Run the AIO e2e tests
 LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio make run-e2e-aio
 ```
 ## Documentation
- We are welcome the contribution of the documents, please open new PR in the official document repo [localai-website](https://github.com/go-skynet/localai-website)
+We welcome contributions to the documentation; please open a new PR or create a new issue. The documentation is available under `docs/`: https://github.com/mudler/LocalAI/tree/master/docs
-
+ 
 ## Community and Communication
 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
---
+---
--- a/65
+++ b/65
@@ -1,8 +1,9 @@
 ARG IMAGE_TYPE=extras
 ARG BASE_IMAGE=ubuntu:22.04
 ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
 # extras or core
-FROM ${BASE_IMAGE} as requirements-core
+FROM ${BASE_IMAGE} AS requirements-core
 USER root
@@ -15,17 +16,30 @@ ARG TARGETVARIANT
 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
 ARG GO_TAGS="stablediffusion tinydream tts"
 RUN apt-get update && \
-    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean
+    apt-get install -y ca-certificates curl python3-pip unzip && apt-get clean
 # Install Go
-RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
+RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH $PATH:/usr/local/go/bin
 # Install grpc compilers
 ENV PATH $PATH:/root/go/bin
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest && \
    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
 # Install protobuf (the version in 22.04 is too old)
 RUN curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
    unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
    rm protoc.zip
 # Install grpcio-tools (the version in 22.04 is too old)
 RUN pip install --user grpcio-tools
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -66,9 +80,10 @@ RUN test -n "$TARGETARCH" \
 ###################################
 ###################################
-FROM requirements-core as requirements-extras
+FROM requirements-core AS requirements-extras
-RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+RUN apt install -y gpg && \
    curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
@@ -90,7 +105,7 @@ RUN if [ ! -e /usr/bin/python ]; then \
 ###################################
 ###################################
-FROM ${BASE_IMAGE} as grpc
+FROM ${GRPC_BASE_IMAGE} AS grpc
 ARG MAKEFLAGS
 ARG GRPC_VERSION=v1.58.0
@@ -100,22 +115,21 @@ ENV MAKEFLAGS=${MAKEFLAGS}
 WORKDIR /build
 RUN apt-get update && \
-    apt-get install -y g++ cmake git && \
+    apt-get install -y build-essential cmake git  && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc
-RUN cd grpc && \
+WORKDIR /build/grpc/cmake/build
-    mkdir -p cmake/build && \
+
-    cd cmake/build && \
+RUN cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
    make
 ###################################
 ###################################
-FROM requirements-${IMAGE_TYPE} as builder
+FROM requirements-${IMAGE_TYPE} AS builder
 ARG GO_TAGS="stablediffusion tts"
 ARG GRPC_BACKENDS
@@ -133,6 +147,12 @@ WORKDIR /build
 COPY . .
 COPY .git .
 RUN echo "GO_TAGS: $GO_TAGS"
 RUN apt-get update && \
    apt-get install -y build-essential cmake git  && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 RUN make prepare
 # If we are building with clblas support, we need the libraries for the builds
@@ -147,9 +167,11 @@ RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
 COPY --from=grpc /build/grpc ./grpc/
-RUN cd /build/grpc/cmake/build && make install
+WORKDIR /build/grpc/cmake/build
 RUN make install
 # Rebuild with defaults backends
 WORKDIR /build
 RUN make build
 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
@@ -191,6 +213,11 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
    apt-get clean \
    ; fi
 RUN apt-get update && \
    apt-get install -y cmake git  && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 WORKDIR /build
 # we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
@@ -202,7 +229,7 @@ COPY . .
 COPY --from=builder /build/sources ./sources/
 COPY --from=grpc /build/grpc ./grpc/
-RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
+RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf /build/grpc
 # Copy the binary
 COPY --from=builder /build/local-ai ./
@@ -232,6 +259,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/sentencetransformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/rerankers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/transformers \
    ; fi
@@ -250,6 +280,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/transformers-musicgen \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/parler-tts \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/coqui \
    ; fi
@@ -259,7 +292,7 @@ RUN mkdir -p /build/models
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
-  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
+  CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
 VOLUME /build/models
 EXPOSE 8080
--- a/228
+++ b/228
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai
 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=cc4a95426d17417d3c83f12bdb514fbe8abe2a88
+CPPLLAMA_VERSION?=784e11dea1f5ce9638851b2b0dddb107e2a609c8
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
-WHISPER_CPP_VERSION?=13c22321d1ac758ce68a429c23104e234b440769
+WHISPER_CPP_VERSION?=858452d58dba3acdc3431c9bced2bb8cfd9bf418
 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
@@ -179,20 +179,20 @@ endif
 all: help
 ## BERT embeddings
-sources/go-bert:
+sources/go-bert.cpp:
-	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
+	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert.cpp
-	cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
+	cd sources/go-bert.cpp && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
-sources/go-bert/libgobert.a: sources/go-bert
+sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
-	$(MAKE) -C sources/go-bert libgobert.a
+	$(MAKE) -C sources/go-bert.cpp libgobert.a
-## go-llama-ggml
+## go-llama.cpp
-sources/go-llama-ggml:
+sources/go-llama.cpp:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama.cpp
-	cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
+	cd sources/go-llama.cpp && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
-sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
+sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-	$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
+	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## go-piper
 sources/go-piper:
@@ -211,12 +211,12 @@ sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
 ## RWKV
-sources/go-rwkv:
+sources/go-rwkv.cpp:
-	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
+	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv.cpp
-	cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+	cd sources/go-rwkv.cpp && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
-sources/go-rwkv/librwkv.a: sources/go-rwkv
+sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
-	cd sources/go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
+	cd sources/go-rwkv.cpp && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
 ## stable diffusion
 sources/go-stable-diffusion:
@@ -236,23 +236,24 @@ sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
 ## whisper
 sources/whisper.cpp:
-	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
+	git clone https://github.com/ggerganov/whisper.cpp sources/whisper.cpp
 	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && make libwhisper.a
-get-sources: sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
+get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream
 replace:
-	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
 	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
@@ -271,12 +272,12 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama-ggml clean
+	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
-	$(MAKE) -C sources/go-rwkv clean
+	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
-	$(MAKE) -C sources/go-bert clean
+	$(MAKE) -C sources/go-bert.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build
@@ -289,10 +290,12 @@ clean: ## Remove build related file
 	rm -rf ./sources
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
-	rm -rf backend-assets
+	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/cpp/llama clean
 	$(MAKE) dropreplace
 	$(MAKE) protogen-clean
 	rmdir pkg/grpc/proto || true
 clean-tests:
 	rm -rf test-models
@@ -416,30 +419,152 @@ help: ## Show this help.
 		else if (/^## .*$$/) {printf "  ${CYAN}%s${RESET}\n", substr($$1,4)} \
 		}' $(MAKEFILE_LIST)
 .PHONY: protogen
 protogen: protogen-go protogen-python
 .PHONY: protogen-clean
 protogen-clean: protogen-go-clean protogen-python-clean
 .PHONY: protogen-go
 protogen-go:
 	mkdir -p pkg/grpc/proto
 	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
    backend/backend.proto
-protogen-python:
+.PHONY: protogen-go-clean
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
+protogen-go-clean:
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
+	$(RM) pkg/grpc/proto/backend.pb.go pkg/grpc/proto/backend_grpc.pb.go
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
+	$(RM) bin/*
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
+
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
+.PHONY: protogen-python
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
+
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/coqui/ --grpc_python_out=backend/python/coqui/ backend/backend.proto
+.PHONY: protogen-python-clean
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
+
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
+.PHONY: autogptq-protogen
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/mamba/ --grpc_python_out=backend/python/mamba/ backend/backend.proto
+autogptq-protogen:
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto
+	$(MAKE) -C backend/python/autogptq protogen
 .PHONY: autogptq-protogen-clean
 autogptq-protogen-clean:
 	$(MAKE) -C backend/python/autogptq protogen-clean
 .PHONY: bark-protogen
 bark-protogen:
 	$(MAKE) -C backend/python/bark protogen
 .PHONY: bark-protogen-clean
 bark-protogen-clean:
 	$(MAKE) -C backend/python/bark protogen-clean
 .PHONY: coqui-protogen
 coqui-protogen:
 	$(MAKE) -C backend/python/coqui protogen
 .PHONY: coqui-protogen-clean
 coqui-protogen-clean:
 	$(MAKE) -C backend/python/coqui protogen-clean
 .PHONY: diffusers-protogen
 diffusers-protogen:
 	$(MAKE) -C backend/python/diffusers protogen
 .PHONY: diffusers-protogen-clean
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 .PHONY: exllama-protogen
 exllama-protogen:
 	$(MAKE) -C backend/python/exllama protogen
 .PHONY: exllama-protogen-clean
 exllama-protogen-clean:
 	$(MAKE) -C backend/python/exllama protogen-clean
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
 .PHONY: exllama2-protogen-clean
 exllama2-protogen-clean:
 	$(MAKE) -C backend/python/exllama2 protogen-clean
 .PHONY: mamba-protogen
 mamba-protogen:
 	$(MAKE) -C backend/python/mamba protogen
 .PHONY: mamba-protogen-clean
 mamba-protogen-clean:
 	$(MAKE) -C backend/python/mamba protogen-clean
 .PHONY: petals-protogen
 petals-protogen:
 	$(MAKE) -C backend/python/petals protogen
 .PHONY: petals-protogen-clean
 petals-protogen-clean:
 	$(MAKE) -C backend/python/petals protogen-clean
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
 .PHONY: rerankers-protogen-clean
 rerankers-protogen-clean:
 	$(MAKE) -C backend/python/rerankers protogen-clean
 .PHONY: sentencetransformers-protogen
 sentencetransformers-protogen:
 	$(MAKE) -C backend/python/sentencetransformers protogen
 .PHONY: sentencetransformers-protogen-clean
 sentencetransformers-protogen-clean:
 	$(MAKE) -C backend/python/sentencetransformers protogen-clean
 .PHONY: transformers-protogen
 transformers-protogen:
 	$(MAKE) -C backend/python/transformers protogen
 .PHONY: transformers-protogen-clean
 transformers-protogen-clean:
 	$(MAKE) -C backend/python/transformers protogen-clean
 .PHONY: parler-tts-protogen
 parler-tts-protogen:
 	$(MAKE) -C backend/python/parler-tts protogen
 .PHONY: parler-tts-protogen-clean
 parler-tts-protogen-clean:
 	$(MAKE) -C backend/python/parler-tts protogen-clean
 .PHONY: transformers-musicgen-protogen
 transformers-musicgen-protogen:
 	$(MAKE) -C backend/python/transformers-musicgen protogen
 .PHONY: transformers-musicgen-protogen-clean
 transformers-musicgen-protogen-clean:
 	$(MAKE) -C backend/python/transformers-musicgen protogen-clean
 .PHONY: vall-e-x-protogen
 vall-e-x-protogen:
 	$(MAKE) -C backend/python/vall-e-x protogen
 .PHONY: vall-e-x-protogen-clean
 vall-e-x-protogen-clean:
 	$(MAKE) -C backend/python/vall-e-x protogen-clean
 .PHONY: vllm-protogen
 vllm-protogen:
 	$(MAKE) -C backend/python/vllm protogen
 .PHONY: vllm-protogen-clean
 vllm-protogen-clean:
 	$(MAKE) -C backend/python/vllm protogen-clean
 ## GRPC
 # Note: it is duplicated in the Dockerfile
-prepare-extra-conda-environments:
+prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
@@ -447,14 +572,16 @@ prepare-extra-conda-environments:
 	$(MAKE) -C backend/python/vllm
 	$(MAKE) -C backend/python/mamba
 	$(MAKE) -C backend/python/sentencetransformers
 	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/transformers-musicgen
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2
-prepare-test-extra:
+prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/diffusers
@@ -478,11 +605,11 @@ backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
-backend-assets/grpc: replace
+backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc
-backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
+backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
 backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
@@ -524,17 +651,16 @@ ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif
-backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libbinding.a backend-assets/grpc
+backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
 backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
 	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
-backend-assets/grpc/rwkv: sources/go-rwkv sources/go-rwkv/librwkv.a backend-assets/grpc
+backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
 backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
@@ -597,4 +723,4 @@ docker-image-intel-xpu:
 .PHONY: swagger
 swagger:
-	swag init -g core/http/api.go --output swagger
+	swag init -g core/http/app.go --output swagger
--- a/README.md
+++ b/README.md
@@ -44,23 +44,19 @@
 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
 ## 🔥🔥 Hot topics / Roadmap
 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
- Landing page: https://github.com/mudler/LocalAI/pull/1922
+- Reranker API: https://github.com/mudler/LocalAI/pull/2121
 - Gallery WebUI: https://github.com/mudler/LocalAI/pull/2104
 - llama3: https://github.com/mudler/LocalAI/discussions/2076
 - Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
 - Openvino support: https://github.com/mudler/LocalAI/pull/1892
 - Vector store: https://github.com/mudler/LocalAI/pull/1795
 - All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
 - Parallel function calling: https://github.com/mudler/LocalAI/pull/1726 / Tools API support: https://github.com/mudler/LocalAI/pull/1715
 - Upload file API: https://github.com/mudler/LocalAI/pull/1703
 - ROCm container images: https://github.com/mudler/LocalAI/pull/1595 / Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
 - Mamba support: https://github.com/mudler/LocalAI/pull/1589
 - Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
 - 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
 - Img2vid https://github.com/mudler/LocalAI/pull/1442
 Hot topics (looking for contributors):
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
--- a/aio/cpu/rerank.yaml
+++ b/aio/cpu/rerank.yaml
@@ -0,0 +1,27 @@
 name: jina-reranker-v1-base-en
 backend: rerankers
 parameters:
  model: cross-encoder
 usage: |
    You can test this model with curl like this:
    curl http://localhost:8080/v1/rerank \
      -H "Content-Type: application/json" \
      -d '{
      "model": "jina-reranker-v1-base-en",
      "query": "Organic skincare products for sensitive skin",
      "documents": [
        "Eco-friendly kitchenware for modern homes",
        "Biodegradable cleaning supplies for eco-conscious consumers",
        "Organic cotton baby clothes for sensitive skin",
        "Natural organic skincare range for sensitive skin",
        "Tech gadgets for smart homes: 2024 edition",
        "Sustainable gardening tools and compost solutions",
        "Sensitive skin-friendly facial cleansers and toners",
        "Organic food wraps and storage solutions",
        "All-natural pet food for dogs with allergies",
        "Yoga mats made from recycled materials"
      ],
      "top_n": 3
    }'
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -6,15 +6,22 @@ parameters:
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
+    {{- if .FunctionCall }}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
+    <tool_call>
-    {{- if .Content}}
+    {{- else if eq .RoleName "tool" }}
-    {{.Content}}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
+    {{- if .Content}}
-    {{- if .FunctionCall }}</tool_call>{{end }}
+    {{.Content }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
+    {{- end }}
-    <|im_end|>
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -29,8 +36,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
+    </tool_call><|im_end|>
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -129,7 +129,7 @@ detect_gpu
 detect_gpu_size
 PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
-export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
 check_vars
--- a/aio/gpu-8g/rerank.yaml
+++ b/aio/gpu-8g/rerank.yaml
@@ -0,0 +1,27 @@
 name: jina-reranker-v1-base-en
 backend: rerankers
 parameters:
  model: cross-encoder
 usage: |
    You can test this model with curl like this:
    curl http://localhost:8080/v1/rerank \
      -H "Content-Type: application/json" \
      -d '{
      "model": "jina-reranker-v1-base-en",
      "query": "Organic skincare products for sensitive skin",
      "documents": [
        "Eco-friendly kitchenware for modern homes",
        "Biodegradable cleaning supplies for eco-conscious consumers",
        "Organic cotton baby clothes for sensitive skin",
        "Natural organic skincare range for sensitive skin",
        "Tech gadgets for smart homes: 2024 edition",
        "Sustainable gardening tools and compost solutions",
        "Sensitive skin-friendly facial cleansers and toners",
        "Organic food wraps and storage solutions",
        "All-natural pet food for dogs with allergies",
        "Yoga mats made from recycled materials"
      ],
      "top_n": 3
    }'
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -6,15 +6,22 @@ parameters:
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
+    {{- if .FunctionCall }}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
+    <tool_call>
-    {{- if .Content}}
+    {{- else if eq .RoleName "tool" }}
-    {{.Content}}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
+    {{- if .Content}}
-    {{- if .FunctionCall }}</tool_call>{{end }}
+    {{.Content }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
+    {{- end }}
-    <|im_end|>
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -29,8 +36,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
+    </tool_call><|im_end|>
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/aio/intel/rerank.yaml
+++ b/aio/intel/rerank.yaml
@@ -0,0 +1,27 @@
 name: jina-reranker-v1-base-en
 backend: rerankers
 parameters:
  model: cross-encoder
 usage: |
    You can test this model with curl like this:
    curl http://localhost:8080/v1/rerank \
      -H "Content-Type: application/json" \
      -d '{
      "model": "jina-reranker-v1-base-en",
      "query": "Organic skincare products for sensitive skin",
      "documents": [
        "Eco-friendly kitchenware for modern homes",
        "Biodegradable cleaning supplies for eco-conscious consumers",
        "Organic cotton baby clothes for sensitive skin",
        "Natural organic skincare range for sensitive skin",
        "Tech gadgets for smart homes: 2024 edition",
        "Sustainable gardening tools and compost solutions",
        "Sensitive skin-friendly facial cleansers and toners",
        "Organic food wraps and storage solutions",
        "All-natural pet food for dogs with allergies",
        "Yoga mats made from recycled materials"
      ],
      "top_n": 3
    }'
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -7,15 +7,22 @@ parameters:
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{- if .FunctionCall }}<tool_call>{{end}}
+    {{- if .FunctionCall }}
-    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
+    <tool_call>
-    {{- if .Content}}
+    {{- else if eq .RoleName "tool" }}
-    {{.Content}}
+    <tool_response>
    {{- end }}
-    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
+    {{- if .Content}}
-    {{- if .FunctionCall }}</tool_call>{{end }}
+    {{.Content }}
-    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
+    {{- end }}
-    <|im_end|>
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -30,8 +37,7 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
-    </tool_call>
+    </tool_call><|im_end|>
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -23,6 +23,30 @@ service Backend {
  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
  rpc Rerank(RerankRequest) returns (RerankResult) {}
 }
 // RerankRequest is the request message of the Backend.Rerank RPC.
 message RerankRequest {
  // Query text; presumably the text the documents are scored against (confirm with the backend).
  string query = 1;
  // Candidate documents submitted together with the query.
  repeated string documents = 2;
  // NOTE(review): looks like the number of top-ranked results to return; confirm with the backend.
  int32 top_n = 3;
 }
 // RerankResult is the response message of the Backend.Rerank RPC.
 message RerankResult {
  // Token accounting for the call.
  Usage usage = 1;
  // One entry per returned document.
  repeated DocumentResult results = 2;
 }
 // Usage carries token counters attached to a RerankResult.
 message Usage {
  int32 total_tokens = 1;
  int32 prompt_tokens = 2;
 }
 // DocumentResult describes a single document entry inside a RerankResult.
 message DocumentResult {
  // Presumably the position of the document in RerankRequest.documents; verify against the backend.
  int32 index = 1;
  string text = 2;
  float relevance_score = 3;
 }
 message StoresKey {
@@ -107,11 +131,15 @@ message PredictOptions {
  string NegativePrompt = 40;
  int32 NDraft = 41;
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
 }
 // The response message containing the result
 message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
 }
 message ModelOptions {
@@ -173,6 +201,7 @@ message ModelOptions {
  bool   EnforceEager = 52;
  int32  SwapSpace = 53;
  int32  MaxModelLen = 54;
  int32  TensorParallelSize = 55;
  string MMProj = 41;
@@ -256,3 +285,8 @@ message StatusResponse {
  State state = 1;
  MemoryUsageData memory = 2;
 }
 message Message {
  string role = 1;
  string content = 2;
 }
--- a/backend/backend_grpc.pb.go
+++ b/backend/backend_grpc.pb.go
@@ -1,457 +0,0 @@
 // Code generated by protoc-gen-go-grpc. DO NOT EDIT.
 // versions:
 // - protoc-gen-go-grpc v1.2.0
 // - protoc             v4.23.4
 // source: backend/backend.proto
 package proto
 import (
 	context "context"
 	grpc "google.golang.org/grpc"
 	codes "google.golang.org/grpc/codes"
 	status "google.golang.org/grpc/status"
 )
 // This is a compile-time assertion to ensure that this generated file
 // is compatible with the grpc package it is being compiled against.
 // Requires gRPC-Go v1.32.0 or later.
 const _ = grpc.SupportPackageIsVersion7
 // BackendClient is the client API for Backend service.
 //
 // Every method takes a context, a request message and optional call options.
 // All methods except PredictStream return a single response message;
 // PredictStream returns a Backend_PredictStreamClient to receive from.
 //
 // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
 type BackendClient interface {
 	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
 	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
 	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
 	// PredictStream is the only streaming method of the interface.
 	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error)
 	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
 	GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error)
 	AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error)
 	TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error)
 	TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error)
 	Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error)
 }
 // backendClient is the BackendClient implementation returned by
 // NewBackendClient; it issues every call through the wrapped connection.
 type backendClient struct {
 	cc grpc.ClientConnInterface
 }
 // NewBackendClient returns a BackendClient that sends its RPCs over cc.
 func NewBackendClient(cc grpc.ClientConnInterface) BackendClient {
 	client := backendClient{cc: cc}
 	return &client
 }
 // Health invokes the unary RPC "/backend.Backend/Health" with in and returns
 // the decoded Reply. On failure it returns a nil Reply and the Invoke error.
 func (c *backendClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
 	reply := new(Reply)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/Health", in, reply, opts...); err != nil {
 		return nil, err
 	}
 	return reply, nil
 }
 // Predict invokes the unary RPC "/backend.Backend/Predict" with in and
 // returns the decoded Reply. On failure it returns a nil Reply and the
 // Invoke error.
 func (c *backendClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
 	reply := new(Reply)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/Predict", in, reply, opts...); err != nil {
 		return nil, err
 	}
 	return reply, nil
 }
 // LoadModel invokes the unary RPC "/backend.Backend/LoadModel" with in and
 // returns the decoded Result. On failure it returns a nil Result and the
 // Invoke error.
 func (c *backendClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
 	result := new(Result)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/LoadModel", in, result, opts...); err != nil {
 		return nil, err
 	}
 	return result, nil
 }
 // PredictStream opens the "/backend.Backend/PredictStream" stream using the
 // first stream descriptor of Backend_ServiceDesc, sends in as the only
 // request message, closes the send side, and returns a client from which
 // replies can be received. Any error from opening, sending or closing is
 // returned with a nil client.
 func (c *backendClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error) {
 	clientStream, err := c.cc.NewStream(ctx, &Backend_ServiceDesc.Streams[0], "/backend.Backend/PredictStream", opts...)
 	if err != nil {
 		return nil, err
 	}
 	wrapped := &backendPredictStreamClient{ClientStream: clientStream}
 	// Send the single request, then half-close; CloseSend only runs if the send succeeded.
 	err = wrapped.ClientStream.SendMsg(in)
 	if err == nil {
 		err = wrapped.ClientStream.CloseSend()
 	}
 	if err != nil {
 		return nil, err
 	}
 	return wrapped, nil
 }
 // Backend_PredictStreamClient is the value returned by
 // BackendClient.PredictStream: a grpc.ClientStream extended with a typed
 // Recv that yields *Reply messages.
 type Backend_PredictStreamClient interface {
 	Recv() (*Reply, error)
 	grpc.ClientStream
 }
 // backendPredictStreamClient implements Backend_PredictStreamClient by
 // embedding the underlying grpc.ClientStream.
 type backendPredictStreamClient struct {
 	grpc.ClientStream
 }
 // Recv reads the next message from the embedded ClientStream into a fresh
 // Reply and returns it. If RecvMsg fails, the error is returned with a nil
 // Reply.
 func (x *backendPredictStreamClient) Recv() (*Reply, error) {
 	reply := new(Reply)
 	err := x.ClientStream.RecvMsg(reply)
 	if err != nil {
 		return nil, err
 	}
 	return reply, nil
 }
 // Embedding invokes the unary RPC "/backend.Backend/Embedding" with in and
 // returns the decoded EmbeddingResult. On failure it returns nil and the
 // Invoke error.
 func (c *backendClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
 	result := new(EmbeddingResult)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/Embedding", in, result, opts...); err != nil {
 		return nil, err
 	}
 	return result, nil
 }
 // GenerateImage invokes the unary RPC "/backend.Backend/GenerateImage" with
 // in and returns the decoded Result. On failure it returns nil and the
 // Invoke error.
 func (c *backendClient) GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error) {
 	result := new(Result)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/GenerateImage", in, result, opts...); err != nil {
 		return nil, err
 	}
 	return result, nil
 }
 // AudioTranscription invokes the unary RPC
 // "/backend.Backend/AudioTranscription" with in and returns the decoded
 // TranscriptResult. On failure it returns nil and the Invoke error.
 func (c *backendClient) AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error) {
 	transcript := new(TranscriptResult)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/AudioTranscription", in, transcript, opts...); err != nil {
 		return nil, err
 	}
 	return transcript, nil
 }
 // TTS invokes the unary RPC "/backend.Backend/TTS" with in and returns the
 // decoded Result. On failure it returns nil and the Invoke error.
 func (c *backendClient) TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error) {
 	result := new(Result)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/TTS", in, result, opts...); err != nil {
 		return nil, err
 	}
 	return result, nil
 }
 // TokenizeString invokes the unary RPC "/backend.Backend/TokenizeString"
 // with in and returns the decoded TokenizationResponse. On failure it
 // returns nil and the Invoke error.
 func (c *backendClient) TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error) {
 	tokens := new(TokenizationResponse)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/TokenizeString", in, tokens, opts...); err != nil {
 		return nil, err
 	}
 	return tokens, nil
 }
 // Status invokes the unary RPC "/backend.Backend/Status" with in and returns
 // the decoded StatusResponse. On failure it returns nil and the Invoke
 // error.
 func (c *backendClient) Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error) {
 	statusResp := new(StatusResponse)
 	if err := c.cc.Invoke(ctx, "/backend.Backend/Status", in, statusResp, opts...); err != nil {
 		return nil, err
 	}
 	return statusResp, nil
 }
 // BackendServer is the server API for Backend service.
 // All implementations must embed UnimplementedBackendServer
 // for forward compatibility
 //
 // Unary methods receive a context and the request message and return the
 // response message; PredictStream instead receives the request and a
 // Backend_PredictStreamServer and returns only an error.
 type BackendServer interface {
 	Health(context.Context, *HealthMessage) (*Reply, error)
 	Predict(context.Context, *PredictOptions) (*Reply, error)
 	LoadModel(context.Context, *ModelOptions) (*Result, error)
 	PredictStream(*PredictOptions, Backend_PredictStreamServer) error
 	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
 	GenerateImage(context.Context, *GenerateImageRequest) (*Result, error)
 	AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error)
 	TTS(context.Context, *TTSRequest) (*Result, error)
 	TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error)
 	Status(context.Context, *HealthMessage) (*StatusResponse, error)
 	// Unexported marker method; UnimplementedBackendServer supplies it.
 	mustEmbedUnimplementedBackendServer()
 }
 // UnimplementedBackendServer must be embedded to have forward compatible implementations.
 //
 // Each of its RPC methods returns a gRPC status error with code
 // codes.Unimplemented and a message naming the method.
 type UnimplementedBackendServer struct {
 }
 // Health reports that the Health RPC is not implemented.
 func (UnimplementedBackendServer) Health(context.Context, *HealthMessage) (*Reply, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
 }
 // Predict reports that the Predict RPC is not implemented.
 func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
 }
 // LoadModel reports that the LoadModel RPC is not implemented.
 func (UnimplementedBackendServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
 }
 // PredictStream reports that the PredictStream RPC is not implemented.
 func (UnimplementedBackendServer) PredictStream(*PredictOptions, Backend_PredictStreamServer) error {
 	return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
 }
 // Embedding reports that the Embedding RPC is not implemented.
 func (UnimplementedBackendServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
 }
 // GenerateImage reports that the GenerateImage RPC is not implemented.
 func (UnimplementedBackendServer) GenerateImage(context.Context, *GenerateImageRequest) (*Result, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method GenerateImage not implemented")
 }
 // AudioTranscription reports that the AudioTranscription RPC is not implemented.
 func (UnimplementedBackendServer) AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method AudioTranscription not implemented")
 }
 // TTS reports that the TTS RPC is not implemented.
 func (UnimplementedBackendServer) TTS(context.Context, *TTSRequest) (*Result, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method TTS not implemented")
 }
 // TokenizeString reports that the TokenizeString RPC is not implemented.
 func (UnimplementedBackendServer) TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method TokenizeString not implemented")
 }
 // Status reports that the Status RPC is not implemented.
 func (UnimplementedBackendServer) Status(context.Context, *HealthMessage) (*StatusResponse, error) {
 	return nil, status.Errorf(codes.Unimplemented, "method Status not implemented")
 }
 // mustEmbedUnimplementedBackendServer is the no-op marker method required by BackendServer.
 func (UnimplementedBackendServer) mustEmbedUnimplementedBackendServer() {}
 // UnsafeBackendServer may be embedded to opt out of forward compatibility for this service.
 // Use of this interface is not recommended, as added methods to BackendServer will
 // result in compilation errors.
 //
 // It declares only the unexported marker method that BackendServer requires.
 type UnsafeBackendServer interface {
 	mustEmbedUnimplementedBackendServer()
 }
 // RegisterBackendServer registers srv with s as the implementation of the
 // service described by Backend_ServiceDesc.
 func RegisterBackendServer(s grpc.ServiceRegistrar, srv BackendServer) {
 	desc := &Backend_ServiceDesc
 	s.RegisterService(desc, srv)
 }
 // _Backend_Health_Handler decodes a HealthMessage with dec and dispatches it to
 // the Health method of srv. When interceptor is non-nil the call is routed
 // through it with FullMethod "/backend.Backend/Health".
 func _Backend_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(HealthMessage)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).Health(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).Health(ctx, msg.(*HealthMessage))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/Health",
 	}, invoke)
 }
 // _Backend_Predict_Handler decodes a PredictOptions with dec and dispatches it
 // to the Predict method of srv. When interceptor is non-nil the call is routed
 // through it with FullMethod "/backend.Backend/Predict".
 func _Backend_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(PredictOptions)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).Predict(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).Predict(ctx, msg.(*PredictOptions))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/Predict",
 	}, invoke)
 }
 // _Backend_LoadModel_Handler decodes a ModelOptions with dec and dispatches it
 // to the LoadModel method of srv. When interceptor is non-nil the call is
 // routed through it with FullMethod "/backend.Backend/LoadModel".
 func _Backend_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(ModelOptions)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).LoadModel(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).LoadModel(ctx, msg.(*ModelOptions))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/LoadModel",
 	}, invoke)
 }
 // _Backend_PredictStream_Handler receives one PredictOptions message from
 // stream and passes it, along with a Send-capable wrapper around stream, to
 // the PredictStream method of srv.
 func _Backend_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
 	opts := new(PredictOptions)
 	if recvErr := stream.RecvMsg(opts); recvErr != nil {
 		return recvErr
 	}
 	sender := &backendPredictStreamServer{ServerStream: stream}
 	return srv.(BackendServer).PredictStream(opts, sender)
 }
 type Backend_PredictStreamServer interface {
 	Send(*Reply) error
 	grpc.ServerStream
 }
 // backendPredictStreamServer wraps a grpc.ServerStream; the embedded stream
 // supplies every grpc.ServerStream method and the Send method defined on this
 // type adds the typed *Reply sender.
 type backendPredictStreamServer struct {
 	grpc.ServerStream
 }
 // Send forwards m to the embedded stream's SendMsg and returns its error.
 func (x *backendPredictStreamServer) Send(m *Reply) error {
 	// SendMsg is promoted from the embedded grpc.ServerStream.
 	return x.SendMsg(m)
 }
 // _Backend_Embedding_Handler decodes a PredictOptions with dec and dispatches
 // it to the Embedding method of srv. When interceptor is non-nil the call is
 // routed through it with FullMethod "/backend.Backend/Embedding".
 func _Backend_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(PredictOptions)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).Embedding(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).Embedding(ctx, msg.(*PredictOptions))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/Embedding",
 	}, invoke)
 }
 // _Backend_GenerateImage_Handler decodes a GenerateImageRequest with dec and
 // dispatches it to the GenerateImage method of srv. When interceptor is
 // non-nil the call is routed through it with FullMethod
 // "/backend.Backend/GenerateImage".
 func _Backend_GenerateImage_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(GenerateImageRequest)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).GenerateImage(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).GenerateImage(ctx, msg.(*GenerateImageRequest))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/GenerateImage",
 	}, invoke)
 }
 // _Backend_AudioTranscription_Handler decodes a TranscriptRequest with dec and
 // dispatches it to the AudioTranscription method of srv. When interceptor is
 // non-nil the call is routed through it with FullMethod
 // "/backend.Backend/AudioTranscription".
 func _Backend_AudioTranscription_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(TranscriptRequest)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).AudioTranscription(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).AudioTranscription(ctx, msg.(*TranscriptRequest))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/AudioTranscription",
 	}, invoke)
 }
 // _Backend_TTS_Handler decodes a TTSRequest with dec and dispatches it to the
 // TTS method of srv. When interceptor is non-nil the call is routed through it
 // with FullMethod "/backend.Backend/TTS".
 func _Backend_TTS_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(TTSRequest)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).TTS(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).TTS(ctx, msg.(*TTSRequest))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/TTS",
 	}, invoke)
 }
 // _Backend_TokenizeString_Handler decodes a PredictOptions with dec and
 // dispatches it to the TokenizeString method of srv. When interceptor is
 // non-nil the call is routed through it with FullMethod
 // "/backend.Backend/TokenizeString".
 func _Backend_TokenizeString_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(PredictOptions)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).TokenizeString(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).TokenizeString(ctx, msg.(*PredictOptions))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/TokenizeString",
 	}, invoke)
 }
 // _Backend_Status_Handler decodes a HealthMessage with dec and dispatches it
 // to the Status method of srv. When interceptor is non-nil the call is routed
 // through it with FullMethod "/backend.Backend/Status".
 func _Backend_Status_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
 	req := new(HealthMessage)
 	if decodeErr := dec(req); decodeErr != nil {
 		return nil, decodeErr
 	}
 	// No interceptor configured: call the service method directly.
 	if interceptor == nil {
 		return srv.(BackendServer).Status(ctx, req)
 	}
 	// The type assertions stay inside the closure so they only run if the
 	// interceptor actually invokes it.
 	invoke := func(ctx context.Context, msg interface{}) (interface{}, error) {
 		return srv.(BackendServer).Status(ctx, msg.(*HealthMessage))
 	}
 	return interceptor(ctx, req, &grpc.UnaryServerInfo{
 		Server:     srv,
 		FullMethod: "/backend.Backend/Status",
 	}, invoke)
 }
 // Backend_ServiceDesc is the grpc.ServiceDesc for the "backend.Backend"
 // service. It maps each unary method name to its handler and declares
 // PredictStream as a server-streaming method.
 //
 // It is only intended for direct use with grpc.RegisterService, and is not
 // meant to be introspected or modified (even as a copy).
 var Backend_ServiceDesc = grpc.ServiceDesc{
 	ServiceName: "backend.Backend",
 	HandlerType: (*BackendServer)(nil),
 	Methods: []grpc.MethodDesc{
 		{MethodName: "Health", Handler: _Backend_Health_Handler},
 		{MethodName: "Predict", Handler: _Backend_Predict_Handler},
 		{MethodName: "LoadModel", Handler: _Backend_LoadModel_Handler},
 		{MethodName: "Embedding", Handler: _Backend_Embedding_Handler},
 		{MethodName: "GenerateImage", Handler: _Backend_GenerateImage_Handler},
 		{MethodName: "AudioTranscription", Handler: _Backend_AudioTranscription_Handler},
 		{MethodName: "TTS", Handler: _Backend_TTS_Handler},
 		{MethodName: "TokenizeString", Handler: _Backend_TokenizeString_Handler},
 		{MethodName: "Status", Handler: _Backend_Status_Handler},
 	},
 	Streams: []grpc.StreamDesc{
 		{StreamName: "PredictStream", Handler: _Backend_PredictStream_Handler, ServerStreams: true},
 	},
 	Metadata: "backend/backend.proto",
 }
--- a/backend/cpp/grpc/Makefile
+++ b/backend/cpp/grpc/Makefile
@@ -5,7 +5,6 @@ SYSTEM ?= $(HOST_SYSTEM)
 TAG_LIB_GRPC?=v1.59.0
 GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
 GIT_CLONE_DEPTH?=1
 NUM_BUILD_THREADS?=$(shell nproc --ignore=1)
 INSTALLED_PACKAGES=installed_packages
 GRPC_REPO=grpc_repo
@@ -52,7 +51,7 @@ $(GRPC_REPO):
 $(GRPC_BUILD): $(GRPC_REPO)
 	mkdir -p $(GRPC_BUILD)
-	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . -- -j ${NUM_BUILD_THREADS} && cmake --build . --target install -- -j ${NUM_BUILD_THREADS}
+	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . && cmake --build . --target install
 build: $(INSTALLED_PACKAGES)
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2332,6 +2332,10 @@ public:
                std::string completion_text = result.result_json.value("content", "");
                reply.set_message(completion_text);
                int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
                reply.set_tokens(tokens_predicted);
                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                reply.set_prompt_tokens(tokens_evaluated);
                // Send the reply
                writer->Write(reply);
@@ -2357,6 +2361,10 @@ public:
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
            completion_text = result.result_json.value("content", "");
            int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
            int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
            reply->set_prompt_tokens(tokens_evaluated);
            reply->set_tokens(tokens_predicted);
            reply->set_message(completion_text);
        }
        else
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -1,4 +1,13 @@
 .PHONY: autogptq
-autogptq:
+autogptq: protogen
 	$(MAKE) -C ../common-env/transformers
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/autogptq/autogptq.py
+++ b/backend/python/autogptq/autogptq.py
@@ -39,7 +39,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                self.model_name = "Qwen-VL-Chat"
                model = AutoModelForCausalLM.from_pretrained(model_path, 
                    trust_remote_code=request.TrustRemoteCode,
                    use_triton=request.UseTriton,
                    device_map="auto").eval()
            else:
                model = AutoGPTQForCausalLM.from_quantized(model_path,
--- a/backend/python/autogptq/backend_pb2.py
+++ b/backend/python/autogptq/backend_pb2.py
--- a/backend/python/autogptq/backend_pb2_grpc.py
+++ b/backend/python/autogptq/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Client stub holding one callable per RPC of the 'backend.Backend' service."""
    def __init__(self, channel):
        """Create the per-RPC callables from the given channel.

        Args:
            channel: A grpc.Channel.
        """
        pb = backend__pb2
        self.Health = channel.unary_unary(
            '/backend.Backend/Health',
            request_serializer=pb.HealthMessage.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.Predict = channel.unary_unary(
            '/backend.Backend/Predict',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.LoadModel = channel.unary_unary(
            '/backend.Backend/LoadModel',
            request_serializer=pb.ModelOptions.SerializeToString,
            response_deserializer=pb.Result.FromString)
        # PredictStream is the only RPC created with unary_stream.
        self.PredictStream = channel.unary_stream(
            '/backend.Backend/PredictStream',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.Embedding = channel.unary_unary(
            '/backend.Backend/Embedding',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.EmbeddingResult.FromString)
        self.GenerateImage = channel.unary_unary(
            '/backend.Backend/GenerateImage',
            request_serializer=pb.GenerateImageRequest.SerializeToString,
            response_deserializer=pb.Result.FromString)
        self.AudioTranscription = channel.unary_unary(
            '/backend.Backend/AudioTranscription',
            request_serializer=pb.TranscriptRequest.SerializeToString,
            response_deserializer=pb.TranscriptResult.FromString)
        self.TTS = channel.unary_unary(
            '/backend.Backend/TTS',
            request_serializer=pb.TTSRequest.SerializeToString,
            response_deserializer=pb.Result.FromString)
        self.TokenizeString = channel.unary_unary(
            '/backend.Backend/TokenizeString',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.TokenizationResponse.FromString)
        self.Status = channel.unary_unary(
            '/backend.Backend/Status',
            request_serializer=pb.HealthMessage.SerializeToString,
            response_deserializer=pb.StatusResponse.FromString)
 class BackendServicer(object):
    """Base servicer for 'backend.Backend'.

    Every method here marks the call UNIMPLEMENTED on the context and raises
    NotImplementedError.
    """
    def Health(self, request, context):
        """Default Health handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Predict(self, request, context):
        """Default Predict handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def LoadModel(self, request, context):
        """Default LoadModel handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def PredictStream(self, request, context):
        """Default PredictStream handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Embedding(self, request, context):
        """Default Embedding handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def GenerateImage(self, request, context):
        """Default GenerateImage handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def AudioTranscription(self, request, context):
        """Default AudioTranscription handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def TTS(self, request, context):
        """Default TTS handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def TokenizeString(self, request, context):
        """Default TokenizeString handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Status(self, request, context):
        """Default Status handler: reports UNIMPLEMENTED and raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
 def add_BackendServicer_to_server(servicer, server):
    """Register the servicer's RPC methods on server under 'backend.Backend'.

    Builds one method handler per RPC, pairing each servicer method with its
    request deserializer and response serializer, wraps them in a generic
    handler and adds that handler to the server.
    """
    pb = backend__pb2
    handlers = {}
    handlers['Health'] = grpc.unary_unary_rpc_method_handler(
        servicer.Health,
        request_deserializer=pb.HealthMessage.FromString,
        response_serializer=pb.Reply.SerializeToString)
    handlers['Predict'] = grpc.unary_unary_rpc_method_handler(
        servicer.Predict,
        request_deserializer=pb.PredictOptions.FromString,
        response_serializer=pb.Reply.SerializeToString)
    handlers['LoadModel'] = grpc.unary_unary_rpc_method_handler(
        servicer.LoadModel,
        request_deserializer=pb.ModelOptions.FromString,
        response_serializer=pb.Result.SerializeToString)
    # PredictStream is the only RPC registered with a unary_stream handler.
    handlers['PredictStream'] = grpc.unary_stream_rpc_method_handler(
        servicer.PredictStream,
        request_deserializer=pb.PredictOptions.FromString,
        response_serializer=pb.Reply.SerializeToString)
    handlers['Embedding'] = grpc.unary_unary_rpc_method_handler(
        servicer.Embedding,
        request_deserializer=pb.PredictOptions.FromString,
        response_serializer=pb.EmbeddingResult.SerializeToString)
    handlers['GenerateImage'] = grpc.unary_unary_rpc_method_handler(
        servicer.GenerateImage,
        request_deserializer=pb.GenerateImageRequest.FromString,
        response_serializer=pb.Result.SerializeToString)
    handlers['AudioTranscription'] = grpc.unary_unary_rpc_method_handler(
        servicer.AudioTranscription,
        request_deserializer=pb.TranscriptRequest.FromString,
        response_serializer=pb.TranscriptResult.SerializeToString)
    handlers['TTS'] = grpc.unary_unary_rpc_method_handler(
        servicer.TTS,
        request_deserializer=pb.TTSRequest.FromString,
        response_serializer=pb.Result.SerializeToString)
    handlers['TokenizeString'] = grpc.unary_unary_rpc_method_handler(
        servicer.TokenizeString,
        request_deserializer=pb.PredictOptions.FromString,
        response_serializer=pb.TokenizationResponse.SerializeToString)
    handlers['Status'] = grpc.unary_unary_rpc_method_handler(
        servicer.Status,
        request_deserializer=pb.HealthMessage.FromString,
        response_serializer=pb.StatusResponse.SerializeToString)
    service_handler = grpc.method_handlers_generic_handler('backend.Backend', handlers)
    server.add_generic_rpc_handlers((service_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Static helpers that forward each 'backend.Backend' RPC to grpc.experimental."""
    @staticmethod
    def Health(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/Health' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.HealthMessage.SerializeToString
        decode = backend__pb2.Reply.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Health', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/Predict' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.PredictOptions.SerializeToString
        decode = backend__pb2.Reply.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Predict', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/LoadModel' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.ModelOptions.SerializeToString
        decode = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/LoadModel', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/PredictStream' on target via grpc.experimental.unary_stream."""
        encode = backend__pb2.PredictOptions.SerializeToString
        decode = backend__pb2.Reply.FromString
        return grpc.experimental.unary_stream(
            request, target, '/backend.Backend/PredictStream', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/Embedding' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.PredictOptions.SerializeToString
        decode = backend__pb2.EmbeddingResult.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Embedding', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/GenerateImage' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.GenerateImageRequest.SerializeToString
        decode = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/GenerateImage', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/AudioTranscription' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.TranscriptRequest.SerializeToString
        decode = backend__pb2.TranscriptResult.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/AudioTranscription', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/TTS' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.TTSRequest.SerializeToString
        decode = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/TTS', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/TokenizeString' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.PredictOptions.SerializeToString
        decode = backend__pb2.TokenizationResponse.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/TokenizeString', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call '/backend.Backend/Status' on target via grpc.experimental.unary_unary."""
        encode = backend__pb2.HealthMessage.SerializeToString
        decode = backend__pb2.StatusResponse.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Status', encode, decode,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
--- a/backend/python/bark/Makefile
+++ b/backend/python/bark/Makefile
@@ -1,15 +1,25 @@
 .PHONY: ttsbark
-ttsbark:
+ttsbark: protogen
 	$(MAKE) -C ../common-env/transformers
 .PHONY: run
-run:
+run: protogen
 	@echo "Running bark..."
 	bash run.sh
 	@echo "bark run."
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing bark..."
 	bash test.sh
 	@echo "bark tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/bark/backend_pb2.py
+++ b/backend/python/bark/backend_pb2.py
--- a/backend/python/bark/backend_pb2_grpc.py
+++ b/backend/python/bark/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Client-side stub for the 'backend.Backend' service.

    The constructor binds one attribute per RPC (Health, Predict, LoadModel,
    PredictStream, Embedding, GenerateImage, AudioTranscription, TTS,
    TokenizeString, Status). Each attribute holds whatever the channel returns
    for that method path, configured with the matching backend__pb2 request
    serializer and response deserializer.
    """
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        # PredictStream is the only method created with unary_stream; every
        # other method in this stub uses unary_unary.
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        # TokenizeString reuses PredictOptions as its request message.
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        # Status reuses HealthMessage as its request message.
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )
 class BackendServicer(object):
    """Server-side base class for the 'backend.Backend' service.

    Every method defined here is a placeholder: it sets the status code
    UNIMPLEMENTED and the details 'Method not implemented!' on the context,
    then raises NotImplementedError.
    """
    def Health(self, request, context):
        """Placeholder for Health: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Predict(self, request, context):
        """Placeholder for Predict: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def LoadModel(self, request, context):
        """Placeholder for LoadModel: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def PredictStream(self, request, context):
        """Placeholder for PredictStream: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Embedding(self, request, context):
        """Placeholder for Embedding: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GenerateImage(self, request, context):
        """Placeholder for GenerateImage: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def AudioTranscription(self, request, context):
        """Placeholder for AudioTranscription: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TTS(self, request, context):
        """Placeholder for TTS: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TokenizeString(self, request, context):
        """Placeholder for TokenizeString: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Status(self, request, context):
        """Placeholder for Status: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_BackendServicer_to_server(servicer, server):
    """Register the servicer's methods on the server as the 'backend.Backend' service.

    Builds one RPC method handler per method name, each wrapping the matching
    attribute of `servicer` with its backend__pb2 request deserializer and
    response serializer. The handlers are grouped into a single generic handler
    and passed to `server.add_generic_rpc_handlers`.

    Args:
        servicer: object providing the ten methods referenced below
            (Health, Predict, LoadModel, PredictStream, Embedding, GenerateImage,
            AudioTranscription, TTS, TokenizeString, Status).
        server: object exposing `add_generic_rpc_handlers`.
    """
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            # The only handler built with unary_stream; all others are unary_unary.
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    # The service name here matches the '/backend.Backend/...' prefix used in
    # the client-side method paths.
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Static, channel-less entry points for the 'backend.Backend' service.

    Each static method forwards its arguments to grpc.experimental.unary_unary
    (grpc.experimental.unary_stream for PredictStream) together with the method
    path and the backend__pb2 serializer/deserializer pair for that RPC, and
    returns the result unchanged.

    All methods share the same parameter list. Note that the forwarded
    positional arguments place `insecure` before `call_credentials`, which
    differs from the order in which the parameters are declared.
    """
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        # The only method in this class that goes through unary_stream.
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@@ -2,6 +2,7 @@
 set -ex
 SKIP_CONDA=${SKIP_CONDA:-0}
 REQUIREMENTS_FILE=$1
 # Check if the environment exists
 conda_env_exists(){
@@ -14,7 +15,7 @@ else
    export PATH=$PATH:/opt/conda/bin
    if conda_env_exists "transformers" ; then
        echo "Creating virtual environment..."
-        conda env create --name transformers --file $1
+        conda env create --name transformers --file $REQUIREMENTS_FILE
        echo "Virtual environment created."
    else 
        echo "Virtual environment already exists."
@@ -28,11 +29,16 @@ if [ -d "/opt/intel" ]; then
    pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino]
 fi
-if [ "$PIP_CACHE_PURGE" = true ] ; then
+# If we didn't skip conda, activate the environment
-    if [ $SKIP_CONDA -eq 0 ]; then
+# to install FlashAttention
-        # Activate conda environment
+if [ $SKIP_CONDA -eq 0 ]; then
-        source activate transformers
+    source activate transformers
-    fi
+fi
 if [[ $REQUIREMENTS_FILE =~ -nvidia.yml$ ]]; then
    #TODO: FlashAttention is supported on nvidia and ROCm, but ROCm install can't be done this easily
    pip install flash-attn --no-build-isolation
 fi
 if [ "$PIP_CACHE_PURGE" = true ] ; then
    pip cache purge
 fi
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@@ -116,8 +116,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
      - xformers==0.0.23.post1  
      - rerankers[transformers]
      - pydantic
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers-rocm.yml
+++ b/backend/python/common-env/transformers/transformers-rocm.yml
@@ -104,8 +104,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
      - xformers==0.0.23.post1
      - rerankers[transformers]
      - pydantic
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers.yml
+++ b/backend/python/common-env/transformers/transformers.yml
@@ -108,8 +108,10 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.3.2
+      - vllm>=0.4.0
      - transformers>=4.38.2  # Updated Version
      - transformers_stream_generator==0.0.5
-      - xformers==0.0.23.post1  
+      - xformers==0.0.23.post1
      - rerankers[transformers]
      - pydantic
 prefix: /opt/conda/envs/transformers
--- a/backend/python/coqui/Makefile
+++ b/backend/python/coqui/Makefile
@@ -1,15 +1,25 @@
 .PHONY: coqui
-coqui:
+coqui: protogen
 	$(MAKE) -C ../common-env/transformers
 .PHONY: run
-run:
+run: protogen
 	@echo "Running coqui..."
 	bash run.sh
 	@echo "coqui run."
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing coqui..."
 	bash test.sh
 	@echo "coqui tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/coqui/backend_pb2.py
+++ b/backend/python/coqui/backend_pb2.py
--- a/backend/python/coqui/backend_pb2_grpc.py
+++ b/backend/python/coqui/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Client-side stub for the 'backend.Backend' service.

    The constructor binds one attribute per RPC (Health, Predict, LoadModel,
    PredictStream, Embedding, GenerateImage, AudioTranscription, TTS,
    TokenizeString, Status). Each attribute holds whatever the channel returns
    for that method path, configured with the matching backend__pb2 request
    serializer and response deserializer.
    """
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        # PredictStream is the only method created with unary_stream; every
        # other method in this stub uses unary_unary.
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        # TokenizeString reuses PredictOptions as its request message.
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        # Status reuses HealthMessage as its request message.
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )
 class BackendServicer(object):
    """Server-side base class for the 'backend.Backend' service.

    Every method defined here is a placeholder: it sets the status code
    UNIMPLEMENTED and the details 'Method not implemented!' on the context,
    then raises NotImplementedError.
    """
    def Health(self, request, context):
        """Placeholder for Health: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Predict(self, request, context):
        """Placeholder for Predict: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def LoadModel(self, request, context):
        """Placeholder for LoadModel: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def PredictStream(self, request, context):
        """Placeholder for PredictStream: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Embedding(self, request, context):
        """Placeholder for Embedding: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GenerateImage(self, request, context):
        """Placeholder for GenerateImage: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def AudioTranscription(self, request, context):
        """Placeholder for AudioTranscription: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TTS(self, request, context):
        """Placeholder for TTS: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TokenizeString(self, request, context):
        """Placeholder for TokenizeString: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Status(self, request, context):
        """Placeholder for Status: reports UNIMPLEMENTED and raises NotImplementedError."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_BackendServicer_to_server(servicer, server):
    """Register the servicer's methods on the server as the 'backend.Backend' service.

    Builds one RPC method handler per method name, each wrapping the matching
    attribute of `servicer` with its backend__pb2 request deserializer and
    response serializer. The handlers are grouped into a single generic handler
    and passed to `server.add_generic_rpc_handlers`.

    Args:
        servicer: object providing the ten methods referenced below
            (Health, Predict, LoadModel, PredictStream, Embedding, GenerateImage,
            AudioTranscription, TTS, TokenizeString, Status).
        server: object exposing `add_generic_rpc_handlers`.
    """
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            # The only handler built with unary_stream; all others are unary_unary.
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    # The service name here matches the '/backend.Backend/...' prefix used in
    # the client-side method paths.
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Static, channel-less entry points for the 'backend.Backend' service.

    Each static method forwards its arguments to grpc.experimental.unary_unary
    (grpc.experimental.unary_stream for PredictStream) together with the method
    path and the backend__pb2 serializer/deserializer pair for that RPC, and
    returns the result unchanged.

    All methods share the same parameter list. Note that the forwarded
    positional arguments place `insecure` before `call_credentials`, which
    differs from the order in which the parameters are declared.
    """
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        # The only method in this class that goes through unary_stream.
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@@ -12,15 +12,25 @@ export SKIP_CONDA=1
 endif
 .PHONY: diffusers
-diffusers:
+diffusers: protogen
 	@echo "Installing $(CONDA_ENV_PATH)..."
 	bash install.sh $(CONDA_ENV_PATH)
 .PHONY: run
-run:
+run: protogen
 	@echo "Running diffusers..."
 	bash run.sh
 	@echo "Diffusers run."
-test:
+test: protogen
 	bash test.sh
 # protogen: make sure the generated Python protobuf/gRPC modules exist.
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 # protogen-clean: delete the generated modules so the next protogen run rebuilds them.
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 # The rule below lists no prerequisites, so protoc runs only when a generated
 # file is absent. backend.proto is looked up via the -I../.. include path.
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/diffusers/backend_pb2.py
+++ b/backend/python/diffusers/backend_pb2.py
--- a/backend/python/diffusers/backend_pb2_grpc.py
+++ b/backend/python/diffusers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Client stub exposing one callable attribute per ``backend.Backend`` RPC."""
    def __init__(self, channel):
        """Bind every RPC of the service to ``channel``.

        Args:
            channel: A grpc.Channel.
        """
        # One row per RPC: attribute name, channel factory, full method path,
        # request message type, response message type.
        rpc_table = (
            ('Health', 'unary_unary', '/backend.Backend/Health', 'HealthMessage', 'Reply'),
            ('Predict', 'unary_unary', '/backend.Backend/Predict', 'PredictOptions', 'Reply'),
            ('LoadModel', 'unary_unary', '/backend.Backend/LoadModel', 'ModelOptions', 'Result'),
            ('PredictStream', 'unary_stream', '/backend.Backend/PredictStream', 'PredictOptions', 'Reply'),
            ('Embedding', 'unary_unary', '/backend.Backend/Embedding', 'PredictOptions', 'EmbeddingResult'),
            ('GenerateImage', 'unary_unary', '/backend.Backend/GenerateImage', 'GenerateImageRequest', 'Result'),
            ('AudioTranscription', 'unary_unary', '/backend.Backend/AudioTranscription', 'TranscriptRequest', 'TranscriptResult'),
            ('TTS', 'unary_unary', '/backend.Backend/TTS', 'TTSRequest', 'Result'),
            ('TokenizeString', 'unary_unary', '/backend.Backend/TokenizeString', 'PredictOptions', 'TokenizationResponse'),
            ('Status', 'unary_unary', '/backend.Backend/Status', 'HealthMessage', 'StatusResponse'),
        )
        for attr_name, factory_name, method_path, request_type, response_type in rpc_table:
            # Resolve the factory and message classes per row so lookups happen
            # in the same order as the RPCs are bound.
            create_callable = getattr(channel, factory_name)
            setattr(self, attr_name, create_callable(
                method_path,
                request_serializer=getattr(backend__pb2, request_type).SerializeToString,
                response_deserializer=getattr(backend__pb2, response_type).FromString,
            ))
 class BackendServicer(object):
    """Servicer skeleton for ``backend.Backend``: every method reports UNIMPLEMENTED and raises."""
    def Health(self, request, context):
        """Default ``Health`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Predict(self, request, context):
        """Default ``Predict`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def LoadModel(self, request, context):
        """Default ``LoadModel`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def PredictStream(self, request, context):
        """Default ``PredictStream`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Embedding(self, request, context):
        """Default ``Embedding`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def GenerateImage(self, request, context):
        """Default ``GenerateImage`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def AudioTranscription(self, request, context):
        """Default ``AudioTranscription`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def TTS(self, request, context):
        """Default ``TTS`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def TokenizeString(self, request, context):
        """Default ``TokenizeString`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Status(self, request, context):
        """Default ``Status`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
 def add_BackendServicer_to_server(servicer, server):
    """Register ``servicer``'s ``backend.Backend`` RPC methods on ``server``.

    Builds one method handler per RPC, wraps them in a generic handler for the
    'backend.Backend' service and passes it to ``server.add_generic_rpc_handlers``.
    """
    # One row per RPC: name, grpc handler factory, request type, response type.
    rpc_specs = (
        ('Health', 'unary_unary_rpc_method_handler', 'HealthMessage', 'Reply'),
        ('Predict', 'unary_unary_rpc_method_handler', 'PredictOptions', 'Reply'),
        ('LoadModel', 'unary_unary_rpc_method_handler', 'ModelOptions', 'Result'),
        ('PredictStream', 'unary_stream_rpc_method_handler', 'PredictOptions', 'Reply'),
        ('Embedding', 'unary_unary_rpc_method_handler', 'PredictOptions', 'EmbeddingResult'),
        ('GenerateImage', 'unary_unary_rpc_method_handler', 'GenerateImageRequest', 'Result'),
        ('AudioTranscription', 'unary_unary_rpc_method_handler', 'TranscriptRequest', 'TranscriptResult'),
        ('TTS', 'unary_unary_rpc_method_handler', 'TTSRequest', 'Result'),
        ('TokenizeString', 'unary_unary_rpc_method_handler', 'PredictOptions', 'TokenizationResponse'),
        ('Status', 'unary_unary_rpc_method_handler', 'HealthMessage', 'StatusResponse'),
    )
    handlers_by_name = {}
    for rpc_name, factory_name, request_type, response_type in rpc_specs:
        build_handler = getattr(grpc, factory_name)
        handlers_by_name[rpc_name] = build_handler(
            getattr(servicer, rpc_name),
            request_deserializer=getattr(backend__pb2, request_type).FromString,
            response_serializer=getattr(backend__pb2, response_type).SerializeToString,
        )
    service_handler = grpc.method_handlers_generic_handler(
        'backend.Backend', handlers_by_name)
    server.add_generic_rpc_handlers((service_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Static one-call helpers, one per ``backend.Backend`` RPC, built on ``grpc.experimental``.

    Every helper forwards its arguments positionally; ``insecure`` is passed
    ahead of ``call_credentials`` even though the signatures declare them in
    the opposite order.
    """
    @staticmethod
    def Health(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/Health`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.HealthMessage.SerializeToString
        from_wire = backend__pb2.Reply.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Health', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/Predict`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.PredictOptions.SerializeToString
        from_wire = backend__pb2.Reply.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Predict', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/LoadModel`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.ModelOptions.SerializeToString
        from_wire = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/LoadModel', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/PredictStream`` on ``target`` via ``grpc.experimental.unary_stream``."""
        to_wire = backend__pb2.PredictOptions.SerializeToString
        from_wire = backend__pb2.Reply.FromString
        return grpc.experimental.unary_stream(
            request, target, '/backend.Backend/PredictStream', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/Embedding`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.PredictOptions.SerializeToString
        from_wire = backend__pb2.EmbeddingResult.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Embedding', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/GenerateImage`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.GenerateImageRequest.SerializeToString
        from_wire = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/GenerateImage', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/AudioTranscription`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.TranscriptRequest.SerializeToString
        from_wire = backend__pb2.TranscriptResult.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/AudioTranscription', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/TTS`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.TTSRequest.SerializeToString
        from_wire = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/TTS', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/TokenizeString`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.PredictOptions.SerializeToString
        from_wire = backend__pb2.TokenizationResponse.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/TokenizeString', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/Status`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.HealthMessage.SerializeToString
        from_wire = backend__pb2.StatusResponse.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Status', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
--- a/backend/python/diffusers/diffusers-rocm.yml
+++ b/backend/python/diffusers/diffusers-rocm.yml
@@ -61,4 +61,5 @@ dependencies:
      - urllib3==2.0.6
      - zipp==3.17.0
      - torch
      - opencv-python
 prefix: /opt/conda/envs/diffusers
--- a/backend/python/diffusers/diffusers.yml
+++ b/backend/python/diffusers/diffusers.yml
@@ -71,4 +71,5 @@ dependencies:
      - typing-extensions==4.8.0
      - urllib3==2.0.6
      - zipp==3.17.0
      - opencv-python
 prefix: /opt/conda/envs/diffusers
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -1,11 +1,21 @@
 export CONDA_ENV_PATH = "exllama.yml"
 .PHONY: exllama
-exllama:
+exllama: protogen
 	bash install.sh ${CONDA_ENV_PATH}
 .PHONY: run
-run:
+run: protogen
 	@echo "Running exllama..."
 	bash run.sh
 	@echo "exllama run."
 # protogen: make sure the generated Python protobuf/gRPC modules exist.
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 # protogen-clean: delete the generated modules so the next protogen run rebuilds them.
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 # The rule below lists no prerequisites, so protoc runs only when a generated
 # file is absent. backend.proto is looked up via the -I../.. include path.
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/exllama/backend_pb2.py
+++ b/backend/python/exllama/backend_pb2.py
--- a/backend/python/exllama/backend_pb2_grpc.py
+++ b/backend/python/exllama/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Client stub exposing one callable attribute per ``backend.Backend`` RPC."""
    def __init__(self, channel):
        """Bind every RPC of the service to ``channel``.

        Args:
            channel: A grpc.Channel.
        """
        # One row per RPC: attribute name, channel factory, full method path,
        # request message type, response message type.
        rpc_table = (
            ('Health', 'unary_unary', '/backend.Backend/Health', 'HealthMessage', 'Reply'),
            ('Predict', 'unary_unary', '/backend.Backend/Predict', 'PredictOptions', 'Reply'),
            ('LoadModel', 'unary_unary', '/backend.Backend/LoadModel', 'ModelOptions', 'Result'),
            ('PredictStream', 'unary_stream', '/backend.Backend/PredictStream', 'PredictOptions', 'Reply'),
            ('Embedding', 'unary_unary', '/backend.Backend/Embedding', 'PredictOptions', 'EmbeddingResult'),
            ('GenerateImage', 'unary_unary', '/backend.Backend/GenerateImage', 'GenerateImageRequest', 'Result'),
            ('AudioTranscription', 'unary_unary', '/backend.Backend/AudioTranscription', 'TranscriptRequest', 'TranscriptResult'),
            ('TTS', 'unary_unary', '/backend.Backend/TTS', 'TTSRequest', 'Result'),
            ('TokenizeString', 'unary_unary', '/backend.Backend/TokenizeString', 'PredictOptions', 'TokenizationResponse'),
            ('Status', 'unary_unary', '/backend.Backend/Status', 'HealthMessage', 'StatusResponse'),
        )
        for attr_name, factory_name, method_path, request_type, response_type in rpc_table:
            # Resolve the factory and message classes per row so lookups happen
            # in the same order as the RPCs are bound.
            create_callable = getattr(channel, factory_name)
            setattr(self, attr_name, create_callable(
                method_path,
                request_serializer=getattr(backend__pb2, request_type).SerializeToString,
                response_deserializer=getattr(backend__pb2, response_type).FromString,
            ))
 class BackendServicer(object):
    """Servicer skeleton for ``backend.Backend``: every method reports UNIMPLEMENTED and raises."""
    def Health(self, request, context):
        """Default ``Health`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Predict(self, request, context):
        """Default ``Predict`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def LoadModel(self, request, context):
        """Default ``LoadModel`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def PredictStream(self, request, context):
        """Default ``PredictStream`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Embedding(self, request, context):
        """Default ``Embedding`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def GenerateImage(self, request, context):
        """Default ``GenerateImage`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def AudioTranscription(self, request, context):
        """Default ``AudioTranscription`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def TTS(self, request, context):
        """Default ``TTS`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def TokenizeString(self, request, context):
        """Default ``TokenizeString`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
    def Status(self, request, context):
        """Default ``Status`` handler: marks ``context`` UNIMPLEMENTED, then raises NotImplementedError."""
        detail = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(detail)
        raise NotImplementedError(detail)
 def add_BackendServicer_to_server(servicer, server):
    """Register ``servicer``'s ``backend.Backend`` RPC methods on ``server``.

    Builds one method handler per RPC, wraps them in a generic handler for the
    'backend.Backend' service and passes it to ``server.add_generic_rpc_handlers``.
    """
    # One row per RPC: name, grpc handler factory, request type, response type.
    rpc_specs = (
        ('Health', 'unary_unary_rpc_method_handler', 'HealthMessage', 'Reply'),
        ('Predict', 'unary_unary_rpc_method_handler', 'PredictOptions', 'Reply'),
        ('LoadModel', 'unary_unary_rpc_method_handler', 'ModelOptions', 'Result'),
        ('PredictStream', 'unary_stream_rpc_method_handler', 'PredictOptions', 'Reply'),
        ('Embedding', 'unary_unary_rpc_method_handler', 'PredictOptions', 'EmbeddingResult'),
        ('GenerateImage', 'unary_unary_rpc_method_handler', 'GenerateImageRequest', 'Result'),
        ('AudioTranscription', 'unary_unary_rpc_method_handler', 'TranscriptRequest', 'TranscriptResult'),
        ('TTS', 'unary_unary_rpc_method_handler', 'TTSRequest', 'Result'),
        ('TokenizeString', 'unary_unary_rpc_method_handler', 'PredictOptions', 'TokenizationResponse'),
        ('Status', 'unary_unary_rpc_method_handler', 'HealthMessage', 'StatusResponse'),
    )
    handlers_by_name = {}
    for rpc_name, factory_name, request_type, response_type in rpc_specs:
        build_handler = getattr(grpc, factory_name)
        handlers_by_name[rpc_name] = build_handler(
            getattr(servicer, rpc_name),
            request_deserializer=getattr(backend__pb2, request_type).FromString,
            response_serializer=getattr(backend__pb2, response_type).SerializeToString,
        )
    service_handler = grpc.method_handlers_generic_handler(
        'backend.Backend', handlers_by_name)
    server.add_generic_rpc_handlers((service_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Static one-call helpers, one per ``backend.Backend`` RPC, built on ``grpc.experimental``.

    Every helper forwards its arguments positionally; ``insecure`` is passed
    ahead of ``call_credentials`` even though the signatures declare them in
    the opposite order.
    """
    @staticmethod
    def Health(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/Health`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.HealthMessage.SerializeToString
        from_wire = backend__pb2.Reply.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Health', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/Predict`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.PredictOptions.SerializeToString
        from_wire = backend__pb2.Reply.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Predict', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/LoadModel`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.ModelOptions.SerializeToString
        from_wire = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/LoadModel', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/PredictStream`` on ``target`` via ``grpc.experimental.unary_stream``."""
        to_wire = backend__pb2.PredictOptions.SerializeToString
        from_wire = backend__pb2.Reply.FromString
        return grpc.experimental.unary_stream(
            request, target, '/backend.Backend/PredictStream', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/Embedding`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.PredictOptions.SerializeToString
        from_wire = backend__pb2.EmbeddingResult.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Embedding', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/GenerateImage`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.GenerateImageRequest.SerializeToString
        from_wire = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/GenerateImage', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/AudioTranscription`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.TranscriptRequest.SerializeToString
        from_wire = backend__pb2.TranscriptResult.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/AudioTranscription', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/TTS`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.TTSRequest.SerializeToString
        from_wire = backend__pb2.Result.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/TTS', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/TokenizeString`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.PredictOptions.SerializeToString
        from_wire = backend__pb2.TokenizationResponse.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/TokenizeString', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request, target, options=(), channel_credentials=None,
            call_credentials=None, insecure=False, compression=None,
            wait_for_ready=None, timeout=None, metadata=None):
        """Call ``/backend.Backend/Status`` on ``target`` via ``grpc.experimental.unary_unary``."""
        to_wire = backend__pb2.HealthMessage.SerializeToString
        from_wire = backend__pb2.StatusResponse.FromString
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Status', to_wire, from_wire,
            options, channel_credentials, insecure, call_credentials,
            compression, wait_for_ready, timeout, metadata)
--- a/backend/python/exllama2/Makefile
+++ b/backend/python/exllama2/Makefile
@@ -1,10 +1,20 @@
 .PHONY: exllama2
-exllama2:
+exllama2: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh
 .PHONY: run
-run:
+run: protogen
 	@echo "Running exllama2..."
 	bash run.sh
 	@echo "exllama2 run."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/exllama2/backend_pb2.py
+++ b/backend/python/exllama2/backend_pb2.py
--- a/backend/python/exllama2/backend_pb2_grpc.py
+++ b/backend/python/exllama2/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Missing associated documentation comment in .proto file."""
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )
 class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""
    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Missing associated documentation comment in .proto file."""
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/mamba/Makefile
+++ b/backend/python/mamba/Makefile
@@ -1,16 +1,26 @@
 .PHONY: mamba
-mamba:
+mamba: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh
 .PHONY: run
-run:
+run: protogen
 	@echo "Running mamba..."
 	bash run.sh
 	@echo "mamba run."
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing mamba..."
 	bash test.sh
-	@echo "mamba tested."
+	@echo "mamba tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/mamba/backend_pb2.py
+++ b/backend/python/mamba/backend_pb2.py
--- a/backend/python/mamba/backend_pb2_grpc.py
+++ b/backend/python/mamba/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Missing associated documentation comment in .proto file."""
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )
 class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""
    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Missing associated documentation comment in .proto file."""
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/parler-tts/Makefile
+++ b/backend/python/parler-tts/Makefile
@@ -0,0 +1,39 @@
 # Build, run and test targets for the parler-tts Python backend.
 # CONDA_ENV_PATH selects the conda environment file handed to install.sh.
 export CONDA_ENV_PATH = "parler.yml"
 SKIP_CONDA?=0
 # cuBLAS builds use the environment file that pins the NVIDIA CUDA wheels.
 ifeq ($(BUILD_TYPE), cublas)
 export CONDA_ENV_PATH = "parler-nvidia.yml"
 endif
 # Intel GPU are supposed to have dependencies installed in the main python
 # environment, so we skip conda installation for SYCL builds.
 # https://github.com/intel/intel-extension-for-pytorch/issues/538
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 export SKIP_CONDA=1
 endif
 .PHONY: parler-tts
 parler-tts: protogen
 	@echo "Installing $(CONDA_ENV_PATH)..."
 	bash install.sh $(CONDA_ENV_PATH)
 .PHONY: run
 run: protogen
 	@echo "Running parler-tts..."
 	bash run.sh
 	@echo "parler-tts run."
 .PHONY: test
 test: protogen
 	@echo "Testing parler-tts..."
 	bash test.sh
 	@echo "parler-tts tested."
 # Generate the gRPC Python bindings from backend.proto when they are missing.
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -0,0 +1,39 @@
 #!/bin/bash
 ##
 ## Creates the "parler" conda environment from the environment file given as
 ## the first argument (unless SKIP_CONDA=1), then replaces protobuf's
 ## builder.py inside that environment.
 set -ex
 SKIP_CONDA=${SKIP_CONDA:-0}
 # Succeeds when `conda list` fails for the given environment name, i.e. when
 # the environment still has to be created.
 conda_env_missing(){
    ! conda list --name "${@}" >/dev/null 2>/dev/null
 }
 if [ "$SKIP_CONDA" -eq 1 ]; then
    echo "Skipping conda environment installation"
 else
    export PATH=$PATH:/opt/conda/bin
    if conda_env_missing "parler" ; then
        echo "Creating virtual environment..."
        conda env create --name parler --file "$1"
        echo "Virtual environment created."
    else
        echo "Virtual environment already exists."
    fi
 fi
 if [ "$SKIP_CONDA" -ne 1 ]; then
    # Activate conda environment
    source activate parler
    # https://github.com/descriptinc/audiotools/issues/101
    # incompatible protobuf versions.
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page over builder.py; with `set -e` the install then stops.
    curl -fL https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o "$CONDA_PREFIX/lib/python3.11/site-packages/google/protobuf/internal/builder.py"
 fi
 if [ "$PIP_CACHE_PURGE" = true ] ; then
    if [ "$SKIP_CONDA" -ne 1 ]; then
        # Activate conda environment
        source activate parler
    fi
    pip cache purge
 fi
--- a/backend/python/parler-tts/parler-nvidia.yml
+++ b/backend/python/parler-tts/parler-nvidia.yml
@@ -0,0 +1,48 @@
 name: parler
 channels:
  - defaults
 dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.11=h7f8727e_2
  - pip=23.2.1=py311h06a4308_0
  - python=3.11.5=h955ad1f_0
  - readline=8.2=h5eee18b_0
  - setuptools=68.0.0=py311h06a4308_0
  - sqlite=3.41.2=h5eee18b_0
  - tk=8.6.12=h1ccaba5_0
  - tzdata=2023c=h04d1e81_0
  - wheel=0.41.2=py311h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  - pip:
      - accelerate>=0.11.0
      - grpcio==1.59.0
      - numpy==1.26.0
      - nvidia-cublas-cu12==12.1.3.1
      - nvidia-cuda-cupti-cu12==12.1.105
      - nvidia-cuda-nvrtc-cu12==12.1.105
      - nvidia-cuda-runtime-cu12==12.1.105
      - nvidia-cudnn-cu12==8.9.2.26
      - nvidia-cufft-cu12==11.0.2.54
      - nvidia-curand-cu12==10.3.2.106
      - nvidia-cusolver-cu12==11.4.5.107
      - nvidia-cusparse-cu12==12.1.0.106
      - nvidia-nccl-cu12==2.18.1
      - nvidia-nvjitlink-cu12==12.2.140
      - nvidia-nvtx-cu12==12.1.105
      - torch==2.1.0
      - transformers>=4.34.0
      - descript-audio-codec
      - sentencepiece
      - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
 prefix: /opt/conda/envs/parler
--- a/backend/python/parler-tts/parler.yml
+++ b/backend/python/parler-tts/parler.yml
@@ -0,0 +1,36 @@
 name: parler
 channels:
  - defaults
 dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.11=h7f8727e_2
  - pip=23.2.1=py311h06a4308_0
  - python=3.11.5=h955ad1f_0
  - readline=8.2=h5eee18b_0
  - setuptools=68.0.0=py311h06a4308_0
  - sqlite=3.41.2=h5eee18b_0
  - tk=8.6.12=h1ccaba5_0
  - tzdata=2023c=h04d1e81_0
  - wheel=0.41.2=py311h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  - pip:
      - accelerate>=0.11.0
      - numpy==1.26.0
      - grpcio==1.59.0
      - torch==2.1.0
      - transformers>=4.34.0
      - descript-audio-codec
      - sentencepiece
      - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
 prefix: /opt/conda/envs/parler
--- a/backend/python/parler-tts/parler_tts_server.py
+++ b/backend/python/parler-tts/parler_tts_server.py
@@ -0,0 +1,125 @@
 #!/usr/bin/env python3
 """
 Extra gRPC server for Parler-TTS (ParlerTTSForConditionalGeneration) models.
 """
 from concurrent import futures
 import argparse
 import signal
 import sys
 import os
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 from scipy.io.wavfile import write as write_wav
 from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer
 import soundfile as sf  
 import torch
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer for the Parler-TTS backend service.
    This class implements the Health, LoadModel, and TTS methods of the backend service.
    """
    def Health(self, request, context):
        """
        A gRPC method that returns the health status of the backend service.
        Args:
            request: A HealthRequest object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Reply object whose message is the bytes b"OK".
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        A gRPC method that loads a model and its tokenizer into memory.
        The model is moved to "cuda:0" when CUDA is available, otherwise it stays on the CPU.
        Args:
            request: A LoadModelRequest object; request.Model is passed to from_pretrained().
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Result object that contains the result of the LoadModel operation.
            Any exception raised while loading is reported as success=False.
        """
        model_name = request.Model
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        try:
            self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def TTS(self, request, context):
        """
        A gRPC method that synthesizes speech for request.text and writes it to request.dst.
        Args:
            request: A TTSRequest object. request.model must be non-empty; request.voice is
                the speaker description given to the model (a default description is used
                when it is empty); request.text is the prompt to speak; request.dst is the
                output path handed to soundfile.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Result object; success=False with a message when the request is invalid,
            no model has been loaded, or generation/writing raises.
        """
        model_name = request.model
        voice = request.voice
        if voice == "":
            voice = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
        if model_name == "":
            return backend_pb2.Result(success=False, message="request.model is required")
        # self.model / self.tokenizer only exist after a successful LoadModel; report that
        # explicitly instead of surfacing an AttributeError from the generation code below.
        if getattr(self, "model", None) is None or getattr(self, "tokenizer", None) is None:
            return backend_pb2.Result(success=False, message="Model not loaded: call LoadModel before TTS")
        try:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
            # The voice description is passed as input_ids, the text to speak as prompt_input_ids.
            input_ids = self.tokenizer(voice, return_tensors="pt").input_ids.to(device)
            prompt_input_ids = self.tokenizer(request.text, return_tensors="pt").input_ids.to(device)
            generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
            audio_arr = generation.cpu().numpy().squeeze()
            print("[parler-tts] TTS generated!", file=sys.stderr)
            sf.write(request.dst, audio_arr, self.model.config.sampling_rate)
            print("[parler-tts] TTS saved to", request.dst, file=sys.stderr)
            print("[parler-tts] TTS for", file=sys.stderr)
            print(request, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)
 def serve(address):
    """
    Start the gRPC server on `address` and block until it is terminated.
    Args:
        address: The "host:port" string passed to add_insecure_port().
    The server runs on a thread pool of MAX_WORKERS workers. SIGINT and SIGTERM
    stop the server immediately and exit the process with status 0.
    """
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("[parler-tts] Server started. Listening on: " + address, file=sys.stderr)
    # Define the signal handler function
    def signal_handler(sig, frame):
        # Log to stderr like every other message emitted by this server.
        print("[parler-tts] Received termination signal. Shutting down...", file=sys.stderr)
        server.stop(0)
        sys.exit(0)
    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    # server.start() does not block, so keep the main thread alive by sleeping.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
 if __name__ == "__main__":
    # Command-line entry point: read the bind address, log the parsed
    # arguments to stderr, then hand control to serve().
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    args = cli.parse_args()
    print(f"[parler-tts] startup: {args}", file=sys.stderr)
    serve(args.addr)
--- a/backend/python/parler-tts/run.sh
+++ b/backend/python/parler-tts/run.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 ##
 ## A bash script wrapper that runs the parler-tts server with conda
 echo "Launching gRPC server for parler-tts"
 export PATH=$PATH:/opt/conda/bin
 # Activate conda environment
 source activate parler
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # Quote the script path and forward the arguments as "$@" so that paths and
 # arguments containing whitespace are passed through unchanged.
 python "$DIR/parler_tts_server.py" "$@"
--- a/backend/python/parler-tts/test.sh
+++ b/backend/python/parler-tts/test.sh
@@ -0,0 +1,11 @@
 #!/bin/bash
 ##
 ## A bash script wrapper that runs the parler-tts backend tests with conda
 # Activate conda environment
 source activate parler
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # Quote the path so a checkout directory containing whitespace still works.
 python -m unittest "$DIR/test_parler.py"
--- a/backend/python/parler-tts/test_parler.py
+++ b/backend/python/parler-tts/test_parler.py
@@ -0,0 +1,81 @@
 """
 A test script to test the gRPC service
 """
 import unittest
 import subprocess
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service.
    unittest calls setUp() before and tearDown() after every test method, so the
    tests must not call them again: doing so would start a second server process
    and lose the handle to the first one, which would then never be terminated.
    """
    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "parler_tts_server.py", "--addr", "localhost:50051"])
        # Wait for the server process to come up before the test connects.
        time.sleep(10)
    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server
        """
        self.service.terminate()
        self.service.wait()
    def test_server_startup(self):
        """
        This method tests if the server starts up successfully
        """
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            response = stub.Health(backend_pb2.HealthMessage())
            self.assertEqual(response.message, b'OK', "Server failed to start")
    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
            self.assertTrue(response.success, "LoadModel service failed")
            self.assertEqual(response.message, "Model loaded successfully")
    def test_tts(self):
        """
        This method tests if the TTS endpoint answers after a model has been loaded
        """
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
            self.assertTrue(response.success, "LoadModel service failed")
            tts_request = backend_pb2.TTSRequest(text="Hey, how are you doing today?")
            tts_response = stub.TTS(tts_request)
            self.assertIsNotNone(tts_response, "TTS service failed")
--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@@ -1,17 +1,27 @@
 .PHONY: petals
-petals:
+petals: protogen
 	@echo "Creating virtual environment..."
 	bash install.sh "petals.yml"
 	@echo "Virtual environment created."
 .PHONY: run
-run:
+run: protogen
 	@echo "Running petals..."
 	bash run.sh
 	@echo "petals run."
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing petals..."
 	bash test.sh
 	@echo "petals tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/petals/backend_pb2.py
+++ b/backend/python/petals/backend_pb2.py
--- a/backend/python/petals/backend_pb2_grpc.py
+++ b/backend/python/petals/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Missing associated documentation comment in .proto file."""
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )
 class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""
    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Missing associated documentation comment in .proto file."""
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/rerankers/Makefile
+++ b/backend/python/rerankers/Makefile
@@ -0,0 +1,27 @@
 .PHONY: rerankers
 rerankers: protogen
 	$(MAKE) -C ../common-env/transformers
 .PHONY: run
 run: protogen
 	@echo "Running rerankers..."
 	bash run.sh
 	@echo "rerankers run."
 # Running the tests from the command line does not work well; they only work from an IDE such as VSCode.
 .PHONY: test
 test: protogen
 	@echo "Testing rerankers..."
 	bash test.sh
 	@echo "rerankers tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/rerankers/README.md
+++ b/backend/python/rerankers/README.md
@@ -0,0 +1,5 @@
 # Creating a separate environment for the reranker project
 ```
 make rerankers
 ```
--- a/backend/python/rerankers/reranker.py
+++ b/backend/python/rerankers/reranker.py
@@ -0,0 +1,123 @@
 #!/usr/bin/env python3
 """
 Extra gRPC server for Rerankers models.
 """
 from concurrent import futures
 import argparse
 import signal
 import sys
 import os
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 from rerankers import Reranker
 # Length of one sleep interval used to keep the main thread alive (24 hours, in seconds)
 _ONE_DAY_IN_SECONDS = 24 * 60 * 60
 # Size of the gRPC worker pool: taken from PYTHON_GRPC_MAX_WORKERS when set, otherwise 1
 MAX_WORKERS = int(os.getenv('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer that exposes a `rerankers` model through the backend service.
    This class implements the Health, LoadModel and Rerank methods; every other
    RPC is inherited unchanged from the generated base class.
    """
    def Health(self, request, context):
        """
        A gRPC method that returns the health status of the backend service.
        Args:
            request: A HealthMessage object (not inspected).
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Reply object whose message is the bytes b"OK".
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        A gRPC method that creates the Reranker and stores it on the servicer.
        Args:
            request: A ModelOptions object. `Model` is passed to Reranker as the model name,
                a non-empty `Type` as `model_type` and a non-empty `PipelineType` as `lang`.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Result object. On any exception raised while building the Reranker,
            success is False and message describes the error.
        """
        model_name = request.Model
        try:
            kwargs = {}
            if request.Type != "":
                kwargs['model_type'] = request.Type
            if request.PipelineType != "": # Reuse the PipelineType field for language
                kwargs['lang'] = request.PipelineType
            model = Reranker(model_name, **kwargs)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Only publish the new state once construction succeeded, so a failed
        # load never leaves model_name and model out of sync.
        self.model_name = model_name
        self.model = model
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def Rerank(self, request, context):
        """
        A gRPC method that ranks the request documents against the query.
        Args:
            request: A RerankRequest object carrying `query`, `documents` and `top_n`.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A RerankResult object holding one DocumentResult per returned document
            (at most `top_n` of them when `top_n` is positive) and a Usage message.
            If no model has been loaded, the RPC is marked FAILED_PRECONDITION and
            an empty RerankResult is returned.
        """
        if getattr(self, "model", None) is None:
            # Rerank was called before a successful LoadModel: report a clear
            # gRPC error instead of failing with an AttributeError.
            context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
            context.set_details("No model loaded; call LoadModel first")
            return backend_pb2.RerankResult()
        documents = list(request.documents)
        ranked_results = self.model.rank(query=request.query, docs=documents, doc_ids=list(range(len(documents))))
        ranked = ranked_results.results
        # Honour top_n: a positive value caps the number of returned results.
        # NOTE(review): assumes an unset top_n arrives as 0 and that
        # ranked_results.results is ordered best-first (the accompanying test
        # expects the best match at index 0) -- confirm against the rerankers library.
        if request.top_n > 0:
            ranked = ranked[:request.top_n]
        # Prepare results to return
        results = [
            backend_pb2.DocumentResult(
                index=res.doc_id,
                text=res.text,
                relevance_score=res.score
            ) for res in ranked
        ]
        # Calculate the usage and total tokens
        # TODO: Implement the usage calculation with reranker
        # Until then, "tokens" are approximated by whitespace-separated words.
        prompt_tokens = len(request.query.split())
        total_tokens = sum(len(doc.split()) for doc in documents) + prompt_tokens
        usage = backend_pb2.Usage(total_tokens=total_tokens, prompt_tokens=prompt_tokens)
        return backend_pb2.RerankResult(usage=usage, results=results)
 def serve(address):
    """
    Start the gRPC backend on `address` and keep the process alive until it is stopped.
    Args:
        address: The "host:port" string handed to add_insecure_port.
    """
    worker_pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    grpc_server = grpc.server(worker_pool)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), grpc_server)
    grpc_server.add_insecure_port(address)
    grpc_server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    def _shutdown(signum, stack_frame):
        # Stop without a grace period, then end the process.
        print("Received termination signal. Shutting down...")
        grpc_server.stop(0)
        sys.exit(0)
    # The same handler serves both SIGINT and SIGTERM.
    for termination_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(termination_signal, _shutdown)
    # Sleep in the main thread after start() so the process keeps running.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        grpc_server.stop(0)
 if __name__ == "__main__":
    # Command-line entry point: read --addr and serve on that address.
    arg_parser = argparse.ArgumentParser(description="Run the gRPC server.")
    arg_parser.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    cli_args = arg_parser.parse_args()
    serve(cli_args.addr)
--- a/backend/python/rerankers/run.sh
+++ b/backend/python/rerankers/run.sh
@@ -0,0 +1,14 @@
 #!/bin/bash
 ##
 ## A bash script wrapper that runs the reranker server with conda
 export PATH=$PATH:/opt/conda/bin
 # Activate conda environment
 source activate transformers
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # Quote the script path and forward every argument as-is ("$@") so that
 # paths or arguments containing spaces are not word-split.
 python "$DIR/reranker.py" "$@"
--- a/backend/python/rerankers/test.sh
+++ b/backend/python/rerankers/test.sh
@@ -0,0 +1,11 @@
 #!/bin/bash
 ##
 ## A bash script wrapper that runs the reranker unit tests with conda
 # Activate conda environment
 source activate transformers
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # Quote the path so a checkout directory containing spaces still works.
 python -m unittest "$DIR/test_reranker.py"
--- a/backend/python/rerankers/test_reranker.py
+++ b/backend/python/rerankers/test_reranker.py
@@ -0,0 +1,90 @@
 """
 A test script to test the gRPC service
 """
 import unittest
 import subprocess
 import time
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service.
    unittest runs setUp before and tearDown after every test method, so the
    tests must not call them again: doing so would start a second server on
    the same port and lose the handle of the first process, which would then
    never be killed.
    """
    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "reranker.py", "--addr", "localhost:50051"])
        # Fixed pause to give the server time to come up before the test connects.
        time.sleep(10)
    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by killing the server process and waiting for it to exit
        """
        self.service.kill()
        self.service.wait()
    def test_server_startup(self):
        """
        This method tests if the server starts up successfully
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
    def test_rerank(self):
        """
        This method tests if the documents are reranked in the expected order
        """
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                request = backend_pb2.RerankRequest(
                    query="I love you",
                    documents=["I hate you", "I really like you"],
                    top_n=2
                )
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
                self.assertTrue(response.success)
                rerank_response = stub.Rerank(request)
                print(rerank_response.results[0])
                self.assertIsNotNone(rerank_response.results)
                self.assertEqual(len(rerank_response.results), 2)
                self.assertEqual(rerank_response.results[0].text, "I really like you")
                self.assertEqual(rerank_response.results[1].text, "I hate you")
        except Exception as err:
            print(err)
            self.fail("Reranker service failed")
--- a/backend/python/sentencetransformers/Makefile
+++ b/backend/python/sentencetransformers/Makefile
@@ -1,17 +1,27 @@
 .PHONY: sentencetransformers
-sentencetransformers:
+sentencetransformers: protogen
 	$(MAKE) -C ../common-env/transformers
 .PHONY: run
-run:
+run: protogen
 	@echo "Running sentencetransformers..."
 	bash run.sh
 	@echo "sentencetransformers run."
 # Running the tests from the command line does not work well; it only works from an IDE such as VSCode.
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing sentencetransformers..."
 	bash test.sh
 	@echo "sentencetransformers tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/sentencetransformers/backend_pb2.py
+++ b/backend/python/sentencetransformers/backend_pb2.py
--- a/backend/python/sentencetransformers/backend_pb2_grpc.py
+++ b/backend/python/sentencetransformers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Missing associated documentation comment in .proto file."""
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )
 class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""
    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Missing associated documentation comment in .proto file."""
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers-musicgen/Makefile
+++ b/backend/python/transformers-musicgen/Makefile
@@ -1,16 +1,25 @@
 .PHONY: transformers-musicgen
-transformers-musicgen:
+transformers-musicgen: protogen
 	$(MAKE) -C ../common-env/transformers
 .PHONY: run
-run:
+run: protogen
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/transformers-musicgen/backend_pb2.py
+++ b/backend/python/transformers-musicgen/backend_pb2.py
--- a/backend/python/transformers-musicgen/backend_pb2_grpc.py
+++ b/backend/python/transformers-musicgen/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Missing associated documentation comment in .proto file."""
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )
 class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""
    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Missing associated documentation comment in .proto file."""
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers-musicgen/run.sh
+++ b/backend/python/transformers-musicgen/run.sh
@@ -8,7 +8,7 @@ echo "Launching gRPC server for transformers-musicgen"
 export PATH=$PATH:/opt/conda/bin
 # Activate conda environment
-source activate transformers-musicgen
+source activate transformers
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/transformers/Makefile
+++ b/backend/python/transformers/Makefile
@@ -1,16 +1,26 @@
 .PHONY: transformers
-transformers:
+transformers: protogen
 	$(MAKE) -C ../common-env/transformers
 .PHONY: run
-run:
+run: protogen
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."
 # It is not working well by using command line. It only6 works with IDE like VSCode.
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/transformers/backend_pb2.py
+++ b/backend/python/transformers/backend_pb2.py
--- a/backend/python/transformers/backend_pb2_grpc.py
+++ b/backend/python/transformers/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Missing associated documentation comment in .proto file."""
    def __init__(self, channel):
        """Constructor.
        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )
 class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""
    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
 def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Missing associated documentation comment in .proto file."""
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -148,7 +148,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                else:
                    device_map="CPU"
                self.model = OVModelForCausalLM.from_pretrained(model_name, 
-                                                                compile=True, 
+                                                                compile=True,
                                                                trust_remote_code=request.TrustRemoteCode,
                                                                ov_config={"PERFORMANCE_HINT": "LATENCY"}, 
                                                                device=device_map)
                self.OV = True
            else:
@@ -158,6 +160,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                                                       quantization_config=quantization, 
                                                       device_map=device_map, 
                                                       torch_dtype=compute)
            if request.ContextSize > 0:
                self.max_tokens = request.ContextSize
            else:
                self.max_tokens = self.model.config.max_position_embeddings
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
            self.XPU = False
@@ -212,12 +219,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        set_seed(request.Seed)
        if request.TopP == 0:
            request.TopP = 0.9
        if request.TopK == 0:
            request.TopK = 40
        prompt = request.Prompt
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:    
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
        eos_token_id = self.tokenizer.eos_token_id
        if request.StopPrompts:
            eos_token_id = []
            for word in request.StopPrompts:
                eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
        inputs = self.tokenizer(prompt, return_tensors="pt")
        max_tokens = 200
        if request.Tokens > 0:
            max_tokens = request.Tokens
        else:
            max_tokens = self.max_tokens - inputs["input_ids"].size()[inputs["input_ids"].dim()-1]
        inputs = self.tokenizer(request.Prompt, return_tensors="pt")
        if self.CUDA:
            inputs = inputs.to("cuda")
        if XPU and self.OV == False:
@@ -235,7 +257,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        top_k=request.TopK, 
                        do_sample=True,
                        attention_mask=inputs["attention_mask"],
-                        eos_token_id=self.tokenizer.eos_token_id,
+                        eos_token_id=eos_token_id,
                        pad_token_id=self.tokenizer.eos_token_id,
                        streamer=streamer)
            thread=Thread(target=self.model.generate, kwargs=config)
@@ -264,7 +286,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        top_k=request.TopK, 
                        do_sample=True,
                        attention_mask=inputs["attention_mask"],
-                        eos_token_id=self.tokenizer.eos_token_id,
+                        eos_token_id=eos_token_id,
                        pad_token_id=self.tokenizer.eos_token_id)
            generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
--- a/backend/python/vall-e-x/Makefile
+++ b/backend/python/vall-e-x/Makefile
@@ -3,18 +3,28 @@ export SKIP_CONDA=1
 endif
 .PHONY: ttsvalle
-ttsvalle:
+ttsvalle: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh
 .PHONY: run
-run:
+run: protogen
 	@echo "Running ttsvalle..."
 	bash run.sh
 	@echo "ttsvalle run."
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing valle..."
 	bash test.sh
 	@echo "valle tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/vall-e-x/backend_pb2.py
+++ b/backend/python/vall-e-x/backend_pb2.py
--- a/backend/python/vall-e-x/backend_pb2_grpc.py
+++ b/backend/python/vall-e-x/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Client-side stub holding one callable attribute per RPC of ``backend.Backend``."""
    def __init__(self, channel):
        """Create the per-RPC callables on top of ``channel``.
        Every RPC is registered as unary-unary except ``PredictStream``, which is
        registered as unary-stream.
        Args:
            channel: A grpc.Channel.
        """
        pb = backend__pb2
        # Requests are encoded with SerializeToString, replies decoded with FromString.
        self.Health = channel.unary_unary(
            '/backend.Backend/Health',
            request_serializer=pb.HealthMessage.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.Predict = channel.unary_unary(
            '/backend.Backend/Predict',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.LoadModel = channel.unary_unary(
            '/backend.Backend/LoadModel',
            request_serializer=pb.ModelOptions.SerializeToString,
            response_deserializer=pb.Result.FromString)
        self.PredictStream = channel.unary_stream(
            '/backend.Backend/PredictStream',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.Embedding = channel.unary_unary(
            '/backend.Backend/Embedding',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.EmbeddingResult.FromString)
        self.GenerateImage = channel.unary_unary(
            '/backend.Backend/GenerateImage',
            request_serializer=pb.GenerateImageRequest.SerializeToString,
            response_deserializer=pb.Result.FromString)
        self.AudioTranscription = channel.unary_unary(
            '/backend.Backend/AudioTranscription',
            request_serializer=pb.TranscriptRequest.SerializeToString,
            response_deserializer=pb.TranscriptResult.FromString)
        self.TTS = channel.unary_unary(
            '/backend.Backend/TTS',
            request_serializer=pb.TTSRequest.SerializeToString,
            response_deserializer=pb.Result.FromString)
        self.TokenizeString = channel.unary_unary(
            '/backend.Backend/TokenizeString',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.TokenizationResponse.FromString)
        self.Status = channel.unary_unary(
            '/backend.Backend/Status',
            request_serializer=pb.HealthMessage.SerializeToString,
            response_deserializer=pb.StatusResponse.FromString)
 class BackendServicer(object):
    """Base servicer for ``backend.Backend``; every method here only reports UNIMPLEMENTED."""
    def Health(self, request, context):
        """Flag the Health call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def Predict(self, request, context):
        """Flag the Predict call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def LoadModel(self, request, context):
        """Flag the LoadModel call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def PredictStream(self, request, context):
        """Flag the PredictStream call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def Embedding(self, request, context):
        """Flag the Embedding call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def GenerateImage(self, request, context):
        """Flag the GenerateImage call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def AudioTranscription(self, request, context):
        """Flag the AudioTranscription call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def TTS(self, request, context):
        """Flag the TTS call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def TokenizeString(self, request, context):
        """Flag the TokenizeString call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def Status(self, request, context):
        """Flag the Status call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
 def add_BackendServicer_to_server(servicer, server):
    """Register the methods of ``servicer`` on ``server`` under the name ``backend.Backend``.
    Args:
        servicer: object providing the ten RPC methods looked up below.
        server: object exposing ``add_generic_rpc_handlers``.
    """
    pb = backend__pb2
    unary_unary = grpc.unary_unary_rpc_method_handler
    unary_stream = grpc.unary_stream_rpc_method_handler
    # Incoming requests are decoded with FromString; replies are encoded with SerializeToString.
    handlers_by_rpc = {
            'Health': unary_unary(
                    servicer.Health,
                    request_deserializer=pb.HealthMessage.FromString,
                    response_serializer=pb.Reply.SerializeToString),
            'Predict': unary_unary(
                    servicer.Predict,
                    request_deserializer=pb.PredictOptions.FromString,
                    response_serializer=pb.Reply.SerializeToString),
            'LoadModel': unary_unary(
                    servicer.LoadModel,
                    request_deserializer=pb.ModelOptions.FromString,
                    response_serializer=pb.Result.SerializeToString),
            'PredictStream': unary_stream(
                    servicer.PredictStream,
                    request_deserializer=pb.PredictOptions.FromString,
                    response_serializer=pb.Reply.SerializeToString),
            'Embedding': unary_unary(
                    servicer.Embedding,
                    request_deserializer=pb.PredictOptions.FromString,
                    response_serializer=pb.EmbeddingResult.SerializeToString),
            'GenerateImage': unary_unary(
                    servicer.GenerateImage,
                    request_deserializer=pb.GenerateImageRequest.FromString,
                    response_serializer=pb.Result.SerializeToString),
            'AudioTranscription': unary_unary(
                    servicer.AudioTranscription,
                    request_deserializer=pb.TranscriptRequest.FromString,
                    response_serializer=pb.TranscriptResult.SerializeToString),
            'TTS': unary_unary(
                    servicer.TTS,
                    request_deserializer=pb.TTSRequest.FromString,
                    response_serializer=pb.Result.SerializeToString),
            'TokenizeString': unary_unary(
                    servicer.TokenizeString,
                    request_deserializer=pb.PredictOptions.FromString,
                    response_serializer=pb.TokenizationResponse.SerializeToString),
            'Status': unary_unary(
                    servicer.Status,
                    request_deserializer=pb.HealthMessage.FromString,
                    response_serializer=pb.StatusResponse.SerializeToString),
    }
    service_handler = grpc.method_handlers_generic_handler('backend.Backend', handlers_by_rpc)
    server.add_generic_rpc_handlers((service_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Static helpers that call each ``backend.Backend`` RPC through ``grpc.experimental``.
    Each helper forwards its arguments unchanged and differs only in the method path
    and the request/response message types.
    """
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/Health`` on ``target`` (HealthMessage in, Reply out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Health',
            request_serializer=backend__pb2.HealthMessage.SerializeToString,
            response_deserializer=backend__pb2.Reply.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/Predict`` on ``target`` (PredictOptions in, Reply out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Predict',
            request_serializer=backend__pb2.PredictOptions.SerializeToString,
            response_deserializer=backend__pb2.Reply.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/LoadModel`` on ``target`` (ModelOptions in, Result out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/LoadModel',
            request_serializer=backend__pb2.ModelOptions.SerializeToString,
            response_deserializer=backend__pb2.Result.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/PredictStream`` on ``target`` via ``unary_stream`` (PredictOptions in, Reply out)."""
        return grpc.experimental.unary_stream(
            request, target, '/backend.Backend/PredictStream',
            request_serializer=backend__pb2.PredictOptions.SerializeToString,
            response_deserializer=backend__pb2.Reply.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/Embedding`` on ``target`` (PredictOptions in, EmbeddingResult out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Embedding',
            request_serializer=backend__pb2.PredictOptions.SerializeToString,
            response_deserializer=backend__pb2.EmbeddingResult.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/GenerateImage`` on ``target`` (GenerateImageRequest in, Result out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/GenerateImage',
            request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
            response_deserializer=backend__pb2.Result.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/AudioTranscription`` on ``target`` (TranscriptRequest in, TranscriptResult out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/AudioTranscription',
            request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
            response_deserializer=backend__pb2.TranscriptResult.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/TTS`` on ``target`` (TTSRequest in, Result out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/TTS',
            request_serializer=backend__pb2.TTSRequest.SerializeToString,
            response_deserializer=backend__pb2.Result.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/TokenizeString`` on ``target`` (PredictOptions in, TokenizationResponse out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/TokenizeString',
            request_serializer=backend__pb2.PredictOptions.SerializeToString,
            response_deserializer=backend__pb2.TokenizationResponse.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Send ``request`` to ``/backend.Backend/Status`` on ``target`` (HealthMessage in, StatusResponse out)."""
        return grpc.experimental.unary_unary(
            request, target, '/backend.Backend/Status',
            request_serializer=backend__pb2.HealthMessage.SerializeToString,
            response_deserializer=backend__pb2.StatusResponse.FromString,
            options=options,
            channel_credentials=channel_credentials,
            insecure=insecure,
            call_credentials=call_credentials,
            compression=compression,
            wait_for_ready=wait_for_ready,
            timeout=timeout,
            metadata=metadata)
--- a/backend/python/vllm/Makefile
+++ b/backend/python/vllm/Makefile
@@ -1,15 +1,25 @@
 .PHONY: vllm
-vllm:
+vllm: protogen
 	$(MAKE) -C ../common-env/transformers
 .PHONY: run
-run:
+run: protogen
 	@echo "Running vllm..."
 	bash run.sh
 	@echo "vllm run."
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing vllm..."
 	bash test.sh
-	@echo "vllm tested."
+	@echo "vllm tested."
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/vllm/backend_pb2.py
+++ b/backend/python/vllm/backend_pb2.py
--- a/backend/python/vllm/backend_pb2_grpc.py
+++ b/backend/python/vllm/backend_pb2_grpc.py
@@ -1,363 +0,0 @@
 # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
 """Client and server classes corresponding to protobuf-defined services."""
 import grpc
 import backend_pb2 as backend__pb2
 class BackendStub(object):
    """Client-side stub holding one callable attribute per RPC of ``backend.Backend``."""
    def __init__(self, channel):
        """Create the per-RPC callables on top of ``channel``.
        Every RPC is registered as unary-unary except ``PredictStream``, which is
        registered as unary-stream.
        Args:
            channel: A grpc.Channel.
        """
        pb = backend__pb2
        # Requests are encoded with SerializeToString, replies decoded with FromString.
        self.Health = channel.unary_unary(
            '/backend.Backend/Health',
            request_serializer=pb.HealthMessage.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.Predict = channel.unary_unary(
            '/backend.Backend/Predict',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.LoadModel = channel.unary_unary(
            '/backend.Backend/LoadModel',
            request_serializer=pb.ModelOptions.SerializeToString,
            response_deserializer=pb.Result.FromString)
        self.PredictStream = channel.unary_stream(
            '/backend.Backend/PredictStream',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.Reply.FromString)
        self.Embedding = channel.unary_unary(
            '/backend.Backend/Embedding',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.EmbeddingResult.FromString)
        self.GenerateImage = channel.unary_unary(
            '/backend.Backend/GenerateImage',
            request_serializer=pb.GenerateImageRequest.SerializeToString,
            response_deserializer=pb.Result.FromString)
        self.AudioTranscription = channel.unary_unary(
            '/backend.Backend/AudioTranscription',
            request_serializer=pb.TranscriptRequest.SerializeToString,
            response_deserializer=pb.TranscriptResult.FromString)
        self.TTS = channel.unary_unary(
            '/backend.Backend/TTS',
            request_serializer=pb.TTSRequest.SerializeToString,
            response_deserializer=pb.Result.FromString)
        self.TokenizeString = channel.unary_unary(
            '/backend.Backend/TokenizeString',
            request_serializer=pb.PredictOptions.SerializeToString,
            response_deserializer=pb.TokenizationResponse.FromString)
        self.Status = channel.unary_unary(
            '/backend.Backend/Status',
            request_serializer=pb.HealthMessage.SerializeToString,
            response_deserializer=pb.StatusResponse.FromString)
 class BackendServicer(object):
    """Base servicer for ``backend.Backend``; every method here only reports UNIMPLEMENTED."""
    def Health(self, request, context):
        """Flag the Health call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def Predict(self, request, context):
        """Flag the Predict call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def LoadModel(self, request, context):
        """Flag the LoadModel call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def PredictStream(self, request, context):
        """Flag the PredictStream call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def Embedding(self, request, context):
        """Flag the Embedding call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def GenerateImage(self, request, context):
        """Flag the GenerateImage call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def AudioTranscription(self, request, context):
        """Flag the AudioTranscription call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def TTS(self, request, context):
        """Flag the TTS call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def TokenizeString(self, request, context):
        """Flag the TokenizeString call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
    def Status(self, request, context):
        """Flag the Status call as UNIMPLEMENTED on ``context`` and raise NotImplementedError."""
        reason = 'Method not implemented!'
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details(reason)
        raise NotImplementedError(reason)
 def add_BackendServicer_to_server(servicer, server):
    """Register the methods of ``servicer`` on ``server`` under the name ``backend.Backend``.
    Args:
        servicer: object providing the ten RPC methods looked up below.
        server: object exposing ``add_generic_rpc_handlers``.
    """
    pb = backend__pb2
    unary_unary = grpc.unary_unary_rpc_method_handler
    unary_stream = grpc.unary_stream_rpc_method_handler
    # Incoming requests are decoded with FromString; replies are encoded with SerializeToString.
    handlers_by_rpc = {
            'Health': unary_unary(
                    servicer.Health,
                    request_deserializer=pb.HealthMessage.FromString,
                    response_serializer=pb.Reply.SerializeToString),
            'Predict': unary_unary(
                    servicer.Predict,
                    request_deserializer=pb.PredictOptions.FromString,
                    response_serializer=pb.Reply.SerializeToString),
            'LoadModel': unary_unary(
                    servicer.LoadModel,
                    request_deserializer=pb.ModelOptions.FromString,
                    response_serializer=pb.Result.SerializeToString),
            'PredictStream': unary_stream(
                    servicer.PredictStream,
                    request_deserializer=pb.PredictOptions.FromString,
                    response_serializer=pb.Reply.SerializeToString),
            'Embedding': unary_unary(
                    servicer.Embedding,
                    request_deserializer=pb.PredictOptions.FromString,
                    response_serializer=pb.EmbeddingResult.SerializeToString),
            'GenerateImage': unary_unary(
                    servicer.GenerateImage,
                    request_deserializer=pb.GenerateImageRequest.FromString,
                    response_serializer=pb.Result.SerializeToString),
            'AudioTranscription': unary_unary(
                    servicer.AudioTranscription,
                    request_deserializer=pb.TranscriptRequest.FromString,
                    response_serializer=pb.TranscriptResult.SerializeToString),
            'TTS': unary_unary(
                    servicer.TTS,
                    request_deserializer=pb.TTSRequest.FromString,
                    response_serializer=pb.Result.SerializeToString),
            'TokenizeString': unary_unary(
                    servicer.TokenizeString,
                    request_deserializer=pb.PredictOptions.FromString,
                    response_serializer=pb.TokenizationResponse.SerializeToString),
            'Status': unary_unary(
                    servicer.Status,
                    request_deserializer=pb.HealthMessage.FromString,
                    response_serializer=pb.StatusResponse.SerializeToString),
    }
    service_handler = grpc.method_handlers_generic_handler('backend.Backend', handlers_by_rpc)
    server.add_generic_rpc_handlers((service_handler,))
 # This class is part of an EXPERIMENTAL API.
 class Backend(object):
    """Static convenience wrappers around the '/backend.Backend/*' gRPC methods.
    Every static method forwards its arguments to grpc.experimental.unary_unary
    (grpc.experimental.unary_stream for PredictStream) together with the RPC
    path and the backend__pb2 request serializer / response deserializer.
    The .proto file carries no documentation comment for this service.
    """
    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/Health' (unary-unary): HealthMessage serializer, Reply deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/Predict' (unary-unary): PredictOptions serializer, Reply deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/LoadModel' (unary-unary): ModelOptions serializer, Result deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/PredictStream' (unary-stream): PredictOptions serializer, Reply deserializer."""
        # The only method here that goes through unary_stream instead of unary_unary.
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/Embedding' (unary-unary): PredictOptions serializer, EmbeddingResult deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/GenerateImage' (unary-unary): GenerateImageRequest serializer, Result deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/AudioTranscription' (unary-unary): TranscriptRequest serializer, TranscriptResult deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/TTS' (unary-unary): TTSRequest serializer, Result deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/TokenizeString' (unary-unary): PredictOptions serializer, TokenizationResponse deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/backend.Backend/Status' (unary-unary): HealthMessage serializer, StatusResponse deserializer."""
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/vllm/backend_vllm.py
+++ b/backend/python/vllm/backend_vllm.py
@@ -14,6 +14,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -71,7 +72,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
-    def LoadModel(self, request, context):
+    async def LoadModel(self, request, context):
        """
        Loads a language model.
@@ -94,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            engine_args.trust_remote_code = request.TrustRemoteCode
        if request.EnforceEager:
            engine_args.enforce_eager = request.EnforceEager
        if request.TensorParallelSize:
            engine_args.tensor_parallel_size = request.TensorParallelSize
        if request.SwapSpace != 0:
            engine_args.swap_space = request.SwapSpace
        if request.MaxModelLen != 0:
@@ -103,6 +106,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        try:
           engine_model_config = await self.llm.get_model_config()
           self.tokenizer = get_tokenizer(
               engine_model_config.tokenizer,
               tokenizer_mode=engine_model_config.tokenizer_mode,
               trust_remote_code=engine_model_config.trust_remote_code,
               truncation_side="left",
           )
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    async def Predict(self, request, context):
@@ -161,9 +176,15 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        if request.Seed != 0:
            sampling_params.seed = request.Seed
        prompt = request.Prompt
        # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
        # Generate text
        request_id = random_uuid()
-        outputs = self.llm.generate(request.Prompt, sampling_params, request_id)
+        outputs = self.llm.generate(prompt, sampling_params, request_id)
        # Stream the results
        generated_text = ""
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -2,6 +2,7 @@ package backend
 import (
 	"context"
 	"fmt"
 	"os"
 	"regexp"
 	"strings"
@@ -9,9 +10,11 @@ import (
 	"unicode/utf8"
 	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/core/schema"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/grpc"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 )
@@ -26,7 +29,7 @@ type TokenUsage struct {
 	Completion int
 }
-func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	threads := c.Threads
 	if *threads == 0 && o.Threads != 0 {
@@ -71,10 +74,30 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
 		return nil, err
 	}
 	var protoMessages []*proto.Message
 	// if we are using the tokenizer template, we need to convert the messages to proto messages
 	// unless the prompt has already been tokenized (non-chat endpoints + functions)
 	if c.TemplateConfig.UseTokenizerTemplate && s == "" {
 		protoMessages = make([]*proto.Message, len(messages), len(messages))
 		for i, message := range messages {
 			protoMessages[i] = &proto.Message{
 				Role: message.Role,
 			}
 			switch ct := message.Content.(type) {
 			case string:
 				protoMessages[i].Content = ct
 			default:
 				return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
 			}
 		}
 	}
 	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
 	fn := func() (LLMResponse, error) {
 		opts := gRPCPredictOpts(c, loader.ModelPath)
 		opts.Prompt = s
 		opts.Messages = protoMessages
 		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
 		opts.Images = images
 		tokenUsage := TokenUsage{}
@@ -130,6 +153,12 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
 			if err != nil {
 				return LLMResponse{}, err
 			}
 			if tokenUsage.Prompt == 0 {
 				tokenUsage.Prompt = int(reply.PromptTokens)
 			}
 			if tokenUsage.Completion == 0 {
 				tokenUsage.Completion = int(reply.Tokens)
 			}
 			return LLMResponse{
 				Response: string(reply.Message),
 				Usage:    tokenUsage,
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -74,6 +74,7 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		EnforceEager:         c.EnforceEager,
 		SwapSpace:            int32(c.SwapSpace),
 		MaxModelLen:          int32(c.MaxModelLen),
 		TensorParallelSize:   int32(c.TensorParallelSize),
 		MMProj:               c.MMProj,
 		YarnExtFactor:        c.YarnExtFactor,
 		YarnAttnFactor:       c.YarnAttnFactor,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -0,0 +1,39 @@
 package backend
 import (
 	"context"
 	"fmt"
 	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )
 // Rerank loads modelFile on the named backend through loader and runs the
 // given rerank request against the loaded model.
 //
 // It returns an error when backend is empty, when loader.BackendLoader fails,
 // or when the loader hands back a nil model. Otherwise it returns the result
 // and error of the model's Rerank call, which is issued with
 // context.Background().
 func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	if backend == "" {
 		return nil, fmt.Errorf("backend is required")
 	}
 	// The gRPC load options are derived from the caller's backend config.
 	loadModelOpts := gRPCModelOpts(backendConfig)
 	loaderOptions := []model.Option{
 		model.WithBackendString(backend),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
 		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(loadModelOpts),
 	}
 	// NOTE(review): modelOpts receives an empty config.BackendConfig rather
 	// than backendConfig — confirm this is intended.
 	reranker, err := loader.BackendLoader(modelOpts(config.BackendConfig{}, appConfig, loaderOptions)...)
 	switch {
 	case err != nil:
 		return nil, err
 	case reranker == nil:
 		return nil, fmt.Errorf("could not load rerank model")
 	}
 	return reranker.Rerank(context.Background(), request)
 }
--- a/core/cli/cli.go
+++ b/core/cli/cli.go
@@ -0,0 +1,20 @@
 package cli
 import "embed"
 // Context holds the global options that are embedded into the CLI struct
 // declared in this file.
 type Context struct {
 	// Debug is the deprecated, hidden debug switch; it is read from LOCALAI_DEBUG
 	// or DEBUG and defaults to false. Its help text points to --log-level=debug.
 	Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
 	// LogLevel is read from LOCALAI_LOG_LEVEL and is restricted to the values
 	// listed in its enum tag. It is a pointer, so it can be left unset (nil).
 	LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`
 	// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
 	BackendAssets embed.FS `kong:"-"`
 }
 // CLI is the root command-line definition. It embeds Context for the global
 // options and declares the Run, Models, TTS and Transcript commands through
 // struct tags.
 var CLI struct {
 	Context `embed:""`
 	// Run is tagged default:"withargs"; per its help text it is the command
 	// used when no other command is specified.
 	Run        RunCMD        `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
 	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
 	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
 	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
 }
--- a/Show More
+++ b/Show More