Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 03:02:38 -05:00)

Compare commits: v2.12.3...revert-205 (69 commits)
| SHA1 |
|---|
| 47743b74ab |
| af9e5a2d05 |
| af8c705ecd |
| 5763dc1613 |
| 6b06d4e0af |
| bcaa320f36 |
| 33c78d2228 |
| df4a13a08b |
| fdec8a9d00 |
| 0cc1ad2188 |
| cdece3879f |
| 320d8a48d9 |
| 46609e936e |
| b72c6cc9fc |
| 538a086309 |
| c751a4ac06 |
| e843d7df0e |
| de3a1a0a8e |
| 57bd365d87 |
| b739cbb86b |
| 4486db912b |
| 6b07ded119 |
| d5699dbf4f |
| 0fdff26924 |
| 619f2517a4 |
| b91820b7f8 |
| 4e74560649 |
| 95244ed6e7 |
| f1f39eea3f |
| eed5706994 |
| 1981154f49 |
| a8ebf6f575 |
| 912d2dccfa |
| fcb63aed8a |
| 0e549424e7 |
| 69d638268b |
| 18eea9088a |
| fb105837ba |
| 7e52c8e21a |
| d068839896 |
| e0dee52a2a |
| 677e20756b |
| b2785ff06e |
| da82ce81b5 |
| 70c4f110a4 |
| 099bd54ff2 |
| 12c0d9443e |
| cbda06fb96 |
| b1a242251c |
| fce606fc0f |
| b606c7b768 |
| 0a6956b029 |
| 821cf0e3fd |
| 11a0418510 |
| 40781ac013 |
| fdfd868953 |
| 0795975486 |
| a49248d29f |
| 182fef339d |
| c74dec7e38 |
| b4548ad72d |
| e152b07b74 |
| 0e44a4e664 |
| 24d7dadfed |
| 92005b9c02 |
| 636d487dc8 |
| 93f51d80d4 |
| 36da11a0ee |
| d23e73b118 |
@@ -1,4 +1,6 @@
.idea
.github
.vscode
models
examples/chatbot-ui/models
examples/rwkv/models
.env (38)
@@ -1,33 +1,33 @@
## Set number of threads.
## Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably.
# THREADS=14
# LOCALAI_THREADS=14

## Specify a different bind address (defaults to ":8080")
# ADDRESS=127.0.0.1:8080
# LOCALAI_ADDRESS=127.0.0.1:8080

## Default models context size
# CONTEXT_SIZE=512
# LOCALAI_CONTEXT_SIZE=512
#
## Define galleries.
## Models available to install will be listed in `/models/available`
# GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
# LOCALAI_GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
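The gallery setting above only tells LocalAI where model definitions live; installation happens through the API. A minimal sketch, assuming a running instance on :8080 with galleries configured as above (the model name is illustrative):

```bash
# List the models the configured galleries expose:
curl http://localhost:8080/models/available

# Ask LocalAI to install one of them:
curl -X POST http://localhost:8080/models/apply \
  -H "Content-Type: application/json" \
  -d '{"id": "model-gallery@bert-embeddings"}'
```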

## CORS settings
# CORS=true
# CORS_ALLOW_ORIGINS=*
# LOCALAI_CORS=true
# LOCALAI_CORS_ALLOW_ORIGINS=*

## Default path for models
#
# MODELS_PATH=/models
# LOCALAI_MODELS_PATH=/models

## Enable debug mode
# DEBUG=true
# LOCALAI_LOG_LEVEL=debug

## Disables COMPEL (Diffusers)
# COMPEL=0

## Enable/Disable single backend (useful if only one GPU is available)
# SINGLE_ACTIVE_BACKEND=true
# LOCALAI_SINGLE_ACTIVE_BACKEND=true

## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
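The build type is picked up at compile time. A minimal sketch of a cuBLAS build, assuming the CUDA toolkit is already installed on the build host (see https://localai.io/basics/build/ for the authoritative steps):

```bash
# Build LocalAI from source with GPU acceleration via cuBLAS.
# BUILD_TYPE accepts the values listed above: cublas, openblas, clblas.
git clone https://github.com/mudler/LocalAI.git
cd LocalAI
BUILD_TYPE=cublas make build
```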
@@ -46,13 +46,13 @@
# GO_TAGS=stablediffusion

## Path where generated images are stored
# IMAGE_PATH=/tmp
# LOCALAI_IMAGE_PATH=/tmp/generated/images

## Specify a default upload limit in MB (whisper)
# UPLOAD_LIMIT
# LOCALAI_UPLOAD_LIMIT=15

## List of external GRPC backends (note: on the container image this variable is already set to use the extra backends available in extra/)
# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
# LOCALAI_EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
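Each entry in the list is `name:target`, where the target is either a `host:port` of an already-running gRPC backend or a path to a script LocalAI should launch itself. A sketch based on the format shown above (names are illustrative):

```bash
# Point LocalAI at an out-of-tree backend listening on TCP port 9000:
LOCALAI_EXTERNAL_GRPC_BACKENDS="my-backend:127.0.0.1:9000" ./local-ai

# A model config can then select it with `backend: my-backend`.
```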

### Advanced settings ###
### These are not used by LocalAI itself, but by components in the stack ###
@@ -72,18 +72,18 @@
# LLAMACPP_PARALLEL=1

### Enable to run parallel requests
# PARALLEL_REQUESTS=true
# LOCALAI_PARALLEL_REQUESTS=true

### Watchdog settings
###
# Enables watchdog to kill backends that are inactive for too long
# WATCHDOG_IDLE=true
#
# Enables watchdog to kill backends that are busy for too long
# WATCHDOG_BUSY=true
# LOCALAI_WATCHDOG_IDLE=true
#
# Time in duration format (e.g. 1h30m) after which a backend is considered idle
# WATCHDOG_IDLE_TIMEOUT=5m
# LOCALAI_WATCHDOG_IDLE_TIMEOUT=5m
#
# Enables watchdog to kill backends that are busy for too long
# LOCALAI_WATCHDOG_BUSY=true
#
# Time in duration format (e.g. 1h30m) after which a backend is considered busy
# WATCHDOG_BUSY_TIMEOUT=5m
# LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
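Putting the watchdog variables together: the timeouts use Go's duration syntax (`5m`, `1h30m`). A minimal sketch using the `LOCALAI_`-prefixed names introduced in this diff:

```bash
# Kill a backend that idles for 15 minutes or stays busy for over an hour.
LOCALAI_WATCHDOG_IDLE=true \
LOCALAI_WATCHDOG_IDLE_TIMEOUT=15m \
LOCALAI_WATCHDOG_BUSY=true \
LOCALAI_WATCHDOG_BUSY_TIMEOUT=1h \
./local-ai
```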
.github/dependabot.yml (new file, 25)
@@ -0,0 +1,25 @@
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
version: 2
updates:
  - package-ecosystem: "gomod"
    directory: "/"
    schedule:
      interval: "weekly"
  - package-ecosystem: "github-actions"
    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
    directory: "/"
    schedule:
      # Check for updates to GitHub Actions every week
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"
  - package-ecosystem: "docker"
    directory: "/"
    schedule:
      interval: "weekly"
.github/workflows/bump_deps.yaml (2)
@@ -49,7 +49,7 @@ jobs:
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
.github/workflows/bump_docs.yaml (2)
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
.github/workflows/dependabot_auto.yml (new file, 43)
@@ -0,0 +1,43 @@
name: Dependabot auto-merge
on:
  - pull_request_target

permissions:
  contents: write
  pull-requests: write
  packages: read

jobs:
  dependabot:
    runs-on: ubuntu-latest
    if: ${{ github.actor == 'dependabot[bot]' }}
    steps:
      - name: Dependabot metadata
        id: metadata
        uses: dependabot/fetch-metadata@v2.0.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true

      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Approve a PR if not already approved
        run: |
          gh pr checkout "$PR_URL"
          if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
          then
            gh pr review --approve "$PR_URL"
          else
            echo "PR already approved.";
          fi
        env:
          PR_URL: ${{github.event.pull_request.html_url}}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}

      - name: Enable auto-merge for Dependabot PRs
        if: ${{ contains(github.event.pull_request.title, 'bump')}}
        run: gh pr merge --auto --squash "$PR_URL"
        env:
          PR_URL: ${{github.event.pull_request.html_url}}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
.github/workflows/localaibot_automerge.yml (new file, 35)
@@ -0,0 +1,35 @@
name: LocalAI-bot auto-merge
on:
  - pull_request_target

permissions:
  contents: write
  pull-requests: write
  packages: read

jobs:
  dependabot:
    runs-on: ubuntu-latest
    if: ${{ github.actor == 'localai-bot' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Approve a PR if not already approved
        run: |
          gh pr checkout "$PR_URL"
          if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
          then
            gh pr review --approve "$PR_URL"
          else
            echo "PR already approved.";
          fi
        env:
          PR_URL: ${{github.event.pull_request.html_url}}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}

      - name: Enable auto-merge for LocalAIBot PRs
        run: gh pr merge --auto --squash "$PR_URL"
        env:
          PR_URL: ${{github.event.pull_request.html_url}}
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
.github/workflows/release.yaml (92)
@@ -1,6 +1,8 @@
name: Build and Release

on: push
on:
  - push
  - pull_request

env:
  GRPC_VERSION: v1.58.0
@@ -33,14 +35,14 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          sudo apt-get install build-essential ffmpeg protobuf-compiler
      - name: Install CUDA Dependencies
        if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
        run: |
@@ -55,7 +57,7 @@ jobs:
          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
      - name: Cache grpc
        id: cache-grpc
        uses: actions/cache@v3
        uses: actions/cache@v4
        with:
          path: grpc
          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
@@ -75,6 +77,9 @@ jobs:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
          export PATH=$PATH:$GOPATH/bin
          if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
            export BUILD_TYPE=cublas
            export PATH=/usr/local/cuda/bin:$PATH
@@ -82,12 +87,12 @@ jobs:
          else
            STATIC=true make dist
          fi
      - uses: actions/upload-artifact@v3
      - uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.build }}
          name: LocalAI-linux-${{ matrix.build }}
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v1
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
@@ -100,27 +105,24 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get install -y --no-install-recommends libopencv-dev
          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build stablediffusion
        run: |
          export PATH=$PATH:$GOPATH/bin
          make backend-assets/grpc/stablediffusion
          mkdir -p release && cp backend-assets/grpc/stablediffusion release
      - uses: actions/upload-artifact@v3
      - uses: actions/upload-artifact@v4
        with:
          name: stablediffusion
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v1
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
            release/*

  build-macOS:
    strategy:
@@ -138,13 +140,15 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          brew install protobuf grpc
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build
        id: build
        env:
@@ -153,13 +157,61 @@ jobs:
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
          make dist
      - uses: actions/upload-artifact@v3
      - uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.build }}
          name: LocalAI-MacOS-${{ matrix.build }}
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v1
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
            release/*


  build-macOS-arm64:
    strategy:
      matrix:
        include:
          - build: 'avx2'
            defines: ''
          - build: 'avx'
            defines: '-DLLAMA_AVX2=OFF'
          - build: 'avx512'
            defines: '-DLLAMA_AVX512=ON'
    runs-on: macos-14
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          brew install protobuf grpc
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
      - name: Build
        id: build
        env:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
          make dist
      - uses: actions/upload-artifact@v4
        with:
          name: LocalAI-MacOS-arm64-${{ matrix.build }}
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
.github/workflows/secscan.yaml (9)
@@ -14,14 +14,17 @@ jobs:
      GO111MODULE: on
    steps:
      - name: Checkout Source
        uses: actions/checkout@v3
        uses: actions/checkout@v4
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
        uses: securego/gosec@master
        with:
          # we let the report content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
      - name: Upload SARIF file
        uses: github/codeql-action/upload-sarif@v2
        if: ${{ github.actor != 'dependabot[bot]' }}
        uses: github/codeql-action/upload-sarif@v3
        with:
          # Path to SARIF file relative to the root of the repository
          sarif_file: results.sarif
          sarif_file: results.sarif
.github/workflows/test-extra.yml (58)
@@ -32,8 +32,9 @@ jobs:
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools

          sudo rm -rfv /usr/bin/conda || true

@@ -61,8 +62,9 @@ jobs:
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools

          sudo rm -rfv /usr/bin/conda || true

@@ -90,8 +92,9 @@ jobs:
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools

          sudo rm -rfv /usr/bin/conda || true

@@ -101,6 +104,35 @@ jobs:
          make --jobs=5 --output-sync=target -C backend/python/diffusers
          make --jobs=5 --output-sync=target -C backend/python/diffusers test

  tests-parler-tts:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
          sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools

          sudo rm -rfv /usr/bin/conda || true

      - name: Test parler-tts
        run: |
          export PATH=$PATH:/opt/conda/bin
          make --jobs=5 --output-sync=target -C backend/python/parler-tts
          make --jobs=5 --output-sync=target -C backend/python/parler-tts test

  tests-transformers-musicgen:
    runs-on: ubuntu-latest
@@ -120,8 +152,9 @@ jobs:
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools

          sudo rm -rfv /usr/bin/conda || true

@@ -151,8 +184,9 @@ jobs:
          # sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          # sudo apt-get update && \
          # sudo apt-get install -y conda
          # sudo apt-get install -y ca-certificates cmake curl patch
          # sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          # sudo apt-get install -y libopencv-dev
          # pip install --user grpcio-tools

          # sudo rm -rfv /usr/bin/conda || true

@@ -222,8 +256,9 @@ jobs:
          # sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          # sudo apt-get update && \
          # sudo apt-get install -y conda
          # sudo apt-get install -y ca-certificates cmake curl patch
          # sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          # sudo apt-get install -y libopencv-dev
          # pip install --user grpcio-tools

          # sudo rm -rfv /usr/bin/conda || true

@@ -254,8 +289,9 @@ jobs:
          # sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          # sudo apt-get update && \
          # sudo apt-get install -y conda
          # sudo apt-get install -y ca-certificates cmake curl patch
          # sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          # sudo apt-get install -y libopencv-dev
          # pip install --user grpcio-tools
          # sudo rm -rfv /usr/bin/conda || true
      # - name: Test vllm
      #   run: |
@@ -280,8 +316,9 @@ jobs:
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y libopencv-dev
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true
      - name: Test vall-e-x
        run: |
@@ -307,7 +344,8 @@ jobs:
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng
          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
          pip install --user grpcio-tools
          sudo rm -rfv /usr/bin/conda || true

      - name: Test coqui
.github/workflows/test.yml (46)
@@ -60,7 +60,7 @@ jobs:
        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
        uses: actions/setup-go@v4
        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
@@ -70,17 +70,27 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          sudo apt-get install build-essential curl ffmpeg
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
          sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
          sudo apt-get update && \
          sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
          sudo apt-get install -y libopencv-dev


          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
          rm protoc.zip

          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest

          # The python3-grpc-tools package in 22.04 is too old
          pip install --user grpcio-tools

          sudo rm -rfv /usr/bin/conda || true
          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers

@@ -89,10 +99,10 @@ jobs:
          GO_TAGS="tts" make -C sources/go-piper piper.o && \
          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
      - name: Cache grpc
        id: cache-grpc
        uses: actions/cache@v3
        uses: actions/cache@v4
        with:
          path: grpc
          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
@@ -108,11 +118,12 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
          GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3
        timeout-minutes: 5
        uses: mxschmitt/action-tmate@v3.18
        with:
          connect-timeout-seconds: 180

  tests-aio-container:
    runs-on: ubuntu-latest
@@ -163,8 +174,9 @@ jobs:
          make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3
        timeout-minutes: 5
        uses: mxschmitt/action-tmate@v3.18
        with:
          connect-timeout-seconds: 180

  tests-apple:
    runs-on: macOS-14
@@ -177,7 +189,7 @@ jobs:
        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
        uses: actions/setup-go@v4
        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
          cache: false
@@ -186,7 +198,8 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
          brew install protobuf grpc make
          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
          pip install --user grpcio-tools
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
@@ -196,5 +209,6 @@ jobs:
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3
        timeout-minutes: 5
        uses: mxschmitt/action-tmate@v3.18
        with:
          connect-timeout-seconds: 180
.gitignore (5)
@@ -39,3 +39,8 @@ backend-assets/*
!backend-assets/.keep
prepare
/ggml-metal.metal

# Protobuf generated files
*.pb.go
*pb2.py
*pb2_grpc.py
CONTRIBUTING.md
@@ -1,4 +1,4 @@
# Contributing to localAI
# Contributing to LocalAI

Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.

@@ -29,8 +29,9 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time

1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
2. Navigate to the project directory: `cd LocalAI`
3. Install the required dependencies: `make prepare`
4. Run LocalAI: `make run`
3. Install the required dependencies (see https://localai.io/basics/build/#build-localai-locally)
4. Build LocalAI: `make build`
5. Run LocalAI: `./local-ai`
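Once built, the binary can be started directly. A minimal sketch — the flags are illustrative; check `./local-ai --help` for the current set:

```bash
# Serve models from ./models on port 8080:
./local-ai --models-path ./models --address ":8080"

# Smoke-test the OpenAI-compatible API:
curl http://localhost:8080/v1/models
```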

## Contributing

@@ -59,14 +60,29 @@ If you find a bug, have a feature request, or encounter any issues, please check

`make test` cannot exercise every model yet. Please be sure to add a test case for any new feature or changed code path.

### Running AIO tests

All-In-One images have a set of tests that automatically verify that most of the endpoints work correctly. A typical flow is:

```bash
# Build the LocalAI docker image
make DOCKER_IMAGE=local-ai docker

# Build the corresponding AIO image
BASE_IMAGE=local-ai DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio

# Run the AIO e2e tests
LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio make run-e2e-aio
```

## Documentation

- We welcome contributions to the documentation; please open a new PR in the official documentation repo [localai-website](https://github.com/go-skynet/localai-website)

We welcome contributions to the documentation; please open a new PR or create a new issue. The documentation lives under `docs/`: https://github.com/mudler/LocalAI/tree/master/docs

## Community and Communication

- You can reach out via the GitHub issue tracker.
- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)

---
---
Dockerfile (38)
@@ -15,17 +15,30 @@ ARG TARGETVARIANT

ENV BUILD_TYPE=${BUILD_TYPE}
ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"

ARG GO_TAGS="stablediffusion tinydream tts"

RUN apt-get update && \
    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean
    apt-get install -y ca-certificates curl python3-pip unzip && apt-get clean

# Install Go
RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
ENV PATH $PATH:/usr/local/go/bin

# Install grpc compilers
ENV PATH $PATH:/root/go/bin
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest && \
    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest

# Install protobuf (the version in 22.04 is too old)
RUN curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
    unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
    rm protoc.zip

# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools

COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates

@@ -68,7 +81,8 @@ RUN test -n "$TARGETARCH" \

FROM requirements-core as requirements-extras

RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
RUN apt install -y gpg && \
    curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
@@ -100,7 +114,7 @@ ENV MAKEFLAGS=${MAKEFLAGS}
WORKDIR /build

RUN apt-get update && \
    apt-get install -y g++ cmake git && \
    apt-get install -y build-essential cmake git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

@@ -133,6 +147,12 @@ WORKDIR /build
COPY . .
COPY .git .
RUN echo "GO_TAGS: $GO_TAGS"

RUN apt-get update && \
    apt-get install -y build-essential cmake git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN make prepare

# If we are building with clblas support, we need the libraries for the builds
@@ -191,6 +211,11 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
    apt-get clean \
    ; fi

RUN apt-get update && \
    apt-get install -y cmake git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /build

# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
@@ -202,7 +227,7 @@ COPY . .
COPY --from=builder /build/sources ./sources/
COPY --from=grpc /build/grpc ./grpc/

RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf /build/grpc

# Copy the binary
COPY --from=builder /build/local-ai ./
@@ -250,6 +275,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/transformers-musicgen \
    ; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/parler-tts \
    ; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/coqui \
    ; fi
Makefile (175)
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=1b67731e184e27a465b8c5476061294a4af668ea
CPPLLAMA_VERSION?=7593639ce335e8d7f89aa9a54d616951f273af60

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

# whisper.cpp version
WHISPER_CPP_VERSION?=8f253ef3af1c62c04316ba4afa7145fc4d701a8c
WHISPER_CPP_VERSION?=b0c3cbf2e851cf232e432b590dcc514a689ec028

# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
@@ -289,16 +289,21 @@ clean: ## Remove build related file
	rm -rf ./sources
	rm -rf $(BINARY_NAME)
	rm -rf release/
	rm -rf backend-assets
	rm -rf backend-assets/*
	$(MAKE) -C backend/cpp/grpc clean
	$(MAKE) -C backend/cpp/llama clean
	$(MAKE) dropreplace
	$(MAKE) protogen-clean
	rmdir pkg/grpc/proto || true

clean-tests:
	rm -rf test-models
	rm -rf test-dir
	rm -rf core/http/backend-assets

halt-backends: ## Used to clean up stray backends sometimes left running when debugging manually
	ps | grep 'backend-assets/grpc/' | awk '{print $$1}' | xargs -I {} kill -9 {}

## Build:
build: prepare backend-assets grpcs ## Build the project
	$(info ${GREEN}I local-ai build info:${RESET})
@@ -363,13 +368,13 @@ run-e2e-image:

run-e2e-aio:
	@echo 'Running e2e AIO tests'
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio

test-e2e:
	@echo 'Running e2e tests'
	BUILD_TYPE=$(BUILD_TYPE) \
	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e

teardown-e2e:
	rm -rf $(TEST_DIR) || true
@@ -377,15 +382,15 @@ teardown-e2e:

test-gpt4all: prepare-test
	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-llama: prepare-test
	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-llama-gguf: prepare-test
	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-tts: prepare-test
	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
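The hardcoded `--flake-attempts 5` above is replaced by the `$(TEST_FLAKES)` variable, so the Ginkgo retry count can now be set per invocation. A sketch — the variable's default is defined elsewhere in the Makefile and not shown in this diff:

```bash
# Fail fast locally:
make test-llama-gguf TEST_FLAKES=1

# Reproduce the old behaviour:
make run-e2e-aio TEST_FLAKES=5
```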
@@ -416,30 +421,144 @@ help: ## Show this help.
	else if (/^## .*$$/) {printf "  ${CYAN}%s${RESET}\n", substr($$1,4)} \
	}' $(MAKEFILE_LIST)

.PHONY: protogen
protogen: protogen-go protogen-python

.PHONY: protogen-clean
protogen-clean: protogen-go-clean protogen-python-clean

.PHONY: protogen-go
protogen-go:
	mkdir -p pkg/grpc/proto
	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
	backend/backend.proto

protogen-python:
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/coqui/ --grpc_python_out=backend/python/coqui/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/mamba/ --grpc_python_out=backend/python/mamba/ backend/backend.proto
	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto
.PHONY: protogen-go-clean
protogen-go-clean:
	$(RM) pkg/grpc/proto/backend.pb.go pkg/grpc/proto/backend_grpc.pb.go
	$(RM) bin/*

.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen

.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean

.PHONY: autogptq-protogen
autogptq-protogen:
	$(MAKE) -C backend/python/autogptq protogen

.PHONY: autogptq-protogen-clean
autogptq-protogen-clean:
	$(MAKE) -C backend/python/autogptq protogen-clean

.PHONY: bark-protogen
bark-protogen:
	$(MAKE) -C backend/python/bark protogen

.PHONY: bark-protogen-clean
bark-protogen-clean:
	$(MAKE) -C backend/python/bark protogen-clean

.PHONY: coqui-protogen
coqui-protogen:
	$(MAKE) -C backend/python/coqui protogen

.PHONY: coqui-protogen-clean
coqui-protogen-clean:
	$(MAKE) -C backend/python/coqui protogen-clean

.PHONY: diffusers-protogen
diffusers-protogen:
	$(MAKE) -C backend/python/diffusers protogen

.PHONY: diffusers-protogen-clean
diffusers-protogen-clean:
	$(MAKE) -C backend/python/diffusers protogen-clean

.PHONY: exllama-protogen
exllama-protogen:
	$(MAKE) -C backend/python/exllama protogen

.PHONY: exllama-protogen-clean
exllama-protogen-clean:
	$(MAKE) -C backend/python/exllama protogen-clean

.PHONY: exllama2-protogen
exllama2-protogen:
	$(MAKE) -C backend/python/exllama2 protogen

.PHONY: exllama2-protogen-clean
exllama2-protogen-clean:
	$(MAKE) -C backend/python/exllama2 protogen-clean

.PHONY: mamba-protogen
mamba-protogen:
	$(MAKE) -C backend/python/mamba protogen

.PHONY: mamba-protogen-clean
mamba-protogen-clean:
	$(MAKE) -C backend/python/mamba protogen-clean

.PHONY: petals-protogen
petals-protogen:
	$(MAKE) -C backend/python/petals protogen

.PHONY: petals-protogen-clean
petals-protogen-clean:
	$(MAKE) -C backend/python/petals protogen-clean

.PHONY: sentencetransformers-protogen
sentencetransformers-protogen:
	$(MAKE) -C backend/python/sentencetransformers protogen

.PHONY: sentencetransformers-protogen-clean
sentencetransformers-protogen-clean:
	$(MAKE) -C backend/python/sentencetransformers protogen-clean

.PHONY: transformers-protogen
transformers-protogen:
	$(MAKE) -C backend/python/transformers protogen

.PHONY: transformers-protogen-clean
transformers-protogen-clean:
	$(MAKE) -C backend/python/transformers protogen-clean

.PHONY: parler-tts-protogen
parler-tts-protogen:
	$(MAKE) -C backend/python/parler-tts protogen

.PHONY: parler-tts-protogen-clean
parler-tts-protogen-clean:
	$(MAKE) -C backend/python/parler-tts protogen-clean

.PHONY: transformers-musicgen-protogen
transformers-musicgen-protogen:
	$(MAKE) -C backend/python/transformers-musicgen protogen

.PHONY: transformers-musicgen-protogen-clean
transformers-musicgen-protogen-clean:
	$(MAKE) -C backend/python/transformers-musicgen protogen-clean

.PHONY: vall-e-x-protogen
vall-e-x-protogen:
	$(MAKE) -C backend/python/vall-e-x protogen

.PHONY: vall-e-x-protogen-clean
vall-e-x-protogen-clean:
	$(MAKE) -C backend/python/vall-e-x protogen-clean

.PHONY: vllm-protogen
vllm-protogen:
	$(MAKE) -C backend/python/vllm protogen

.PHONY: vllm-protogen-clean
vllm-protogen-clean:
	$(MAKE) -C backend/python/vllm protogen-clean

## GRPC
# Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments:
prepare-extra-conda-environments: protogen-python
	$(MAKE) -C backend/python/autogptq
	$(MAKE) -C backend/python/bark
	$(MAKE) -C backend/python/coqui
@@ -449,12 +568,13 @@ prepare-extra-conda-environments:
	$(MAKE) -C backend/python/sentencetransformers
	$(MAKE) -C backend/python/transformers
	$(MAKE) -C backend/python/transformers-musicgen
	$(MAKE) -C backend/python/parler-tts
	$(MAKE) -C backend/python/vall-e-x
	$(MAKE) -C backend/python/exllama
	$(MAKE) -C backend/python/petals
	$(MAKE) -C backend/python/exllama2

prepare-test-extra:
prepare-test-extra: protogen-python
	$(MAKE) -C backend/python/transformers
	$(MAKE) -C backend/python/diffusers

@@ -478,7 +598,7 @@ backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/
	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

backend-assets/grpc: replace
backend-assets/grpc: protogen-go replace
	mkdir -p backend-assets/grpc

backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
@@ -528,7 +648,10 @@ backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libb
	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/

# EXPERIMENTAL:
ifeq ($(BUILD_TYPE),metal)
	cp $(CURDIR)/sources/go-llama-ggml/llama.cpp/ggml-metal.metal backend-assets/grpc/
endif
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
README.md
@@ -50,17 +50,12 @@

[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

- Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
- Landing page: https://github.com/mudler/LocalAI/pull/1922
- Openvino support: https://github.com/mudler/LocalAI/pull/1892
- Vector store: https://github.com/mudler/LocalAI/pull/1795
- All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726 / Tools API support: https://github.com/mudler/LocalAI/pull/1715
- Upload file API: https://github.com/mudler/LocalAI/pull/1703
- ROCm container images: https://github.com/mudler/LocalAI/pull/1595 / Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
- Mamba support: https://github.com/mudler/LocalAI/pull/1589
- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
- Img2vid: https://github.com/mudler/LocalAI/pull/1442

Hot topics (looking for contributors):
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
backend/backend.proto
@@ -107,11 +107,15 @@ message PredictOptions {
  string NegativePrompt = 40;
  int32 NDraft = 41;
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
}

// The response message containing the result
message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
}

message ModelOptions {
@@ -256,3 +260,8 @@ message StatusResponse {
  State state = 1;
  MemoryUsageData memory = 2;
}

message Message {
  string role = 1;
  string content = 2;
}
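The new `UseTokenizerTemplate` and `Messages` fields let a caller pass chat-style input straight to a backend's `Predict` RPC. A hedged sketch of exercising them with grpcurl against a backend on a local port (address and payload are illustrative):

```bash
# grpcurl needs the proto definition unless the server exposes reflection.
grpcurl -plaintext \
  -import-path backend -proto backend.proto \
  -d '{"UseTokenizerTemplate": true, "Messages": [{"role": "user", "content": "Hello!"}]}' \
  127.0.0.1:9000 backend.Backend/Predict
```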
pkg/grpc/proto/backend_grpc.pb.go (deleted)
@@ -1,457 +0,0 @@
// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
// versions:
// - protoc-gen-go-grpc v1.2.0
// - protoc v4.23.4
// source: backend/backend.proto

package proto

import (
	context "context"
	grpc "google.golang.org/grpc"
	codes "google.golang.org/grpc/codes"
	status "google.golang.org/grpc/status"
)

// This is a compile-time assertion to ensure that this generated file
// is compatible with the grpc package it is being compiled against.
// Requires gRPC-Go v1.32.0 or later.
const _ = grpc.SupportPackageIsVersion7

// BackendClient is the client API for Backend service.
//
// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
type BackendClient interface {
	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error)
	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
	GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error)
	AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error)
	TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error)
	TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error)
	Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error)
}

type backendClient struct {
	cc grpc.ClientConnInterface
}

func NewBackendClient(cc grpc.ClientConnInterface) BackendClient {
	return &backendClient{cc}
}

func (c *backendClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
	out := new(Reply)
	err := c.cc.Invoke(ctx, "/backend.Backend/Health", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *backendClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
	out := new(Reply)
	err := c.cc.Invoke(ctx, "/backend.Backend/Predict", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *backendClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
	out := new(Result)
	err := c.cc.Invoke(ctx, "/backend.Backend/LoadModel", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *backendClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error) {
	stream, err := c.cc.NewStream(ctx, &Backend_ServiceDesc.Streams[0], "/backend.Backend/PredictStream", opts...)
	if err != nil {
		return nil, err
	}
	x := &backendPredictStreamClient{stream}
	if err := x.ClientStream.SendMsg(in); err != nil {
		return nil, err
	}
	if err := x.ClientStream.CloseSend(); err != nil {
		return nil, err
	}
	return x, nil
}

type Backend_PredictStreamClient interface {
	Recv() (*Reply, error)
	grpc.ClientStream
}

type backendPredictStreamClient struct {
	grpc.ClientStream
}

func (x *backendPredictStreamClient) Recv() (*Reply, error) {
	m := new(Reply)
	if err := x.ClientStream.RecvMsg(m); err != nil {
		return nil, err
	}
	return m, nil
}

func (c *backendClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
	out := new(EmbeddingResult)
	err := c.cc.Invoke(ctx, "/backend.Backend/Embedding", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *backendClient) GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error) {
	out := new(Result)
	err := c.cc.Invoke(ctx, "/backend.Backend/GenerateImage", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *backendClient) AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error) {
	out := new(TranscriptResult)
	err := c.cc.Invoke(ctx, "/backend.Backend/AudioTranscription", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *backendClient) TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error) {
	out := new(Result)
	err := c.cc.Invoke(ctx, "/backend.Backend/TTS", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *backendClient) TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error) {
	out := new(TokenizationResponse)
	err := c.cc.Invoke(ctx, "/backend.Backend/TokenizeString", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *backendClient) Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error) {
	out := new(StatusResponse)
	err := c.cc.Invoke(ctx, "/backend.Backend/Status", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

// BackendServer is the server API for Backend service.
// All implementations must embed UnimplementedBackendServer
// for forward compatibility
type BackendServer interface {
	Health(context.Context, *HealthMessage) (*Reply, error)
	Predict(context.Context, *PredictOptions) (*Reply, error)
	LoadModel(context.Context, *ModelOptions) (*Result, error)
	PredictStream(*PredictOptions, Backend_PredictStreamServer) error
	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
	GenerateImage(context.Context, *GenerateImageRequest) (*Result, error)
	AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error)
	TTS(context.Context, *TTSRequest) (*Result, error)
	TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error)
	Status(context.Context, *HealthMessage) (*StatusResponse, error)
	mustEmbedUnimplementedBackendServer()
}

// UnimplementedBackendServer must be embedded to have forward compatible implementations.
type UnimplementedBackendServer struct {
}

func (UnimplementedBackendServer) Health(context.Context, *HealthMessage) (*Reply, error) {
	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
}
func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) PredictStream(*PredictOptions, Backend_PredictStreamServer) error {
|
||||
return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) GenerateImage(context.Context, *GenerateImageRequest) (*Result, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method GenerateImage not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method AudioTranscription not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) TTS(context.Context, *TTSRequest) (*Result, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method TTS not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method TokenizeString not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) Status(context.Context, *HealthMessage) (*StatusResponse, error) {
|
||||
return nil, status.Errorf(codes.Unimplemented, "method Status not implemented")
|
||||
}
|
||||
func (UnimplementedBackendServer) mustEmbedUnimplementedBackendServer() {}
|
||||
|
||||
// UnsafeBackendServer may be embedded to opt out of forward compatibility for this service.
|
||||
// Use of this interface is not recommended, as added methods to BackendServer will
|
||||
// result in compilation errors.
|
||||
type UnsafeBackendServer interface {
|
||||
mustEmbedUnimplementedBackendServer()
|
||||
}
|
||||
|
||||
func RegisterBackendServer(s grpc.ServiceRegistrar, srv BackendServer) {
|
||||
s.RegisterService(&Backend_ServiceDesc, srv)
|
||||
}
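
For orientation, here is a minimal sketch of a backend that implements the generated service and registers it through RegisterBackendServer. The pb import path, the listen address, and the Message field on Reply are illustrative assumptions (the C++ server further down calls reply.set_message, which suggests a bytes message field), not guarantees from the generated code itself.

package main

import (
	"context"
	"log"
	"net"

	"google.golang.org/grpc"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto" // assumed import path for the generated package
)

// healthOnlyBackend embeds UnimplementedBackendServer, so every RPC that is
// not overridden here answers codes.Unimplemented instead of breaking the build.
type healthOnlyBackend struct {
	pb.UnimplementedBackendServer
}

func (healthOnlyBackend) Health(ctx context.Context, in *pb.HealthMessage) (*pb.Reply, error) {
	// Assumption: Reply carries a bytes message field, as set_message suggests.
	return &pb.Reply{Message: []byte("OK")}, nil
}

func main() {
	lis, err := net.Listen("tcp", "127.0.0.1:50051") // example address
	if err != nil {
		log.Fatal(err)
	}
	s := grpc.NewServer()
	pb.RegisterBackendServer(s, healthOnlyBackend{})
	log.Fatal(s.Serve(lis))
}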

func _Backend_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(HealthMessage)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).Health(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/Health",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).Health(ctx, req.(*HealthMessage))
	}
	return interceptor(ctx, in, info, handler)
}

func _Backend_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(PredictOptions)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).Predict(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/Predict",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).Predict(ctx, req.(*PredictOptions))
	}
	return interceptor(ctx, in, info, handler)
}

func _Backend_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(ModelOptions)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).LoadModel(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/LoadModel",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).LoadModel(ctx, req.(*ModelOptions))
	}
	return interceptor(ctx, in, info, handler)
}

func _Backend_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
	m := new(PredictOptions)
	if err := stream.RecvMsg(m); err != nil {
		return err
	}
	return srv.(BackendServer).PredictStream(m, &backendPredictStreamServer{stream})
}

type Backend_PredictStreamServer interface {
	Send(*Reply) error
	grpc.ServerStream
}

type backendPredictStreamServer struct {
	grpc.ServerStream
}

func (x *backendPredictStreamServer) Send(m *Reply) error {
	return x.ServerStream.SendMsg(m)
}

func _Backend_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(PredictOptions)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).Embedding(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/Embedding",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).Embedding(ctx, req.(*PredictOptions))
	}
	return interceptor(ctx, in, info, handler)
}

func _Backend_GenerateImage_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(GenerateImageRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).GenerateImage(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/GenerateImage",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).GenerateImage(ctx, req.(*GenerateImageRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _Backend_AudioTranscription_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(TranscriptRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).AudioTranscription(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/AudioTranscription",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).AudioTranscription(ctx, req.(*TranscriptRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _Backend_TTS_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(TTSRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).TTS(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/TTS",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).TTS(ctx, req.(*TTSRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _Backend_TokenizeString_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(PredictOptions)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).TokenizeString(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/TokenizeString",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).TokenizeString(ctx, req.(*PredictOptions))
	}
	return interceptor(ctx, in, info, handler)
}

func _Backend_Status_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(HealthMessage)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(BackendServer).Status(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/backend.Backend/Status",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(BackendServer).Status(ctx, req.(*HealthMessage))
	}
	return interceptor(ctx, in, info, handler)
}

// Backend_ServiceDesc is the grpc.ServiceDesc for Backend service.
// It's only intended for direct use with grpc.RegisterService,
// and not to be introspected or modified (even as a copy)
var Backend_ServiceDesc = grpc.ServiceDesc{
	ServiceName: "backend.Backend",
	HandlerType: (*BackendServer)(nil),
	Methods: []grpc.MethodDesc{
		{
			MethodName: "Health",
			Handler:    _Backend_Health_Handler,
		},
		{
			MethodName: "Predict",
			Handler:    _Backend_Predict_Handler,
		},
		{
			MethodName: "LoadModel",
			Handler:    _Backend_LoadModel_Handler,
		},
		{
			MethodName: "Embedding",
			Handler:    _Backend_Embedding_Handler,
		},
		{
			MethodName: "GenerateImage",
			Handler:    _Backend_GenerateImage_Handler,
		},
		{
			MethodName: "AudioTranscription",
			Handler:    _Backend_AudioTranscription_Handler,
		},
		{
			MethodName: "TTS",
			Handler:    _Backend_TTS_Handler,
		},
		{
			MethodName: "TokenizeString",
			Handler:    _Backend_TokenizeString_Handler,
		},
		{
			MethodName: "Status",
			Handler:    _Backend_Status_Handler,
		},
	},
	Streams: []grpc.StreamDesc{
		{
			StreamName:    "PredictStream",
			Handler:       _Backend_PredictStream_Handler,
			ServerStreams: true,
		},
	},
	Metadata: "backend/backend.proto",
}
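
A hedged client-side counterpart: dialing a backend and exercising one unary RPC plus the server-streaming PredictStream, whose Recv loop terminates with io.EOF. The pb import path, the address, and the empty request messages are illustrative assumptions; a real caller would populate PredictOptions.

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"log"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto" // assumed import path for the generated package
)

func main() {
	conn, err := grpc.Dial("127.0.0.1:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	client := pb.NewBackendClient(conn)
	ctx := context.Background()

	// Unary call: Health takes a HealthMessage and returns a Reply.
	if _, err := client.Health(ctx, &pb.HealthMessage{}); err != nil {
		log.Fatal(err)
	}

	// Server-streaming call: the generated stub sends the request, closes the
	// send side, and hands back a Backend_PredictStreamClient for receiving.
	stream, err := client.PredictStream(ctx, &pb.PredictOptions{})
	if err != nil {
		log.Fatal(err)
	}
	for {
		reply, err := stream.Recv()
		if errors.Is(err, io.EOF) {
			break // server closed the stream
		}
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("chunk: %v\n", reply)
	}
}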

@@ -5,7 +5,6 @@ SYSTEM ?= $(HOST_SYSTEM)
TAG_LIB_GRPC?=v1.59.0
GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
GIT_CLONE_DEPTH?=1
-NUM_BUILD_THREADS?=$(shell nproc --ignore=1)

INSTALLED_PACKAGES=installed_packages
GRPC_REPO=grpc_repo
@@ -52,7 +51,7 @@ $(GRPC_REPO):

$(GRPC_BUILD): $(GRPC_REPO)
	mkdir -p $(GRPC_BUILD)
-	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . -- -j ${NUM_BUILD_THREADS} && cmake --build . --target install -- -j ${NUM_BUILD_THREADS}
+	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . && cmake --build . --target install

build: $(INSTALLED_PACKAGES)

@@ -2332,6 +2332,10 @@ public:
        std::string completion_text = result.result_json.value("content", "");

        reply.set_message(completion_text);
+       int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
+       reply.set_tokens(tokens_predicted);
+       int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
+       reply.set_prompt_tokens(tokens_evaluated);

        // Send the reply
        writer->Write(reply);
@@ -2357,6 +2361,10 @@ public:
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
            completion_text = result.result_json.value("content", "");
+           int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
+           int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
+           reply->set_prompt_tokens(tokens_evaluated);
+           reply->set_tokens(tokens_predicted);
            reply->set_message(completion_text);
        }
        else

@@ -21,7 +21,7 @@ func runCommand(command []string) (string, error) {
// AudioToWav converts audio to wav for transcribe.
// TODO: use https://github.com/mccoyst/ogg?
func audioToWav(src, dst string) error {
-	command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
+	command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
	out, err := runCommand(command)
	if err != nil {
		return fmt.Errorf("error: %w out: %s", err, out)
@@ -29,8 +29,8 @@ func audioToWav(src, dst string) error {
	return nil
}

-func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.Result, error) {
-	res := schema.Result{}
+func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.TranscriptionResult, error) {
+	res := schema.TranscriptionResult{}

	dir, err := os.MkdirTemp("", "whisper")
	if err != nil {

@@ -21,6 +21,6 @@ func (sd *Whisper) Load(opts *pb.ModelOptions) error {
	return err
}

-func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.Result, error) {
+func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
	return Transcript(sd.whisper, opts.Dst, opts.Language, uint(opts.Threads))
}
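
The rename above only changes the return type; the request side is untouched. A hedged sketch of driving the whisper backend through this interface — Dst, Language, and Threads are the TranscriptRequest fields the diff itself dereferences, while the caller, the file path, and the thread count are illustrative assumptions:

// Minimal sketch, assuming w is a *Whisper whose Load(...) already succeeded.
func transcribeExample(w *Whisper) (schema.TranscriptionResult, error) {
	return w.AudioTranscription(&pb.TranscriptRequest{
		Dst:      "/tmp/audio.wav", // example path; audioToWav above emits 16 kHz mono PCM
		Language: "en",
		Threads:  4,
	})
}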

@@ -1,4 +1,13 @@
.PHONY: autogptq
-autogptq:
+autogptq: protogen
	$(MAKE) -C ../common-env/transformers

+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto

@@ -39,7 +39,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                self.model_name = "Qwen-VL-Chat"
                model = AutoModelForCausalLM.from_pretrained(model_path,
                                                             trust_remote_code=request.TrustRemoteCode,
-                                                            use_triton=request.UseTriton,
                                                             device_map="auto").eval()
            else:
                model = AutoGPTQForCausalLM.from_quantized(model_path,

File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import backend_pb2 as backend__pb2


class BackendStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )


class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""

    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Backend(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@@ -1,15 +1,25 @@
.PHONY: ttsbark
-ttsbark:
+ttsbark: protogen
	$(MAKE) -C ../common-env/transformers

.PHONY: run
-run:
+run: protogen
	@echo "Running bark..."
	bash run.sh
	@echo "bark run."

.PHONY: test
-test:
+test: protogen
	@echo "Testing bark..."
	bash test.sh
	@echo "bark tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import backend_pb2 as backend__pb2


class BackendStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )


class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""

    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Backend(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@@ -2,6 +2,7 @@
set -ex

SKIP_CONDA=${SKIP_CONDA:-0}
+REQUIREMENTS_FILE=$1

# Check if environment exist
conda_env_exists(){
@@ -14,7 +15,7 @@ else
    export PATH=$PATH:/opt/conda/bin
    if conda_env_exists "transformers" ; then
        echo "Creating virtual environment..."
-        conda env create --name transformers --file $1
+        conda env create --name transformers --file $REQUIREMENTS_FILE
        echo "Virtual environment created."
    else
        echo "Virtual environment already exists."
@@ -28,11 +29,16 @@ if [ -d "/opt/intel" ]; then
    pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino]
fi

-if [ "$PIP_CACHE_PURGE" = true ] ; then
-    if [ $SKIP_CONDA -eq 0 ]; then
-        # Activate conda environment
-        source activate transformers
-    fi
+# If we didn't skip conda, activate the environment
+# to install FlashAttention
+if [ $SKIP_CONDA -eq 0 ]; then
+    source activate transformers
+fi
+if [[ $REQUIREMENTS_FILE =~ -nvidia.yml$ ]]; then
+    #TODO: FlashAttention is supported on nvidia and ROCm, but ROCm install can't be done this easily
+    pip install flash-attn --no-build-isolation
+fi
+
if [ "$PIP_CACHE_PURGE" = true ] ; then
    pip cache purge
fi
@@ -116,7 +116,7 @@ dependencies:
  - sudachipy
  - sudachidict_core
  - vocos
-  - vllm==0.3.2
+  - vllm>=0.4.0
  - transformers>=4.38.2 # Updated Version
  - transformers_stream_generator==0.0.5
  - xformers==0.0.23.post1

@@ -104,7 +104,7 @@ dependencies:
  - sudachipy
  - sudachidict_core
  - vocos
-  - vllm==0.3.2
+  - vllm>=0.4.0
  - transformers>=4.38.2 # Updated Version
  - transformers_stream_generator==0.0.5
  - xformers==0.0.23.post1

@@ -108,7 +108,7 @@ dependencies:
  - sudachipy
  - sudachidict_core
  - vocos
-  - vllm==0.3.2
+  - vllm>=0.4.0
  - transformers>=4.38.2 # Updated Version
  - transformers_stream_generator==0.0.5
  - xformers==0.0.23.post1

@@ -1,15 +1,25 @@
.PHONY: coqui
-coqui:
+coqui: protogen
	$(MAKE) -C ../common-env/transformers

.PHONY: run
-run:
+run: protogen
	@echo "Running coqui..."
	bash run.sh
	@echo "coqui run."

.PHONY: test
-test:
+test: protogen
	@echo "Testing coqui..."
	bash test.sh
	@echo "coqui tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import backend_pb2 as backend__pb2


class BackendStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )


class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""

    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Backend(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
|
||||
backend__pb2.HealthMessage.SerializeToString,
|
||||
backend__pb2.Reply.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def Predict(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
|
||||
backend__pb2.PredictOptions.SerializeToString,
|
||||
backend__pb2.Reply.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def LoadModel(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
|
||||
backend__pb2.ModelOptions.SerializeToString,
|
||||
backend__pb2.Result.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def PredictStream(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
|
||||
backend__pb2.PredictOptions.SerializeToString,
|
||||
backend__pb2.Reply.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def Embedding(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
|
||||
backend__pb2.PredictOptions.SerializeToString,
|
||||
backend__pb2.EmbeddingResult.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def GenerateImage(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
|
||||
backend__pb2.GenerateImageRequest.SerializeToString,
|
||||
backend__pb2.Result.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def AudioTranscription(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
|
||||
backend__pb2.TranscriptRequest.SerializeToString,
|
||||
backend__pb2.TranscriptResult.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def TTS(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
|
||||
backend__pb2.TTSRequest.SerializeToString,
|
||||
backend__pb2.Result.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def TokenizeString(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
|
||||
backend__pb2.PredictOptions.SerializeToString,
|
||||
backend__pb2.TokenizationResponse.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def Status(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
|
||||
backend__pb2.HealthMessage.SerializeToString,
|
||||
backend__pb2.StatusResponse.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
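For reference, the deleted stub files above are plain grpc_tools output: a backend implements the service by subclassing BackendServicer and registering it on a server with add_BackendServicer_to_server. A minimal server-side sketch, assuming the stubs have been regenerated locally; the address, worker count, Health implementation, and the bytes message field on Reply are illustrative assumptions, not taken from this diff:

from concurrent import futures

import grpc

import backend_pb2
import backend_pb2_grpc


class MyBackend(backend_pb2_grpc.BackendServicer):
    # Override only the RPCs this backend supports; the rest keep the
    # generated UNIMPLEMENTED behaviour.
    def Health(self, request, context):
        return backend_pb2.Reply(message=b"OK")  # assumes Reply carries a bytes message field


def serve(address="localhost:50051"):  # illustrative address
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    backend_pb2_grpc.add_BackendServicer_to_server(MyBackend(), server)
    server.add_insecure_port(address)
    server.start()
    server.wait_for_termination()


if __name__ == "__main__":
    serve()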
@@ -12,15 +12,25 @@ export SKIP_CONDA=1
endif

.PHONY: diffusers
diffusers:
diffusers: protogen
	@echo "Installing $(CONDA_ENV_PATH)..."
	bash install.sh $(CONDA_ENV_PATH)

.PHONY: run
run:
run: protogen
	@echo "Running diffusers..."
	bash run.sh
	@echo "Diffusers run."

test:
test: protogen
	bash test.sh

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
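The protogen target added above regenerates backend_pb2.py and backend_pb2_grpc.py from backend.proto at build time, which is why the checked-in generated copies are deleted throughout this change. A minimal client-side sketch of using the regenerated stub, assuming a backend is already listening; the address is an illustrative choice:

import grpc

import backend_pb2
import backend_pb2_grpc


def check_health(address="localhost:50051"):  # illustrative address
    with grpc.insecure_channel(address) as channel:
        stub = backend_pb2_grpc.BackendStub(channel)
        # Health is a unary-unary RPC: HealthMessage in, Reply out,
        # exactly as declared in the generated BackendStub.
        reply = stub.Health(backend_pb2.HealthMessage())
        print(reply.message)


if __name__ == "__main__":
    check_health()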
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
@@ -1,11 +1,21 @@
export CONDA_ENV_PATH = "exllama.yml"

.PHONY: exllama
exllama:
exllama: protogen
	bash install.sh ${CONDA_ENV_PATH}

.PHONY: run
run:
run: protogen
	@echo "Running exllama..."
	bash run.sh
	@echo "exllama run."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
@@ -1,10 +1,20 @@
.PHONY: exllama2
exllama2:
exllama2: protogen
	$(MAKE) -C ../common-env/transformers
	bash install.sh

.PHONY: run
run:
run: protogen
	@echo "Running exllama2..."
	bash run.sh
	@echo "exllama2 run."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
@@ -1,16 +1,26 @@
.PHONY: mamba
mamba:
mamba: protogen
	$(MAKE) -C ../common-env/transformers
	bash install.sh

.PHONY: run
run:
run: protogen
	@echo "Running mamba..."
	bash run.sh
	@echo "mamba run."

.PHONY: test
test:
test: protogen
	@echo "Testing mamba..."
	bash test.sh
	@echo "mamba tested."
	@echo "mamba tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import backend_pb2 as backend__pb2


class BackendStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )


class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""

    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Backend(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
39  backend/python/parler-tts/Makefile  Normal file
@@ -0,0 +1,39 @@
export CONDA_ENV_PATH = "parler.yml"
SKIP_CONDA?=0
ifeq ($(BUILD_TYPE), cublas)
export CONDA_ENV_PATH = "parler-nvidia.yml"
endif

# Intel GPUs are expected to have their dependencies installed in the main
# Python environment, so we skip conda installation for SYCL builds.
# https://github.com/intel/intel-extension-for-pytorch/issues/538
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export SKIP_CONDA=1
endif

.PHONY: parler-tts
parler-tts: protogen
	@echo "Installing $(CONDA_ENV_PATH)..."
	bash install.sh $(CONDA_ENV_PATH)

.PHONY: run
run: protogen
	@echo "Running parler-tts..."
	bash run.sh
	@echo "parler-tts run."

.PHONY: test
test: protogen
	@echo "Testing parler-tts..."
	bash test.sh
	@echo "parler-tts tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
39  backend/python/parler-tts/install.sh  Executable file
@@ -0,0 +1,39 @@
#!/bin/bash
set -ex

SKIP_CONDA=${SKIP_CONDA:-0}

# Returns success when the named conda environment does NOT exist yet
# (note the leading `!`), so the caller can create it on first run.
conda_env_exists(){
    ! conda list --name "${@}" >/dev/null 2>/dev/null
}

if [ $SKIP_CONDA -eq 1 ]; then
    echo "Skipping conda environment installation"
else
    export PATH=$PATH:/opt/conda/bin
    if conda_env_exists "parler" ; then
        echo "Creating virtual environment..."
        conda env create --name parler --file $1
        echo "Virtual environment created."
    else
        echo "Virtual environment already exists."
    fi
fi

if [ $SKIP_CONDA -ne 1 ]; then
    # Activate conda environment
    source activate parler
    # https://github.com/descriptinc/audiotools/issues/101
    # incompatible protobuf versions.
    curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o $CONDA_PREFIX/lib/python3.11/site-packages/google/protobuf/internal/builder.py
fi

if [ "$PIP_CACHE_PURGE" = true ] ; then
    if [ $SKIP_CONDA -ne 1 ]; then
        # Activate conda environment
        source activate parler
    fi

    pip cache purge
fi
48  backend/python/parler-tts/parler-nvidia.yml  Normal file
@@ -0,0 +1,48 @@
name: parler
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.11=h7f8727e_2
  - pip=23.2.1=py311h06a4308_0
  - python=3.11.5=h955ad1f_0
  - readline=8.2=h5eee18b_0
  - setuptools=68.0.0=py311h06a4308_0
  - sqlite=3.41.2=h5eee18b_0
  - tk=8.6.12=h1ccaba5_0
  - tzdata=2023c=h04d1e81_0
  - wheel=0.41.2=py311h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  - pip:
    - accelerate>=0.11.0
    - grpcio==1.59.0
    - numpy==1.26.0
    - nvidia-cublas-cu12==12.1.3.1
    - nvidia-cuda-cupti-cu12==12.1.105
    - nvidia-cuda-nvrtc-cu12==12.1.105
    - nvidia-cuda-runtime-cu12==12.1.105
    - nvidia-cudnn-cu12==8.9.2.26
    - nvidia-cufft-cu12==11.0.2.54
    - nvidia-curand-cu12==10.3.2.106
    - nvidia-cusolver-cu12==11.4.5.107
    - nvidia-cusparse-cu12==12.1.0.106
    - nvidia-nccl-cu12==2.18.1
    - nvidia-nvjitlink-cu12==12.2.140
    - nvidia-nvtx-cu12==12.1.105
    - torch==2.1.0
    - transformers>=4.34.0
    - descript-audio-codec
    - sentencepiece
    - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
prefix: /opt/conda/envs/diffusers
36  backend/python/parler-tts/parler.yml  Normal file
@@ -0,0 +1,36 @@
name: parler
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.11=h7f8727e_2
  - pip=23.2.1=py311h06a4308_0
  - python=3.11.5=h955ad1f_0
  - readline=8.2=h5eee18b_0
  - setuptools=68.0.0=py311h06a4308_0
  - sqlite=3.41.2=h5eee18b_0
  - tk=8.6.12=h1ccaba5_0
  - tzdata=2023c=h04d1e81_0
  - wheel=0.41.2=py311h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  - pip:
    - accelerate>=0.11.0
    - numpy==1.26.0
    - grpcio==1.59.0
    - torch==2.1.0
    - transformers>=4.34.0
    - descript-audio-codec
    - sentencepiece
    - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
prefix: /opt/conda/envs/parler
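For orientation (an editor's sketch, not part of the diff): a quick, hypothetical sanity check to run inside the freshly created parler environment; the version asserts simply mirror the pins above:

    import grpc
    import numpy
    import torch
    import transformers

    assert numpy.__version__ == '1.26.0'
    assert torch.__version__.startswith('2.1.0')
    # CUDA availability decides the device choice in the server below
    print('grpc', grpc.__version__, '| torch cuda:', torch.cuda.is_available())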
125  backend/python/parler-tts/parler_tts_server.py  Normal file
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
"""
Extra gRPC server for ParlerTTSForConditionalGeneration models.
"""
from concurrent import futures

import argparse
import signal
import sys
import os

import time
import backend_pb2
import backend_pb2_grpc

import grpc

from scipy.io.wavfile import write as write_wav

from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import torch

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS is specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer for the backend service.

    This class implements the gRPC methods for the backend service, including Health, LoadModel, and TTS.
    """
    def Health(self, request, context):
        """
        A gRPC method that returns the health status of the backend service.

        Args:
            request: A HealthMessage object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.

        Returns:
            A Reply object that contains the health status of the backend service.
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """
        A gRPC method that loads a model into memory.

        Args:
            request: A ModelOptions object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.

        Returns:
            A Result object that contains the result of the LoadModel operation.
        """
        model_name = request.Model
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        try:
            self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def TTS(self, request, context):
        model_name = request.model
        voice = request.voice
        if voice == "":
            voice = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
        if model_name == "":
            return backend_pb2.Result(success=False, message="request.model is required")
        try:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
            input_ids = self.tokenizer(voice, return_tensors="pt").input_ids.to(device)
            prompt_input_ids = self.tokenizer(request.text, return_tensors="pt").input_ids.to(device)

            generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
            audio_arr = generation.cpu().numpy().squeeze()
            print("[parler-tts] TTS generated!", file=sys.stderr)
            sf.write(request.dst, audio_arr, self.model.config.sampling_rate)
            print("[parler-tts] TTS saved to", request.dst, file=sys.stderr)
            print("[parler-tts] TTS for", file=sys.stderr)
            print(request, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)


def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("[parler-tts] Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("[parler-tts] Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()
    print(f"[parler-tts] startup: {args}", file=sys.stderr)
    serve(args.addr)
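For orientation (an editor's sketch, not part of the diff): a minimal client for the server above. The field names (Model, text, dst, voice) are taken from the servicer code, the model id mirrors the one used in the tests further down, and the server is assumed to be running on localhost:50051:

    import grpc
    import backend_pb2
    import backend_pb2_grpc

    with grpc.insecure_channel('localhost:50051') as channel:
        stub = backend_pb2_grpc.BackendStub(channel)

        # Load the model first; the servicer caches it on self.model/self.tokenizer.
        result = stub.LoadModel(backend_pb2.ModelOptions(Model='parler-tts/parler_tts_mini_v0.1'))
        assert result.success, result.message

        # dst is the server-side path where the generated WAV is written;
        # voice falls back to the default speaker description when empty.
        tts = stub.TTS(backend_pb2.TTSRequest(
            model='parler-tts/parler_tts_mini_v0.1',
            text='Hello from LocalAI!',
            dst='/tmp/parler.wav',
        ))
        print(tts.success, tts.message)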
16  backend/python/parler-tts/run.sh  Normal file
@@ -0,0 +1,16 @@
#!/bin/bash

##
## A bash script wrapper that runs the parler-tts server with conda

echo "Launching gRPC server for parler-tts"

export PATH=$PATH:/opt/conda/bin

# Activate conda environment
source activate parler

# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

python $DIR/parler_tts_server.py $@
11  backend/python/parler-tts/test.sh  Normal file
@@ -0,0 +1,11 @@
#!/bin/bash
##
## A bash script wrapper that runs the parler-tts test suite with conda

# Activate conda environment
source activate parler

# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

python -m unittest $DIR/test_parler.py
81  backend/python/parler-tts/test_parler.py  Normal file
@@ -0,0 +1,81 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc

import grpc


class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service
    """
    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "parler_tts_server.py", "--addr", "localhost:50051"])
        time.sleep(10)

    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server
        """
        self.service.terminate()
        self.service.wait()

    def test_server_startup(self):
        """
        This method tests if the server starts up successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
        finally:
            self.tearDown()

    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
        finally:
            self.tearDown()

    def test_tts(self):
        """
        This method tests if TTS audio is generated successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
                self.assertTrue(response.success)
                tts_request = backend_pb2.TTSRequest(text="Hey, how are you doing today?")
                tts_response = stub.TTS(tts_request)
                self.assertIsNotNone(tts_response)
        except Exception as err:
            print(err)
            self.fail("TTS service failed")
        finally:
            self.tearDown()
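For orientation (an editor's sketch, not part of the diff): a single test from the suite above can also be run programmatically, equivalent to `python -m unittest test_parler.TestBackendServicer.test_server_startup`:

    import unittest

    # Load exactly one test case by dotted name and run it with a verbose runner.
    suite = unittest.defaultTestLoader.loadTestsFromName(
        'test_parler.TestBackendServicer.test_server_startup'
    )
    unittest.TextTestRunner(verbosity=2).run(suite)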
@@ -1,17 +1,27 @@
 .PHONY: petals
-petals:
+petals: protogen
 	@echo "Creating virtual environment..."
 	bash install.sh "petals.yml"
 	@echo "Virtual environment created."

 .PHONY: run
-run:
+run: protogen
 	@echo "Running petals..."
 	bash run.sh
 	@echo "petals run."

 .PHONY: test
-test:
+test: protogen
 	@echo "Testing petals..."
 	bash test.sh
 	@echo "petals tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
[removed: backend_pb2_grpc.py — the same 363-line auto-generated gRPC stub file shown in full above, deleted verbatim from this backend's directory as well]
@@ -1,17 +1,27 @@
 .PHONY: sentencetransformers
-sentencetransformers:
+sentencetransformers: protogen
 	$(MAKE) -C ../common-env/transformers


 .PHONY: run
-run:
+run: protogen
 	@echo "Running sentencetransformers..."
 	bash run.sh
 	@echo "sentencetransformers run."

 # Note: this does not work well from the command line; it only works from an IDE such as VSCode.
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing sentencetransformers..."
 	bash test.sh
 	@echo "sentencetransformers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
|
||||
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
||||
"""Client and server classes corresponding to protobuf-defined services."""
|
||||
import grpc
|
||||
|
||||
import backend_pb2 as backend__pb2
|
||||
|
||||
|
||||
class BackendStub(object):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
|
||||
def __init__(self, channel):
|
||||
"""Constructor.
|
||||
|
||||
Args:
|
||||
channel: A grpc.Channel.
|
||||
"""
|
||||
self.Health = channel.unary_unary(
|
||||
'/backend.Backend/Health',
|
||||
request_serializer=backend__pb2.HealthMessage.SerializeToString,
|
||||
response_deserializer=backend__pb2.Reply.FromString,
|
||||
)
|
||||
self.Predict = channel.unary_unary(
|
||||
'/backend.Backend/Predict',
|
||||
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.Reply.FromString,
|
||||
)
|
||||
self.LoadModel = channel.unary_unary(
|
||||
'/backend.Backend/LoadModel',
|
||||
request_serializer=backend__pb2.ModelOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.Result.FromString,
|
||||
)
|
||||
self.PredictStream = channel.unary_stream(
|
||||
'/backend.Backend/PredictStream',
|
||||
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.Reply.FromString,
|
||||
)
|
||||
self.Embedding = channel.unary_unary(
|
||||
'/backend.Backend/Embedding',
|
||||
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.EmbeddingResult.FromString,
|
||||
)
|
||||
self.GenerateImage = channel.unary_unary(
|
||||
'/backend.Backend/GenerateImage',
|
||||
request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
|
||||
response_deserializer=backend__pb2.Result.FromString,
|
||||
)
|
||||
self.AudioTranscription = channel.unary_unary(
|
||||
'/backend.Backend/AudioTranscription',
|
||||
request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
|
||||
response_deserializer=backend__pb2.TranscriptResult.FromString,
|
||||
)
|
||||
self.TTS = channel.unary_unary(
|
||||
'/backend.Backend/TTS',
|
||||
request_serializer=backend__pb2.TTSRequest.SerializeToString,
|
||||
response_deserializer=backend__pb2.Result.FromString,
|
||||
)
|
||||
self.TokenizeString = channel.unary_unary(
|
||||
'/backend.Backend/TokenizeString',
|
||||
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.TokenizationResponse.FromString,
|
||||
)
|
||||
self.Status = channel.unary_unary(
|
||||
'/backend.Backend/Status',
|
||||
request_serializer=backend__pb2.HealthMessage.SerializeToString,
|
||||
response_deserializer=backend__pb2.StatusResponse.FromString,
|
||||
)
|
||||
|
||||
|
||||
class BackendServicer(object):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
|
||||
def Health(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def Predict(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def LoadModel(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def PredictStream(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def Embedding(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def GenerateImage(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def AudioTranscription(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def TTS(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def TokenizeString(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def Status(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
|
||||
def add_BackendServicer_to_server(servicer, server):
|
||||
rpc_method_handlers = {
|
||||
'Health': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.Health,
|
||||
request_deserializer=backend__pb2.HealthMessage.FromString,
|
||||
response_serializer=backend__pb2.Reply.SerializeToString,
|
||||
),
|
||||
'Predict': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.Predict,
|
||||
request_deserializer=backend__pb2.PredictOptions.FromString,
|
||||
response_serializer=backend__pb2.Reply.SerializeToString,
|
||||
),
|
||||
'LoadModel': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.LoadModel,
|
||||
request_deserializer=backend__pb2.ModelOptions.FromString,
|
||||
response_serializer=backend__pb2.Result.SerializeToString,
|
||||
),
|
||||
'PredictStream': grpc.unary_stream_rpc_method_handler(
|
||||
servicer.PredictStream,
|
||||
request_deserializer=backend__pb2.PredictOptions.FromString,
|
||||
response_serializer=backend__pb2.Reply.SerializeToString,
|
||||
),
|
||||
'Embedding': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.Embedding,
|
||||
request_deserializer=backend__pb2.PredictOptions.FromString,
|
||||
response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
|
||||
),
|
||||
'GenerateImage': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.GenerateImage,
|
||||
request_deserializer=backend__pb2.GenerateImageRequest.FromString,
|
||||
response_serializer=backend__pb2.Result.SerializeToString,
|
||||
),
|
||||
'AudioTranscription': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.AudioTranscription,
|
||||
request_deserializer=backend__pb2.TranscriptRequest.FromString,
|
||||
response_serializer=backend__pb2.TranscriptResult.SerializeToString,
|
||||
),
|
||||
'TTS': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.TTS,
|
||||
request_deserializer=backend__pb2.TTSRequest.FromString,
|
||||
response_serializer=backend__pb2.Result.SerializeToString,
|
||||
),
|
||||
'TokenizeString': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.TokenizeString,
|
||||
request_deserializer=backend__pb2.PredictOptions.FromString,
|
||||
response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
|
||||
),
|
||||
'Status': grpc.unary_unary_rpc_method_handler(
|
||||
servicer.Status,
|
||||
request_deserializer=backend__pb2.HealthMessage.FromString,
|
||||
response_serializer=backend__pb2.StatusResponse.SerializeToString,
|
||||
),
|
||||
}
|
||||
generic_handler = grpc.method_handlers_generic_handler(
|
||||
'backend.Backend', rpc_method_handlers)
|
||||
server.add_generic_rpc_handlers((generic_handler,))
|
||||
|
||||
|
||||
# This class is part of an EXPERIMENTAL API.
|
||||
class Backend(object):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
|
||||
@staticmethod
|
||||
def Health(request,
|
||||
target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
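For orientation, a minimal client sketch (not part of the diff) showing how the generated stub above is driven. The address, and the assumption that a backend process is already listening on it, are hypothetical; LocalAI normally launches these backends itself and assigns the port.

import grpc

import backend_pb2 as backend__pb2
import backend_pb2_grpc


def check_backend(address='localhost:50051'):
    # Plaintext channel to a running backend process (address is an assumption).
    with grpc.insecure_channel(address) as channel:
        stub = backend_pb2_grpc.BackendStub(channel)
        # Health takes a HealthMessage and returns a Reply, exactly as wired above.
        reply = stub.Health(backend__pb2.HealthMessage())
        return reply.message


if __name__ == '__main__':
    print(check_backend())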
@@ -1,16 +1,25 @@
 
 .PHONY: transformers-musicgen
-transformers-musicgen:
+transformers-musicgen: protogen
 	$(MAKE) -C ../common-env/transformers
 
 .PHONY: run
-run:
+run: protogen
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."
 
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
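The protogen rule added above can also be driven from Python; a sketch of the equivalent grpc_tools invocation (paths mirror the Makefile and assume the backend directory as the working directory):

from grpc_tools import protoc

# Emits backend_pb2.py (message classes) and backend_pb2_grpc.py (stub/servicer),
# matching the Makefile's protogen target.
protoc.main([
    'grpc_tools.protoc',
    '-I../..',            # backend.proto lives two directories up
    '--python_out=.',
    '--grpc_python_out=.',
    'backend.proto',
])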
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import backend_pb2 as backend__pb2


class BackendStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
                '/backend.Backend/Health',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Predict = channel.unary_unary(
                '/backend.Backend/Predict',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.LoadModel = channel.unary_unary(
                '/backend.Backend/LoadModel',
                request_serializer=backend__pb2.ModelOptions.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.PredictStream = channel.unary_stream(
                '/backend.Backend/PredictStream',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.Reply.FromString,
                )
        self.Embedding = channel.unary_unary(
                '/backend.Backend/Embedding',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.EmbeddingResult.FromString,
                )
        self.GenerateImage = channel.unary_unary(
                '/backend.Backend/GenerateImage',
                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.AudioTranscription = channel.unary_unary(
                '/backend.Backend/AudioTranscription',
                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
                response_deserializer=backend__pb2.TranscriptResult.FromString,
                )
        self.TTS = channel.unary_unary(
                '/backend.Backend/TTS',
                request_serializer=backend__pb2.TTSRequest.SerializeToString,
                response_deserializer=backend__pb2.Result.FromString,
                )
        self.TokenizeString = channel.unary_unary(
                '/backend.Backend/TokenizeString',
                request_serializer=backend__pb2.PredictOptions.SerializeToString,
                response_deserializer=backend__pb2.TokenizationResponse.FromString,
                )
        self.Status = channel.unary_unary(
                '/backend.Backend/Status',
                request_serializer=backend__pb2.HealthMessage.SerializeToString,
                response_deserializer=backend__pb2.StatusResponse.FromString,
                )


class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""

    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
            'Health': grpc.unary_unary_rpc_method_handler(
                    servicer.Health,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Predict': grpc.unary_unary_rpc_method_handler(
                    servicer.Predict,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'LoadModel': grpc.unary_unary_rpc_method_handler(
                    servicer.LoadModel,
                    request_deserializer=backend__pb2.ModelOptions.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'PredictStream': grpc.unary_stream_rpc_method_handler(
                    servicer.PredictStream,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.Reply.SerializeToString,
            ),
            'Embedding': grpc.unary_unary_rpc_method_handler(
                    servicer.Embedding,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
            ),
            'GenerateImage': grpc.unary_unary_rpc_method_handler(
                    servicer.GenerateImage,
                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
                    servicer.AudioTranscription,
                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
            ),
            'TTS': grpc.unary_unary_rpc_method_handler(
                    servicer.TTS,
                    request_deserializer=backend__pb2.TTSRequest.FromString,
                    response_serializer=backend__pb2.Result.SerializeToString,
            ),
            'TokenizeString': grpc.unary_unary_rpc_method_handler(
                    servicer.TokenizeString,
                    request_deserializer=backend__pb2.PredictOptions.FromString,
                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
            ),
            'Status': grpc.unary_unary_rpc_method_handler(
                    servicer.Status,
                    request_deserializer=backend__pb2.HealthMessage.FromString,
                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Backend(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
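A sketch (not part of the diff) of how a concrete backend fills in the generated servicer above: subclass BackendServicer, override the RPCs the backend supports, and register it with add_BackendServicer_to_server. Only Health is overridden here; the port and worker count are illustrative assumptions, and the bytes "OK" payload mirrors how LocalAI's Python backends reply.

from concurrent import futures

import grpc

import backend_pb2 as backend__pb2
import backend_pb2_grpc


class HealthyBackend(backend_pb2_grpc.BackendServicer):
    def Health(self, request, context):
        # Reply.message is a bytes field in backend.proto.
        return backend__pb2.Reply(message=bytes('OK', 'utf-8'))


def serve(port=50051):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    backend_pb2_grpc.add_BackendServicer_to_server(HealthyBackend(), server)
    server.add_insecure_port('[::]:{}'.format(port))
    server.start()
    server.wait_for_termination()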
@@ -8,7 +8,7 @@ echo "Launching gRPC server for transformers-musicgen"
|
||||
export PATH=$PATH:/opt/conda/bin
|
||||
|
||||
# Activate conda environment
|
||||
source activate transformers-musicgen
|
||||
source activate transformers
|
||||
|
||||
# get the directory where the bash script is located
|
||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
|
||||
@@ -1,16 +1,26 @@
 .PHONY: transformers
-transformers:
+transformers: protogen
 	$(MAKE) -C ../common-env/transformers
 
 .PHONY: run
-run:
+run: protogen
 	@echo "Running transformers..."
 	bash run.sh
 	@echo "transformers run."
 
+# Note: this does not work well from the command line; it only works with an IDE like VSCode.
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing transformers..."
 	bash test.sh
 	@echo "transformers tested."
 
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
@@ -3,18 +3,28 @@ export SKIP_CONDA=1
 endif
 
 .PHONY: ttsvalle
-ttsvalle:
+ttsvalle: protogen
 	$(MAKE) -C ../common-env/transformers
 	bash install.sh
 
 .PHONY: run
-run:
+run: protogen
 	@echo "Running ttsvalle..."
 	bash run.sh
 	@echo "ttsvalle run."
 
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing valle..."
 	bash test.sh
 	@echo "valle tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
@@ -1,15 +1,25 @@
 .PHONY: vllm
-vllm:
+vllm: protogen
 	$(MAKE) -C ../common-env/transformers
 
 .PHONY: run
-run:
+run: protogen
 	@echo "Running vllm..."
 	bash run.sh
 	@echo "vllm run."
 
 .PHONY: test
-test:
+test: protogen
 	@echo "Testing vllm..."
 	bash test.sh
 	@echo "vllm tested."
+	@echo "vllm tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
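Every backend regenerated in this change exposes the same unary-stream PredictStream RPC; a short consumer sketch (not part of the diff; the Prompt field name is assumed from LocalAI's backend.proto, and the address is hypothetical):

import grpc

import backend_pb2 as backend__pb2
import backend_pb2_grpc


def stream_completion(address='localhost:50051', prompt='Hello'):
    with grpc.insecure_channel(address) as channel:
        stub = backend_pb2_grpc.BackendStub(channel)
        # unary_stream: one PredictOptions in, an iterator of Reply messages out.
        for reply in stub.PredictStream(backend__pb2.PredictOptions(Prompt=prompt)):
            print(reply.message.decode('utf-8'), end='', flush=True)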
File diff suppressed because one or more lines are too long
@@ -1,363 +0,0 @@
|
||||
|
||||
@staticmethod
|
||||
def Embedding(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
|
||||
backend__pb2.PredictOptions.SerializeToString,
|
||||
backend__pb2.EmbeddingResult.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def GenerateImage(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
|
||||
backend__pb2.GenerateImageRequest.SerializeToString,
|
||||
backend__pb2.Result.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def AudioTranscription(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
|
||||
backend__pb2.TranscriptRequest.SerializeToString,
|
||||
backend__pb2.TranscriptResult.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def TTS(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
|
||||
backend__pb2.TTSRequest.SerializeToString,
|
||||
backend__pb2.Result.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def TokenizeString(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
|
||||
backend__pb2.PredictOptions.SerializeToString,
|
||||
backend__pb2.TokenizationResponse.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
|
||||
@staticmethod
|
||||
def Status(request,
|
||||
target,
|
||||
options=(),
|
||||
channel_credentials=None,
|
||||
call_credentials=None,
|
||||
insecure=False,
|
||||
compression=None,
|
||||
wait_for_ready=None,
|
||||
timeout=None,
|
||||
metadata=None):
|
||||
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
|
||||
backend__pb2.HealthMessage.SerializeToString,
|
||||
backend__pb2.StatusResponse.FromString,
|
||||
options, channel_credentials,
|
||||
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
|
||||
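The generated stubs above fix the full backend.Backend RPC surface (method names, request and response message types). As a quick illustration, not part of this changeset, a Go client can probe the Health RPC over the same wire protocol; the pb package here is a hypothetical stand-in for Go stubs generated from backend.proto, and the address is a placeholder:

package main

import (
    "context"
    "log"
    "time"

    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"

    pb "example.com/localai/backendpb" // hypothetical generated stubs
)

func main() {
    conn, err := grpc.Dial("127.0.0.1:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
    if err != nil {
        log.Fatal(err)
    }
    defer conn.Close()

    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()

    // Same method name and message types as the Python stub above:
    // '/backend.Backend/Health' takes a HealthMessage and returns a Reply.
    in := &pb.HealthMessage{}
    out := &pb.Reply{}
    if err := conn.Invoke(ctx, "/backend.Backend/Health", in, out); err != nil {
        log.Fatal(err)
    }
    log.Printf("backend health: %s", out.Message)
}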
@@ -14,6 +14,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
+from vllm.transformers_utils.tokenizer import get_tokenizer

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -71,7 +72,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         """
         return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

-    def LoadModel(self, request, context):
+    async def LoadModel(self, request, context):
         """
         Loads a language model.

@@ -103,6 +104,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        try:
+            engine_model_config = await self.llm.get_model_config()
+            self.tokenizer = get_tokenizer(
+                engine_model_config.tokenizer,
+                tokenizer_mode=engine_model_config.tokenizer_mode,
+                trust_remote_code=engine_model_config.trust_remote_code,
+                truncation_side="left",
+            )
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

         return backend_pb2.Result(message="Model loaded successfully", success=True)

     async def Predict(self, request, context):
@@ -161,9 +174,15 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if request.Seed != 0:
             sampling_params.seed = request.Seed

+        prompt = request.Prompt
+
+        # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
+        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
+            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
+
         # Generate text
         request_id = random_uuid()
-        outputs = self.llm.generate(request.Prompt, sampling_params, request_id)
+        outputs = self.llm.generate(prompt, sampling_params, request_id)

         # Stream the results
         generated_text = ""
@@ -2,14 +2,100 @@ package backend

 import (
-	"fmt"
+	"time"

 	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/google/uuid"

+	"github.com/go-skynet/LocalAI/pkg/concurrency"
 	"github.com/go-skynet/LocalAI/pkg/grpc"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/model"
 )

-func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
+type EmbeddingsBackendService struct {
+	ml        *model.ModelLoader
+	bcl       *config.BackendConfigLoader
+	appConfig *config.ApplicationConfig
+}
+
+func NewEmbeddingsBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *EmbeddingsBackendService {
+	return &EmbeddingsBackendService{
+		ml:        ml,
+		bcl:       bcl,
+		appConfig: appConfig,
+	}
+}
+
+func (ebs *EmbeddingsBackendService) Embeddings(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.OpenAIResponse] {
+
+	resultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
+	go func(request *schema.OpenAIRequest) {
+		if request.Model == "" {
+			request.Model = model.StableDiffusionBackend
+		}
+
+		bc, request, err := ebs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, ebs.appConfig)
+		if err != nil {
+			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+			close(resultChannel)
+			return
+		}
+
+		items := []schema.Item{}
+
+		for i, s := range bc.InputToken {
+			// get the model function to call for the result
+			embedFn, err := modelEmbedding("", s, ebs.ml, bc, ebs.appConfig)
+			if err != nil {
+				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+				close(resultChannel)
+				return
+			}
+
+			embeddings, err := embedFn()
+			if err != nil {
+				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+				close(resultChannel)
+				return
+			}
+			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
+		}
+
+		for i, s := range bc.InputStrings {
+			// get the model function to call for the result
+			embedFn, err := modelEmbedding(s, []int{}, ebs.ml, bc, ebs.appConfig)
+			if err != nil {
+				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+				close(resultChannel)
+				return
+			}
+
+			embeddings, err := embedFn()
+			if err != nil {
+				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+				close(resultChannel)
+				return
+			}
+			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
+		}
+
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Model:   request.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Data:    items,
+			Object:  "list",
+		}
+		resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: resp}
+		close(resultChannel)
+	}(request)
+	return resultChannel
+}
+
+func modelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
 	modelFile := backendConfig.Model

 	grpcOpts := gRPCModelOpts(backendConfig)
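Since the new Embeddings method publishes exactly one ErrorOr value and then closes the channel, a caller simply drains it once. A minimal consumer sketch (not from the diff; the model name and the ml/bcl/appConfig wiring are assumptions):

package examples // hypothetical package for illustration

import (
    "fmt"

    "github.com/go-skynet/LocalAI/core/backend"
    "github.com/go-skynet/LocalAI/core/config"
    "github.com/go-skynet/LocalAI/core/schema"
    "github.com/go-skynet/LocalAI/pkg/model"
)

func printEmbeddings(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) error {
    ebs := backend.NewEmbeddingsBackendService(ml, bcl, appConfig)

    req := &schema.OpenAIRequest{}
    req.Model = "bert-embeddings" // hypothetical model name; must resolve in ModelsPath

    // The goroutine inside Embeddings sends one ErrorOr value, then closes the channel.
    res := <-ebs.Embeddings(req)
    if res.Error != nil {
        return res.Error
    }
    for _, item := range res.Value.Data {
        fmt.Printf("input %d -> %d dimensions\n", item.Index, len(item.Embedding))
    }
    return nil
}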
@@ -1,18 +1,252 @@
 package backend

 import (
-	"github.com/go-skynet/LocalAI/core/config"
+	"bufio"
+	"encoding/base64"
 	"fmt"
+	"io"
+	"net/http"
 	"os"
+	"path/filepath"
+	"strconv"
 	"strings"
+	"time"

+	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/google/uuid"
+	"github.com/rs/zerolog/log"
+
+	"github.com/go-skynet/LocalAI/pkg/concurrency"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/model"
 )

-func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
+type ImageGenerationBackendService struct {
+	ml                        *model.ModelLoader
+	bcl                       *config.BackendConfigLoader
+	appConfig                 *config.ApplicationConfig
+	BaseUrlForGeneratedImages string
+}
+
+func NewImageGenerationBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *ImageGenerationBackendService {
+	return &ImageGenerationBackendService{
+		ml:        ml,
+		bcl:       bcl,
+		appConfig: appConfig,
+	}
+}
+
+func (igbs *ImageGenerationBackendService) GenerateImage(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.OpenAIResponse] {
+	resultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
+	go func(request *schema.OpenAIRequest) {
+		bc, request, err := igbs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, igbs.appConfig)
+		if err != nil {
+			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+			close(resultChannel)
+			return
+		}
+
+		src := ""
+		if request.File != "" {
+
+			var fileData []byte
+			// check if input.File is an URL, if so download it and save it
+			// to a temporary file
+			if strings.HasPrefix(request.File, "http://") || strings.HasPrefix(request.File, "https://") {
+				out, err := downloadFile(request.File)
+				if err != nil {
+					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("failed downloading file:%w", err)}
+					close(resultChannel)
+					return
+				}
+				defer os.RemoveAll(out)
+
+				fileData, err = os.ReadFile(out)
+				if err != nil {
+					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("failed reading file:%w", err)}
+					close(resultChannel)
+					return
+				}
+
+			} else {
+				// base 64 decode the file and write it somewhere
+				// that we will cleanup
+				fileData, err = base64.StdEncoding.DecodeString(request.File)
+				if err != nil {
+					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+					close(resultChannel)
+					return
+				}
+			}
+
+			// Create a temporary file
+			outputFile, err := os.CreateTemp(igbs.appConfig.ImageDir, "b64")
+			if err != nil {
+				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+				close(resultChannel)
+				return
+			}
+			// write the base64 result
+			writer := bufio.NewWriter(outputFile)
+			_, err = writer.Write(fileData)
+			if err != nil {
+				outputFile.Close()
+				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+				close(resultChannel)
+				return
+			}
+			outputFile.Close()
+			src = outputFile.Name()
+			defer os.RemoveAll(src)
+		}
+
+		log.Debug().Msgf("Parameter Config: %+v", bc)
+
+		switch bc.Backend {
+		case "stablediffusion":
+			bc.Backend = model.StableDiffusionBackend
+		case "tinydream":
+			bc.Backend = model.TinyDreamBackend
+		case "":
+			bc.Backend = model.StableDiffusionBackend
+			if bc.Model == "" {
+				bc.Model = "stablediffusion_assets" // TODO: check?
+			}
+		}
+
+		sizeParts := strings.Split(request.Size, "x")
+		if len(sizeParts) != 2 {
+			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
+			close(resultChannel)
+			return
+		}
+		width, err := strconv.Atoi(sizeParts[0])
+		if err != nil {
+			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
+			close(resultChannel)
+			return
+		}
+		height, err := strconv.Atoi(sizeParts[1])
+		if err != nil {
+			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
+			close(resultChannel)
+			return
+		}
+
+		b64JSON := false
+		if request.ResponseFormat.Type == "b64_json" {
+			b64JSON = true
+		}
+		// src and clip_skip
+		var result []schema.Item
+		for _, i := range bc.PromptStrings {
+			n := request.N
+			if request.N == 0 {
+				n = 1
+			}
+			for j := 0; j < n; j++ {
+				prompts := strings.Split(i, "|")
+				positive_prompt := prompts[0]
+				negative_prompt := ""
+				if len(prompts) > 1 {
+					negative_prompt = prompts[1]
+				}
+
+				mode := 0
+				step := bc.Step
+				if step == 0 {
+					step = 15
+				}
+
+				if request.Mode != 0 {
+					mode = request.Mode
+				}
+
+				if request.Step != 0 {
+					step = request.Step
+				}
+
+				tempDir := ""
+				if !b64JSON {
+					tempDir = igbs.appConfig.ImageDir
+				}
+				// Create a temporary file
+				outputFile, err := os.CreateTemp(tempDir, "b64")
+				if err != nil {
+					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+					close(resultChannel)
+					return
+				}
+				outputFile.Close()
+				output := outputFile.Name() + ".png"
+				// Rename the temporary file
+				err = os.Rename(outputFile.Name(), output)
+				if err != nil {
+					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+					close(resultChannel)
+					return
+				}
+
+				if request.Seed == nil {
+					zVal := 0 // Idiomatic way to do this? Actually needed?
+					request.Seed = &zVal
+				}
+
+				fn, err := imageGeneration(height, width, mode, step, *request.Seed, positive_prompt, negative_prompt, src, output, igbs.ml, bc, igbs.appConfig)
+				if err != nil {
+					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+					close(resultChannel)
+					return
+				}
+				if err := fn(); err != nil {
+					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+					close(resultChannel)
+					return
+				}
+
+				item := &schema.Item{}
+
+				if b64JSON {
+					defer os.RemoveAll(output)
+					data, err := os.ReadFile(output)
+					if err != nil {
+						resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
+						close(resultChannel)
+						return
+					}
+					item.B64JSON = base64.StdEncoding.EncodeToString(data)
+				} else {
+					base := filepath.Base(output)
+					item.URL = igbs.BaseUrlForGeneratedImages + base
+				}
+
+				result = append(result, *item)
+			}
+		}
+
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Data:    result,
+		}
+		resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: resp}
+		close(resultChannel)
+	}(request)
+	return resultChannel
+}
+
+func imageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {

 	threads := backendConfig.Threads
 	if *threads == 0 && appConfig.Threads != 0 {
 		threads = &appConfig.Threads
 	}

 	gRPCOpts := gRPCModelOpts(backendConfig)

 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(backendConfig.Backend),
 		model.WithAssetDir(appConfig.AssetsDestination),
@@ -50,3 +284,24 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat

 	return fn, nil
 }
+
+// TODO: Replace this function with pkg/downloader - no reason to have a (crappier) bespoke download file fn here, but get things working before that change.
+func downloadFile(url string) (string, error) {
+	// Get the data
+	resp, err := http.Get(url)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	// Create the file
+	out, err := os.CreateTemp("", "image")
+	if err != nil {
+		return "", err
+	}
+	defer out.Close()
+
+	// Write the body to file
+	_, err = io.Copy(out, resp.Body)
+	return out.Name(), err
+}
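A usage sketch for the image service above (base URL, model name and prompt are illustrative; how the request prompt flows into bc.PromptStrings is handled by the config loader). Note the convention visible in the code: a prompt string is split on "|" into positive and negative parts. Imports as in the embeddings sketch earlier:

func generateOneImage(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) error {
    igbs := backend.NewImageGenerationBackendService(ml, bcl, appConfig)
    igbs.BaseUrlForGeneratedImages = "http://localhost:8080/generated-images/" // assumed public route

    req := &schema.OpenAIRequest{}
    req.Model = "stablediffusion"      // mapped onto model.StableDiffusionBackend above
    req.Size = "512x512"               // parsed into width and height
    req.Prompt = "a cozy cabin|blurry" // positive|negative prompt convention

    res := <-igbs.GenerateImage(req)
    if res.Error != nil {
        return res.Error
    }
    for _, item := range res.Value.Data {
        fmt.Println(item.URL) // item.B64JSON instead when response_format is "b64_json"
    }
    return nil
}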
@@ -2,6 +2,7 @@ package backend

 import (
 	"context"
+	"fmt"
 	"os"
 	"regexp"
 	"strings"
@@ -9,16 +10,23 @@ import (
 	"unicode/utf8"

 	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/rs/zerolog/log"

+	"github.com/go-skynet/LocalAI/pkg/concurrency"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/grpc"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 )

-type LLMResponse struct {
-	Response string // should this be []byte?
-	Usage    TokenUsage
+type LLMRequest struct {
+	Id          int // TODO Remove if not used.
+	Text        string
+	Images      []string
+	RawMessages []schema.Message
+	// TODO: Other Modalities?
 }

 type TokenUsage struct {
@@ -26,85 +34,127 @@ type TokenUsage struct {
 	Completion int
 }

-func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
-	modelFile := c.Model
-	threads := c.Threads
-	if *threads == 0 && o.Threads != 0 {
-		threads = &o.Threads
+type LLMResponse struct {
+	Request  *LLMRequest
+	Response string // should this be []byte?
+	Usage    TokenUsage
+}
+
+// TODO: Does this belong here or in core/services/openai.go?
+type LLMResponseBundle struct {
+	Request  *schema.OpenAIRequest
+	Response []schema.Choice
+	Usage    TokenUsage
+}
+
+type LLMBackendService struct {
+	bcl        *config.BackendConfigLoader
+	ml         *model.ModelLoader
+	appConfig  *config.ApplicationConfig
+	ftMutex    sync.Mutex
+	cutstrings map[string]*regexp.Regexp
+}
+
+func NewLLMBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *LLMBackendService {
+	return &LLMBackendService{
+		bcl:        bcl,
+		ml:         ml,
+		appConfig:  appConfig,
+		ftMutex:    sync.Mutex{},
+		cutstrings: make(map[string]*regexp.Regexp),
 	}
-	grpcOpts := gRPCModelOpts(c)
+}

+// TODO: Should ctx param be removed and replaced with hardcoded req.Context?
+func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest, bc *config.BackendConfig, enableTokenChannel bool) (
+	resultChannel <-chan concurrency.ErrorOr[*LLMResponse], tokenChannel <-chan concurrency.ErrorOr[*LLMResponse], err error) {
+
+	threads := bc.Threads
+	if (threads == nil || *threads == 0) && llmbs.appConfig.Threads != 0 {
+		threads = &llmbs.appConfig.Threads
+	}
+
+	grpcOpts := gRPCModelOpts(bc)
+
 	var inferenceModel grpc.Backend
-	var err error

-	opts := modelOpts(c, o, []model.Option{
+	opts := modelOpts(bc, llmbs.appConfig, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
-		model.WithAssetDir(o.AssetsDestination),
-		model.WithModel(modelFile),
-		model.WithContext(o.Context),
+		model.WithAssetDir(llmbs.appConfig.AssetsDestination),
+		model.WithModel(bc.Model),
+		model.WithContext(llmbs.appConfig.Context),
 	})

-	if c.Backend != "" {
-		opts = append(opts, model.WithBackendString(c.Backend))
+	if bc.Backend != "" {
+		opts = append(opts, model.WithBackendString(bc.Backend))
 	}

-	// Check if the modelFile exists, if it doesn't try to load it from the gallery
-	if o.AutoloadGalleries { // experimental
-		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
+	// Check if bc.Model exists, if it doesn't try to load it from the gallery
+	if llmbs.appConfig.AutoloadGalleries { // experimental
+		if _, err := os.Stat(bc.Model); os.IsNotExist(err) {
 			utils.ResetDownloadTimers()
 			// if we failed to load the model, we try to download it
-			err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
+			err := gallery.InstallModelFromGalleryByName(llmbs.appConfig.Galleries, bc.Model, llmbs.appConfig.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
 			if err != nil {
-				return nil, err
+				return nil, nil, err
 			}
 		}
 	}

-	if c.Backend == "" {
-		inferenceModel, err = loader.GreedyLoader(opts...)
+	if bc.Backend == "" {
+		log.Debug().Msgf("backend not known for %q, falling back to greedy loader to find it", bc.Model)
+		inferenceModel, err = llmbs.ml.GreedyLoader(opts...)
 	} else {
-		inferenceModel, err = loader.BackendLoader(opts...)
+		inferenceModel, err = llmbs.ml.BackendLoader(opts...)
 	}

 	if err != nil {
-		return nil, err
+		log.Error().Err(err).Msg("[llmbs.Inference] failed to load a backend")
+		return
 	}

-	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
-	fn := func() (LLMResponse, error) {
-		opts := gRPCPredictOpts(c, loader.ModelPath)
-		opts.Prompt = s
-		opts.Images = images
+	grpcPredOpts := gRPCPredictOpts(bc, llmbs.appConfig.ModelPath)
+	grpcPredOpts.Prompt = req.Text
+	grpcPredOpts.Images = req.Images

-		tokenUsage := TokenUsage{}
-
-		// check the per-model feature flag for usage, since tokenCallback may have a cost.
-		// Defaults to off as for now it is still experimental
-		if c.FeatureFlag.Enabled("usage") {
-			userTokenCallback := tokenCallback
-			if userTokenCallback == nil {
-				userTokenCallback = func(token string, usage TokenUsage) bool {
-					return true
-				}
-			}
-
-			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
-			if pErr == nil && promptInfo.Length > 0 {
-				tokenUsage.Prompt = int(promptInfo.Length)
-			}
-
-			tokenCallback = func(token string, usage TokenUsage) bool {
-				tokenUsage.Completion++
-				return userTokenCallback(token, tokenUsage)
-			}
-		}
+	if bc.TemplateConfig.UseTokenizerTemplate && req.Text == "" {
+		grpcPredOpts.UseTokenizerTemplate = true
+		protoMessages := make([]*proto.Message, len(req.RawMessages), len(req.RawMessages))
+		for i, message := range req.RawMessages {
+			protoMessages[i] = &proto.Message{
+				Role: message.Role,
+			}
+			switch ct := message.Content.(type) {
+			case string:
+				protoMessages[i].Content = ct
+			default:
+				err = fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
+				return
+			}
+		}
+	}

-		if tokenCallback != nil {
-			ss := ""
+	tokenUsage := TokenUsage{}
+
+	promptInfo, pErr := inferenceModel.TokenizeString(ctx, grpcPredOpts)
+	if pErr == nil && promptInfo.Length > 0 {
+		tokenUsage.Prompt = int(promptInfo.Length)
+	}
+
+	rawResultChannel := make(chan concurrency.ErrorOr[*LLMResponse])
+	// TODO this next line is the biggest argument for taking named return values _back_ out!!!
+	var rawTokenChannel chan concurrency.ErrorOr[*LLMResponse]
+
+	if enableTokenChannel {
+		rawTokenChannel = make(chan concurrency.ErrorOr[*LLMResponse])
+
+		// TODO Needs better name
+		ss := ""

 		go func() {
 			var partialRune []byte
-			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
+			err := inferenceModel.PredictStream(ctx, grpcPredOpts, func(chars []byte) {
 				partialRune = append(partialRune, chars...)

 				for len(partialRune) > 0 {
@@ -114,48 +164,126 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
 					break
 				}

-				tokenCallback(string(r), tokenUsage)
+				tokenUsage.Completion++
+				rawTokenChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
+					Response: string(r),
+					Usage:    tokenUsage,
+				}}

 				ss += string(r)

 				partialRune = partialRune[size:]
 			}
 		})
-			return LLMResponse{
-				Response: ss,
-				Usage:    tokenUsage,
-			}, err
-		} else {
-			// TODO: Is the chicken bit the only way to get here? is that acceptable?
-			reply, err := inferenceModel.Predict(ctx, opts)
+			close(rawTokenChannel)
 			if err != nil {
-				return LLMResponse{}, err
+				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
+			} else {
+				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
+					Response: ss,
+					Usage:    tokenUsage,
+				}}
 			}
-			return LLMResponse{
-				Response: string(reply.Message),
-				Usage:    tokenUsage,
-			}, err
-		}
+			close(rawResultChannel)
+		}()
+	} else {
+		go func() {
+			reply, err := inferenceModel.Predict(ctx, grpcPredOpts)
+			if tokenUsage.Prompt == 0 {
+				tokenUsage.Prompt = int(reply.PromptTokens)
+			}
+			if tokenUsage.Completion == 0 {
+				tokenUsage.Completion = int(reply.Tokens)
+			}
+			if err != nil {
+				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
+				close(rawResultChannel)
+			} else {
+				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
+					Response: string(reply.Message),
+					Usage:    tokenUsage,
+				}}
+				close(rawResultChannel)
+			}
+		}()
+	}

-	return fn, nil
+	resultChannel = rawResultChannel
+	tokenChannel = rawTokenChannel
+	return
 }

-var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
-var mu sync.Mutex = sync.Mutex{}
+// TODO: Should predInput be a seperate param still, or should this fn handle extracting it from request??
+func (llmbs *LLMBackendService) GenerateText(predInput string, request *schema.OpenAIRequest, bc *config.BackendConfig,
+	mappingFn func(*LLMResponse) schema.Choice, enableCompletionChannels bool, enableTokenChannels bool) (
+	// Returns:
+	resultChannel <-chan concurrency.ErrorOr[*LLMResponseBundle], completionChannels []<-chan concurrency.ErrorOr[*LLMResponse], tokenChannels []<-chan concurrency.ErrorOr[*LLMResponse], err error) {

-func Finetune(config config.BackendConfig, input, prediction string) string {
+	rawChannel := make(chan concurrency.ErrorOr[*LLMResponseBundle])
+	resultChannel = rawChannel
+
+	if request.N == 0 { // number of completions to return
+		request.N = 1
+	}
+	images := []string{}
+	for _, m := range request.Messages {
+		images = append(images, m.StringImages...)
+	}
+
+	for i := 0; i < request.N; i++ {
+
+		individualResultChannel, tokenChannel, infErr := llmbs.Inference(request.Context, &LLMRequest{
+			Text:        predInput,
+			Images:      images,
+			RawMessages: request.Messages,
+		}, bc, enableTokenChannels)
+		if infErr != nil {
+			err = infErr // Avoids complaints about redeclaring err but looks dumb
+			return
+		}
+		completionChannels = append(completionChannels, individualResultChannel)
+		tokenChannels = append(tokenChannels, tokenChannel)
+	}
+
+	go func() {
+		initialBundle := LLMResponseBundle{
+			Request:  request,
+			Response: []schema.Choice{},
+			Usage:    TokenUsage{},
+		}
+
+		wg := concurrency.SliceOfChannelsReducer(completionChannels, rawChannel, func(iv concurrency.ErrorOr[*LLMResponse], ov concurrency.ErrorOr[*LLMResponseBundle]) concurrency.ErrorOr[*LLMResponseBundle] {
+			if iv.Error != nil {
+				ov.Error = iv.Error
+				// TODO: Decide if we should wipe partials or not?
+				return ov
+			}
+			ov.Value.Usage.Prompt += iv.Value.Usage.Prompt
+			ov.Value.Usage.Completion += iv.Value.Usage.Completion
+
+			ov.Value.Response = append(ov.Value.Response, mappingFn(iv.Value))
+			return ov
+		}, concurrency.ErrorOr[*LLMResponseBundle]{Value: &initialBundle}, true)
+		wg.Wait()
+
+	}()
+
+	return
+}
+
+func (llmbs *LLMBackendService) Finetune(config config.BackendConfig, input, prediction string) string {
 	if config.Echo {
 		prediction = input + prediction
 	}

 	for _, c := range config.Cutstrings {
-		mu.Lock()
-		reg, ok := cutstrings[c]
+		llmbs.ftMutex.Lock()
+		reg, ok := llmbs.cutstrings[c]
 		if !ok {
-			cutstrings[c] = regexp.MustCompile(c)
-			reg = cutstrings[c]
+			llmbs.cutstrings[c] = regexp.MustCompile(c)
+			reg = llmbs.cutstrings[c]
 		}
-		mu.Unlock()
+		llmbs.ftMutex.Unlock()
 		prediction = reg.ReplaceAllString(prediction, "")
 	}
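A sketch of driving the channel-based Inference API above (the llmbs service and the loaded *config.BackendConfig are assumed to be wired up elsewhere). With enableTokenChannel set, per-token results stream on the second channel while the aggregated response arrives on the first:

func streamCompletion(llmbs *backend.LLMBackendService, bc *config.BackendConfig) error {
    resultCh, tokenCh, err := llmbs.Inference(context.Background(), &backend.LLMRequest{
        Text: "Q: What is the capital of France? A:",
    }, bc, true)
    if err != nil {
        return err
    }

    // Tokens arrive incrementally until the producer goroutine closes the channel.
    for tok := range tokenCh {
        if tok.Error == nil {
            fmt.Print(tok.Value.Response)
        }
    }

    // The final aggregate carries the full text plus the token accounting.
    res := <-resultCh
    if res.Error != nil {
        return res.Error
    }
    fmt.Printf("\n[%d prompt / %d completion tokens]\n", res.Value.Usage.Prompt, res.Value.Usage.Completion)
    return nil
}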
@@ -10,7 +10,7 @@ import (
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
+func modelOpts(bc *config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
 	if so.SingleBackend {
 		opts = append(opts, model.WithSingleActiveBackend())
 	}
@@ -19,12 +19,12 @@ func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []mode
 		opts = append(opts, model.EnableParallelRequests)
 	}

-	if c.GRPC.Attempts != 0 {
-		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
+	if bc.GRPC.Attempts != 0 {
+		opts = append(opts, model.WithGRPCAttempts(bc.GRPC.Attempts))
 	}

-	if c.GRPC.AttemptsSleepTime != 0 {
-		opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
+	if bc.GRPC.AttemptsSleepTime != 0 {
+		opts = append(opts, model.WithGRPCAttemptsDelay(bc.GRPC.AttemptsSleepTime))
 	}

 	for k, v := range so.ExternalGRPCBackends {
@@ -34,7 +34,7 @@ func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []mode
 	return opts
 }

-func getSeed(c config.BackendConfig) int32 {
+func getSeed(c *config.BackendConfig) int32 {
 	seed := int32(*c.Seed)
 	if seed == config.RAND_SEED {
 		seed = rand.Int31()
@@ -43,7 +43,7 @@ func getSeed(c config.BackendConfig) int32 {
 	return seed
 }

-func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
+func gRPCModelOpts(c *config.BackendConfig) *pb.ModelOptions {
 	b := 512
 	if c.Batch != 0 {
 		b = c.Batch
@@ -104,47 +104,47 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	}
 }

-func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOptions {
+func gRPCPredictOpts(bc *config.BackendConfig, modelPath string) *pb.PredictOptions {
 	promptCachePath := ""
-	if c.PromptCachePath != "" {
-		p := filepath.Join(modelPath, c.PromptCachePath)
+	if bc.PromptCachePath != "" {
+		p := filepath.Join(modelPath, bc.PromptCachePath)
 		os.MkdirAll(filepath.Dir(p), 0755)
 		promptCachePath = p
 	}

 	return &pb.PredictOptions{
-		Temperature:         float32(*c.Temperature),
-		TopP:                float32(*c.TopP),
-		NDraft:              c.NDraft,
-		TopK:                int32(*c.TopK),
-		Tokens:              int32(*c.Maxtokens),
-		Threads:             int32(*c.Threads),
-		PromptCacheAll:      c.PromptCacheAll,
-		PromptCacheRO:       c.PromptCacheRO,
+		Temperature:         float32(*bc.Temperature),
+		TopP:                float32(*bc.TopP),
+		NDraft:              bc.NDraft,
+		TopK:                int32(*bc.TopK),
+		Tokens:              int32(*bc.Maxtokens),
+		Threads:             int32(*bc.Threads),
+		PromptCacheAll:      bc.PromptCacheAll,
+		PromptCacheRO:       bc.PromptCacheRO,
 		PromptCachePath:     promptCachePath,
-		F16KV:               *c.F16,
-		DebugMode:           *c.Debug,
-		Grammar:             c.Grammar,
-		NegativePromptScale: c.NegativePromptScale,
-		RopeFreqBase:        c.RopeFreqBase,
-		RopeFreqScale:       c.RopeFreqScale,
-		NegativePrompt:      c.NegativePrompt,
-		Mirostat:            int32(*c.LLMConfig.Mirostat),
-		MirostatETA:         float32(*c.LLMConfig.MirostatETA),
-		MirostatTAU:         float32(*c.LLMConfig.MirostatTAU),
-		Debug:               *c.Debug,
-		StopPrompts:         c.StopWords,
-		Repeat:              int32(c.RepeatPenalty),
-		NKeep:               int32(c.Keep),
-		Batch:               int32(c.Batch),
-		IgnoreEOS:           c.IgnoreEOS,
-		Seed:                getSeed(c),
-		FrequencyPenalty:    float32(c.FrequencyPenalty),
-		MLock:               *c.MMlock,
-		MMap:                *c.MMap,
-		MainGPU:             c.MainGPU,
-		TensorSplit:         c.TensorSplit,
-		TailFreeSamplingZ:   float32(*c.TFZ),
-		TypicalP:            float32(*c.TypicalP),
+		F16KV:               *bc.F16,
+		DebugMode:           *bc.Debug,
+		Grammar:             bc.Grammar,
+		NegativePromptScale: bc.NegativePromptScale,
+		RopeFreqBase:        bc.RopeFreqBase,
+		RopeFreqScale:       bc.RopeFreqScale,
+		NegativePrompt:      bc.NegativePrompt,
+		Mirostat:            int32(*bc.LLMConfig.Mirostat),
+		MirostatETA:         float32(*bc.LLMConfig.MirostatETA),
+		MirostatTAU:         float32(*bc.LLMConfig.MirostatTAU),
+		Debug:               *bc.Debug,
+		StopPrompts:         bc.StopWords,
+		Repeat:              int32(bc.RepeatPenalty),
+		NKeep:               int32(bc.Keep),
+		Batch:               int32(bc.Batch),
+		IgnoreEOS:           bc.IgnoreEOS,
+		Seed:                getSeed(bc),
+		FrequencyPenalty:    float32(bc.FrequencyPenalty),
+		MLock:               *bc.MMlock,
+		MMap:                *bc.MMap,
+		MainGPU:             bc.MainGPU,
+		TensorSplit:         bc.TensorSplit,
+		TailFreeSamplingZ:   float32(*bc.TFZ),
+		TypicalP:            float32(*bc.TypicalP),
 	}
 }
@@ -7,11 +7,48 @@ import (
 	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/core/schema"

+	"github.com/go-skynet/LocalAI/pkg/concurrency"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/model"
 )

-func ModelTranscription(audio, language string, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.Result, error) {
+type TranscriptionBackendService struct {
+	ml        *model.ModelLoader
+	bcl       *config.BackendConfigLoader
+	appConfig *config.ApplicationConfig
+}
+
+func NewTranscriptionBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *TranscriptionBackendService {
+	return &TranscriptionBackendService{
+		ml:        ml,
+		bcl:       bcl,
+		appConfig: appConfig,
+	}
+}
+
+func (tbs *TranscriptionBackendService) Transcribe(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.TranscriptionResult] {
+	responseChannel := make(chan concurrency.ErrorOr[*schema.TranscriptionResult])
+	go func(request *schema.OpenAIRequest) {
+		bc, request, err := tbs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, tbs.appConfig)
+		if err != nil {
+			responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Error: fmt.Errorf("failed reading parameters from request:%w", err)}
+			close(responseChannel)
+			return
+		}
+
+		tr, err := modelTranscription(request.File, request.Language, tbs.ml, bc, tbs.appConfig)
+		if err != nil {
+			responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Error: err}
+			close(responseChannel)
+			return
+		}
+		responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Value: tr}
+		close(responseChannel)
+	}(request)
+	return responseChannel
+}
+
+func modelTranscription(audio, language string, ml *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {

 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(model.WhisperBackend),
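A consumer sketch for the transcription service above, following the same single-result channel pattern as the other refactored services (model name and file path are placeholders; that TranscriptionResult exposes the transcribed text directly is an assumption). Imports as in the embeddings sketch earlier:

func transcribeFile(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) error {
    tbs := backend.NewTranscriptionBackendService(ml, bcl, appConfig)

    req := &schema.OpenAIRequest{}
    req.Model = "whisper-base" // hypothetical whisper model name
    req.File = "/tmp/recording.wav"

    res := <-tbs.Transcribe(req)
    if res.Error != nil {
        return res.Error
    }
    fmt.Println(res.Value.Text) // assumes the result carries the full text
    return nil
}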
@@ -7,29 +7,60 @@ import (
 	"path/filepath"

 	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/go-skynet/LocalAI/core/schema"

+	"github.com/go-skynet/LocalAI/pkg/concurrency"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 )

-func generateUniqueFileName(dir, baseName, ext string) string {
-	counter := 1
-	fileName := baseName + ext
+type TextToSpeechBackendService struct {
+	ml        *model.ModelLoader
+	bcl       *config.BackendConfigLoader
+	appConfig *config.ApplicationConfig
+}

-	for {
-		filePath := filepath.Join(dir, fileName)
-		_, err := os.Stat(filePath)
-		if os.IsNotExist(err) {
-			return fileName
-		}
-
-		counter++
-		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
+func NewTextToSpeechBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *TextToSpeechBackendService {
+	return &TextToSpeechBackendService{
+		ml:        ml,
+		bcl:       bcl,
+		appConfig: appConfig,
 	}
 }

-func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
+func (ttsbs *TextToSpeechBackendService) TextToAudioFile(request *schema.TTSRequest) <-chan concurrency.ErrorOr[*string] {
+	responseChannel := make(chan concurrency.ErrorOr[*string])
+	go func(request *schema.TTSRequest) {
+		cfg, err := ttsbs.bcl.LoadBackendConfigFileByName(request.Model, ttsbs.appConfig.ModelPath,
+			config.LoadOptionDebug(ttsbs.appConfig.Debug),
+			config.LoadOptionThreads(ttsbs.appConfig.Threads),
+			config.LoadOptionContextSize(ttsbs.appConfig.ContextSize),
+			config.LoadOptionF16(ttsbs.appConfig.F16),
+		)
+		if err != nil {
+			responseChannel <- concurrency.ErrorOr[*string]{Error: err}
+			close(responseChannel)
+			return
+		}
+
+		if request.Backend != "" {
+			cfg.Backend = request.Backend
+		}
+
+		outFile, _, err := modelTTS(cfg.Backend, request.Input, cfg.Model, request.Voice, ttsbs.ml, ttsbs.appConfig, cfg)
+		if err != nil {
+			responseChannel <- concurrency.ErrorOr[*string]{Error: err}
+			close(responseChannel)
+			return
+		}
+		responseChannel <- concurrency.ErrorOr[*string]{Value: &outFile}
+		close(responseChannel)
+	}(request)
+	return responseChannel
+}
+
+func modelTTS(backend, text, modelFile string, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig *config.BackendConfig) (string, *proto.Result, error) {
 	bb := backend
 	if bb == "" {
 		bb = model.PiperBackend
@@ -37,7 +68,7 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,

 	grpcOpts := gRPCModelOpts(backendConfig)

-	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
+	opts := modelOpts(&config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(bb),
 		model.WithModel(modelFile),
 		model.WithContext(appConfig.Context),
@@ -87,3 +118,19 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,

 	return filePath, res, err
 }
+
+func generateUniqueFileName(dir, baseName, ext string) string {
+	counter := 1
+	fileName := baseName + ext
+
+	for {
+		filePath := filepath.Join(dir, fileName)
+		_, err := os.Stat(filePath)
+		if os.IsNotExist(err) {
+			return fileName
+		}
+
+		counter++
+		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
+	}
+}
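A sketch of consuming the TTS service above: TextToAudioFile yields the path of the generated audio file through the usual single-result channel. The model name is a placeholder; imports as in the embeddings sketch earlier:

func speak(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) error {
    ttsbs := backend.NewTextToSpeechBackendService(ml, bcl, appConfig)

    res := <-ttsbs.TextToAudioFile(&schema.TTSRequest{
        Model: "en-us-voice.onnx", // hypothetical piper voice model
        Input: "Hello from LocalAI",
    })
    if res.Error != nil {
        return res.Error
    }
    fmt.Println("audio written to", *res.Value)
    return nil
}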
core/cli/cli.go (new file, 20 lines)
@@ -0,0 +1,20 @@
+package cli
+
+import "embed"
+
+type Context struct {
+	Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
+	LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug" help:"Set the level of logs to output [${enum}]"`
+
+	// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
+	BackendAssets embed.FS `kong:"-"`
+}
+
+var CLI struct {
+	Context `embed:""`
+
+	Run        RunCMD        `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
+	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
+	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
+}
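This struct is a kong grammar: subcommands are fields with a cmd tag, and flags come from env/default/help tags. A minimal sketch of how such a grammar is typically wired up in main (LocalAI's real main may pass different kong options; the interpolation values below are placeholders):

package main

import (
    "github.com/alecthomas/kong"

    "github.com/go-skynet/LocalAI/core/cli"
)

func main() {
    ctx := kong.Parse(&cli.CLI,
        kong.Description("LocalAI"),
        // The struct tags interpolate ${basepath} and ${remoteLibraryURL};
        // kong requires values for every variable referenced in tags.
        kong.Vars{
            "basepath":         ".",
            "remoteLibraryURL": "https://example.com/model_library.yaml", // hypothetical
        },
    )
    // Dispatch to the selected subcommand's Run(ctx *cli.Context) method.
    err := ctx.Run(&cli.CLI.Context)
    ctx.FatalIfErrorf(err)
}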
core/cli/models.go (new file, 74 lines)
@@ -0,0 +1,74 @@
+package cli
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/rs/zerolog/log"
+	"github.com/schollz/progressbar/v3"
+)
+
+type ModelsCMDFlags struct {
+	Galleries  string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models"`
+	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
+}
+
+type ModelsList struct {
+	ModelsCMDFlags `embed:""`
+}
+
+type ModelsInstall struct {
+	ModelArgs []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
+
+	ModelsCMDFlags `embed:""`
+}
+
+type ModelsCMD struct {
+	List    ModelsList    `cmd:"" help:"List the models avaiable in your galleries" default:"withargs"`
+	Install ModelsInstall `cmd:"" help:"Install a model from the gallery"`
+}
+
+func (ml *ModelsList) Run(ctx *Context) error {
+	var galleries []gallery.Gallery
+	if err := json.Unmarshal([]byte(ml.Galleries), &galleries); err != nil {
+		log.Error().Err(err).Msg("unable to load galleries")
+	}
+
+	models, err := gallery.AvailableGalleryModels(galleries, ml.ModelsPath)
+	if err != nil {
+		return err
+	}
+	for _, model := range models {
+		if model.Installed {
+			fmt.Printf(" * %s@%s (installed)\n", model.Gallery.Name, model.Name)
+		} else {
+			fmt.Printf(" - %s@%s\n", model.Gallery.Name, model.Name)
+		}
+	}
+	return nil
+}
+
+func (mi *ModelsInstall) Run(ctx *Context) error {
+	modelName := mi.ModelArgs[0]
+
+	var galleries []gallery.Gallery
+	if err := json.Unmarshal([]byte(mi.Galleries), &galleries); err != nil {
+		log.Error().Err(err).Msg("unable to load galleries")
+	}
+
+	progressBar := progressbar.NewOptions(
+		1000,
+		progressbar.OptionSetDescription(fmt.Sprintf("downloading model %s", modelName)),
+		progressbar.OptionShowBytes(false),
+		progressbar.OptionClearOnFinish(),
+	)
+	progressCallback := func(fileName string, current string, total string, percentage float64) {
+		progressBar.Set(int(percentage * 10))
+	}
+	err := gallery.InstallModelFromGallery(galleries, modelName, mi.ModelsPath, gallery.GalleryModel{}, progressCallback)
+	if err != nil {
+		return err
+	}
+	return nil
+}
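With these subcommands wired into the CLI struct above, gallery management becomes, for example, `local-ai models list` to print installed and available models, or `local-ai models install model-gallery@bert-embeddings` (gallery and model names here are illustrative). The install path drives the progress bar through the percentage callback passed to InstallModelFromGallery.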
155
core/cli/run.go
Normal file
155
core/cli/run.go
Normal file
@@ -0,0 +1,155 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/http"
|
||||
"github.com/go-skynet/LocalAI/core/startup"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
type RunCMD struct {
|
||||
ModelArgs []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
|
||||
|
||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
|
||||
ImagePath string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
|
||||
AudioPath string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
|
||||
UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
|
||||
ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
|
||||
LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
|
||||
// The alias on this option is there to preserve functionality with the old `--config-file` parameter
|
||||
ModelsConfigFile string `env:"LOCALAI_MODELS_CONFIG_FILE,CONFIG_FILE" aliases:"config-file" help:"YAML file containing a list of model backend configs" group:"storage"`
|
||||
|
||||
Galleries string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models"`
|
||||
AutoloadGalleries bool `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
|
||||
RemoteLibrary string `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
|
||||
PreloadModels string `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
|
||||
Models []string `env:"LOCALAI_MODELS,MODELS" help:"A List of model configuration URLs to load" group:"models"`
|
||||
PreloadModelsConfig string `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`
|
||||
|
||||
F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
|
||||
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" default:"4" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
|
||||
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
|
||||
|
||||
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
|
||||
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
|
||||
CORSAllowOrigins string `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
|
||||
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
|
||||
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
|
||||
DisableWelcome bool `env:"LOCALAI_DISABLE_WELCOME,DISABLE_WELCOME" default:"false" help:"Disable welcome pages" group:"api"`
|
||||
|
||||
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
|
||||
SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
|
||||
PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
|
||||
ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
|
||||
EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
|
||||
WatchdogIdleTimeout string `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
|
||||
EnableWatchdogBusy bool `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
|
||||
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
|
||||
}
|
||||
|
||||
func (r *RunCMD) Run(ctx *Context) error {
    opts := []config.AppOption{
        config.WithConfigFile(r.ModelsConfigFile),
        config.WithJSONStringPreload(r.PreloadModels),
        config.WithYAMLConfigPreload(r.PreloadModelsConfig),
        config.WithModelPath(r.ModelsPath),
        config.WithContextSize(r.ContextSize),
        config.WithDebug(*ctx.LogLevel == "debug"),
        config.WithImageDir(r.ImagePath),
        config.WithAudioDir(r.AudioPath),
        config.WithUploadDir(r.UploadPath),
        config.WithConfigsDir(r.ConfigPath),
        config.WithF16(r.F16),
        config.WithStringGalleries(r.Galleries),
        config.WithModelLibraryURL(r.RemoteLibrary),
        config.WithDisableMessage(false),
        config.WithCors(r.CORS),
        config.WithCorsAllowOrigins(r.CORSAllowOrigins),
        config.WithThreads(r.Threads),
        config.WithBackendAssets(ctx.BackendAssets),
        config.WithBackendAssetsOutput(r.BackendAssetsPath),
        config.WithUploadLimitMB(r.UploadLimit),
        config.WithApiKeys(r.APIKeys),
        config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
    }

    idleWatchDog := r.EnableWatchdogIdle
    busyWatchDog := r.EnableWatchdogBusy

    if r.DisableWelcome {
        opts = append(opts, config.DisableWelcomePage)
    }

    if idleWatchDog || busyWatchDog {
        opts = append(opts, config.EnableWatchDog)
        if idleWatchDog {
            opts = append(opts, config.EnableWatchDogIdleCheck)
            dur, err := time.ParseDuration(r.WatchdogIdleTimeout)
            if err != nil {
                return err
            }
            opts = append(opts, config.SetWatchDogIdleTimeout(dur))
        }
        if busyWatchDog {
            opts = append(opts, config.EnableWatchDogBusyCheck)
            dur, err := time.ParseDuration(r.WatchdogBusyTimeout)
            if err != nil {
                return err
            }
            opts = append(opts, config.SetWatchDogBusyTimeout(dur))
        }
    }
    if r.ParallelRequests {
        opts = append(opts, config.EnableParallelBackendRequests)
    }
    if r.SingleActiveBackend {
        opts = append(opts, config.EnableSingleBackend)
    }

    // Split each entry on the first ":" to get the backend name and the URI
    for _, v := range r.ExternalGRPCBackends {
        backend := v[:strings.IndexByte(v, ':')]
        uri := v[strings.IndexByte(v, ':')+1:]
        opts = append(opts, config.WithExternalBackend(backend, uri))
    }

    if r.AutoloadGalleries {
        opts = append(opts, config.EnableGalleriesAutoload)
    }

    if r.PreloadBackendOnly {
        _, err := startup.Startup(opts...)
        return err
    }

    application, err := startup.Startup(opts...)

    if err != nil {
        return fmt.Errorf("failed basic startup tasks with error %s", err.Error())
    }

    // Watch the configuration directory
    // If the directory does not exist, we don't watch it
    if _, err := os.Stat(r.LocalaiConfigDir); err == nil {
        closeConfigWatcherFn, err := startup.WatchConfigDirectory(r.LocalaiConfigDir, application.ApplicationConfig)
        defer closeConfigWatcherFn()

        if err != nil {
            return fmt.Errorf("failed while watching configuration directory %s", r.LocalaiConfigDir)
        }
    }

    appHTTP, err := http.App(application)
    if err != nil {
        log.Error().Err(err).Msg("error during HTTP App construction")
        return err
    }

    return appHTTP.Listen(r.Address)
}
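Each entry in ExternalGRPCBackends is expected in "name:uri" form; the loop above takes everything before the first colon as the backend name. A minimal, self-contained sketch of that convention (the backend name and address below are illustrative, not taken from the diff):

package main

import (
    "fmt"
    "strings"
)

func main() {
    // Hypothetical flag value: a backend name followed by its gRPC address.
    v := "huggingface:localhost:50051"

    // Same split as in RunCMD.Run above. Note that a value without any ':'
    // would make IndexByte return -1 and the slice expression panic, so a
    // colon is assumed to always be present.
    backend := v[:strings.IndexByte(v, ':')]
    uri := v[strings.IndexByte(v, ':')+1:]

    fmt.Println(backend) // huggingface
    fmt.Println(uri)     // localhost:50051
}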
65 core/cli/transcript.go Normal file
@@ -0,0 +1,65 @@
package cli

import (
    "context"
    "errors"
    "fmt"

    "github.com/go-skynet/LocalAI/core/backend"
    "github.com/go-skynet/LocalAI/core/config"
    "github.com/go-skynet/LocalAI/core/schema"
    "github.com/go-skynet/LocalAI/pkg/model"
)

type TranscriptCMD struct {
    Filename string `arg:""`

    Backend string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
    Model string `short:"m" required:"" help:"Model name to run the transcription"`
    Language string `short:"l" help:"Language of the audio file"`
    Threads int `short:"t" default:"1" help:"Number of threads used for parallel computation"`
    ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
    BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
}

func (t *TranscriptCMD) Run(ctx *Context) error {
    opts := &config.ApplicationConfig{
        ModelPath:         t.ModelsPath,
        Context:           context.Background(),
        AssetsDestination: t.BackendAssetsPath,
    }

    cl := config.NewBackendConfigLoader()
    ml := model.NewModelLoader(opts.ModelPath)
    if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
        return err
    }

    c, exists := cl.GetBackendConfig(t.Model)
    if !exists {
        return errors.New("model not found")
    }

    c.Threads = &t.Threads

    defer ml.StopAllGRPC()

    tbs := backend.NewTranscriptionBackendService(ml, cl, opts)

    resultChannel := tbs.Transcribe(&schema.OpenAIRequest{
        PredictionOptions: schema.PredictionOptions{
            Language: t.Language,
        },
        File: t.Filename,
    })

    r := <-resultChannel

    if r.Error != nil {
        return r.Error
    }
    for _, segment := range r.Value.Segments {
        fmt.Println(segment.Start.String(), "-", segment.Text)
    }
    return nil
}
71 core/cli/tts.go Normal file
@@ -0,0 +1,71 @@
package cli

import (
    "context"
    "fmt"
    "os"
    "path/filepath"
    "strings"

    "github.com/go-skynet/LocalAI/core/backend"
    "github.com/go-skynet/LocalAI/core/config"
    "github.com/go-skynet/LocalAI/core/schema"
    "github.com/go-skynet/LocalAI/pkg/model"
)

type TTSCMD struct {
    Text []string `arg:""`

    Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
    Model string `short:"m" required:"" help:"Model name to run the TTS"`
    Voice string `short:"v" help:"Voice name to run the TTS"`
    OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
    ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
    BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
}

func (t *TTSCMD) Run(ctx *Context) error {
    outputFile := t.OutputFile
    outputDir := t.BackendAssetsPath
    if outputFile != "" {
        outputDir = filepath.Dir(outputFile)
    }

    text := strings.Join(t.Text, " ")

    opts := &config.ApplicationConfig{
        ModelPath:         t.ModelsPath,
        Context:           context.Background(),
        AudioDir:          outputDir,
        AssetsDestination: t.BackendAssetsPath,
    }
    ml := model.NewModelLoader(opts.ModelPath)

    defer ml.StopAllGRPC()

    ttsbs := backend.NewTextToSpeechBackendService(ml, config.NewBackendConfigLoader(), opts)

    request := &schema.TTSRequest{
        Model:   t.Model,
        Input:   text,
        Backend: t.Backend,
        Voice:   t.Voice,
    }

    resultsChannel := ttsbs.TextToAudioFile(request)

    rawResult := <-resultsChannel

    if rawResult.Error != nil {
        return rawResult.Error
    }
    if outputFile != "" {
        if err := os.Rename(*rawResult.Value, outputFile); err != nil {
            return err
        }
        fmt.Printf("Generated file %q\n", outputFile)
    } else {
        fmt.Printf("Generated file %q\n", *rawResult.Value)
    }
    return nil
}
151 core/clients/store.go Normal file
@@ -0,0 +1,151 @@
package clients

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io"
    "net/http"
)

// Define a struct to hold the store API client
type StoreClient struct {
    BaseURL string
    Client  *http.Client
}

type SetRequest struct {
    Keys   [][]float32 `json:"keys"`
    Values []string    `json:"values"`
}

type GetRequest struct {
    Keys [][]float32 `json:"keys"`
}

type GetResponse struct {
    Keys   [][]float32 `json:"keys"`
    Values []string    `json:"values"`
}

type DeleteRequest struct {
    Keys [][]float32 `json:"keys"`
}

type FindRequest struct {
    TopK int       `json:"topk"`
    Key  []float32 `json:"key"`
}

type FindResponse struct {
    Keys         [][]float32 `json:"keys"`
    Values       []string    `json:"values"`
    Similarities []float32   `json:"similarities"`
}

// Constructor for StoreClient
func NewStoreClient(baseUrl string) *StoreClient {
    return &StoreClient{
        BaseURL: baseUrl,
        Client:  &http.Client{},
    }
}

// Implement Set method
func (c *StoreClient) Set(req SetRequest) error {
    return c.doRequest("stores/set", req)
}

// Implement Get method
func (c *StoreClient) Get(req GetRequest) (*GetResponse, error) {
    body, err := c.doRequestWithResponse("stores/get", req)
    if err != nil {
        return nil, err
    }

    var resp GetResponse
    err = json.Unmarshal(body, &resp)
    if err != nil {
        return nil, err
    }

    return &resp, nil
}

// Implement Delete method
func (c *StoreClient) Delete(req DeleteRequest) error {
    return c.doRequest("stores/delete", req)
}

// Implement Find method
func (c *StoreClient) Find(req FindRequest) (*FindResponse, error) {
    body, err := c.doRequestWithResponse("stores/find", req)
    if err != nil {
        return nil, err
    }

    var resp FindResponse
    err = json.Unmarshal(body, &resp)
    if err != nil {
        return nil, err
    }

    return &resp, nil
}

// Helper function to perform a request without expecting a response body
func (c *StoreClient) doRequest(path string, data interface{}) error {
    jsonData, err := json.Marshal(data)
    if err != nil {
        return err
    }

    req, err := http.NewRequest("POST", c.BaseURL+"/"+path, bytes.NewBuffer(jsonData))
    if err != nil {
        return err
    }
    req.Header.Set("Content-Type", "application/json")

    resp, err := c.Client.Do(req)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("API request to %s failed with status code %d", path, resp.StatusCode)
    }

    return nil
}

// Helper function to perform a request and parse the response body
func (c *StoreClient) doRequestWithResponse(path string, data interface{}) ([]byte, error) {
    jsonData, err := json.Marshal(data)
    if err != nil {
        return nil, err
    }

    req, err := http.NewRequest("POST", c.BaseURL+"/"+path, bytes.NewBuffer(jsonData))
    if err != nil {
        return nil, err
    }
    req.Header.Set("Content-Type", "application/json")

    resp, err := c.Client.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil, fmt.Errorf("API request to %s failed with status code %d", path, resp.StatusCode)
    }

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, err
    }

    return body, nil
}
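A minimal usage sketch of this client; the base URL and the vectors are placeholder values, and a running LocalAI instance exposing the stores endpoints is assumed:

package main

import (
    "fmt"

    "github.com/go-skynet/LocalAI/core/clients"
)

func main() {
    // Hypothetical base URL; point this at wherever LocalAI listens.
    c := clients.NewStoreClient("http://localhost:8080")

    // Store two vector keys with their string values.
    if err := c.Set(clients.SetRequest{
        Keys:   [][]float32{{0.1, 0.2}, {0.9, 0.8}},
        Values: []string{"first", "second"},
    }); err != nil {
        panic(err)
    }

    // Retrieve the single closest key to a query vector.
    resp, err := c.Find(clients.FindRequest{TopK: 1, Key: []float32{0.1, 0.2}})
    if err != nil {
        panic(err)
    }
    fmt.Println(resp.Values, resp.Similarities)
}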
@@ -1,22 +1,7 @@
package config

import (
    "errors"
    "fmt"
    "io/fs"
    "os"
    "path/filepath"
    "sort"
    "strings"
    "sync"

    "github.com/go-skynet/LocalAI/core/schema"
    "github.com/go-skynet/LocalAI/pkg/downloader"
    "github.com/go-skynet/LocalAI/pkg/utils"
    "github.com/rs/zerolog/log"
    "gopkg.in/yaml.v3"

    "github.com/charmbracelet/glamour"
)

const (
@@ -165,11 +150,12 @@ type Functions struct {
}

type TemplateConfig struct {
    Chat string `yaml:"chat"`
    ChatMessage string `yaml:"chat_message"`
    Completion string `yaml:"completion"`
    Edit string `yaml:"edit"`
    Functions string `yaml:"function"`
    Chat string `yaml:"chat"`
    ChatMessage string `yaml:"chat_message"`
    Completion string `yaml:"completion"`
    Edit string `yaml:"edit"`
    Functions string `yaml:"function"`
    UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
}

func (c *BackendConfig) SetFunctionCallString(s string) {
@@ -198,7 +184,7 @@ func (c *BackendConfig) FunctionToCall() string {
}

func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
    lo := &LoadOptions{}
    lo := &ConfigLoaderOptions{}
    lo.Apply(opts...)

    ctx := lo.ctxSize
@@ -311,287 +297,3 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
        cfg.Debug = &trueV
    }
}

////// Config Loader ////////

type BackendConfigLoader struct {
    configs map[string]BackendConfig
    sync.Mutex
}

type LoadOptions struct {
    debug bool
    threads, ctxSize int
    f16 bool
}

func LoadOptionDebug(debug bool) ConfigLoaderOption {
    return func(o *LoadOptions) {
        o.debug = debug
    }
}

func LoadOptionThreads(threads int) ConfigLoaderOption {
    return func(o *LoadOptions) {
        o.threads = threads
    }
}

func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
    return func(o *LoadOptions) {
        o.ctxSize = ctxSize
    }
}

func LoadOptionF16(f16 bool) ConfigLoaderOption {
    return func(o *LoadOptions) {
        o.f16 = f16
    }
}

type ConfigLoaderOption func(*LoadOptions)

func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
    for _, l := range options {
        l(lo)
    }
}

// Load a config file for a model
func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {

    // Load a config file if present after the model name
    cfg := &BackendConfig{
        PredictionOptions: schema.PredictionOptions{
            Model: modelName,
        },
    }

    cfgExisting, exists := cl.GetBackendConfig(modelName)
    if exists {
        cfg = &cfgExisting
    } else {
        // Try loading a model config file
        modelConfig := filepath.Join(modelPath, modelName+".yaml")
        if _, err := os.Stat(modelConfig); err == nil {
            if err := cl.LoadBackendConfig(
                modelConfig, opts...,
            ); err != nil {
                return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
            }
            cfgExisting, exists = cl.GetBackendConfig(modelName)
            if exists {
                cfg = &cfgExisting
            }
        }
    }

    cfg.SetDefaults(opts...)

    return cfg, nil
}

func NewBackendConfigLoader() *BackendConfigLoader {
    return &BackendConfigLoader{
        configs: make(map[string]BackendConfig),
    }
}
func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
    c := &[]*BackendConfig{}
    f, err := os.ReadFile(file)
    if err != nil {
        return nil, fmt.Errorf("cannot read config file: %w", err)
    }
    if err := yaml.Unmarshal(f, c); err != nil {
        return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
    }

    for _, cc := range *c {
        cc.SetDefaults(opts...)
    }

    return *c, nil
}

func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
    lo := &LoadOptions{}
    lo.Apply(opts...)

    c := &BackendConfig{}
    f, err := os.ReadFile(file)
    if err != nil {
        return nil, fmt.Errorf("cannot read config file: %w", err)
    }
    if err := yaml.Unmarshal(f, c); err != nil {
        return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
    }

    c.SetDefaults(opts...)
    return c, nil
}

func (cm *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
    cm.Lock()
    defer cm.Unlock()
    c, err := ReadBackendConfigFile(file, opts...)
    if err != nil {
        return fmt.Errorf("cannot load config file: %w", err)
    }

    for _, cc := range c {
        cm.configs[cc.Name] = *cc
    }
    return nil
}

func (cl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
    cl.Lock()
    defer cl.Unlock()
    c, err := ReadBackendConfig(file, opts...)
    if err != nil {
        return fmt.Errorf("cannot read config file: %w", err)
    }

    cl.configs[c.Name] = *c
    return nil
}

func (cl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
    cl.Lock()
    defer cl.Unlock()
    v, exists := cl.configs[m]
    return v, exists
}

func (cl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
    cl.Lock()
    defer cl.Unlock()
    var res []BackendConfig
    for _, v := range cl.configs {
        res = append(res, v)
    }

    sort.SliceStable(res, func(i, j int) bool {
        return res[i].Name < res[j].Name
    })

    return res
}

func (cl *BackendConfigLoader) ListBackendConfigs() []string {
    cl.Lock()
    defer cl.Unlock()
    var res []string
    for k := range cl.configs {
        res = append(res, k)
    }
    return res
}

// Preload prepare models if they are not local but url or huggingface repositories
func (cl *BackendConfigLoader) Preload(modelPath string) error {
    cl.Lock()
    defer cl.Unlock()

    status := func(fileName, current, total string, percent float64) {
        utils.DisplayDownloadFunction(fileName, current, total, percent)
    }

    log.Info().Msgf("Preloading models from %s", modelPath)

    renderMode := "dark"
    if os.Getenv("COLOR") != "" {
        renderMode = os.Getenv("COLOR")
    }

    glamText := func(t string) {
        out, err := glamour.Render(t, renderMode)
        if err == nil && os.Getenv("NO_COLOR") == "" {
            fmt.Println(out)
        } else {
            fmt.Println(t)
        }
    }

    for i, config := range cl.configs {

        // Download files and verify their SHA
        for _, file := range config.DownloadFiles {
            log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)

            if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
                return err
            }
            // Create file path
            filePath := filepath.Join(modelPath, file.Filename)

            if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
                return err
            }
        }

        modelURL := config.PredictionOptions.Model
        modelURL = downloader.ConvertURL(modelURL)

        if downloader.LooksLikeURL(modelURL) {
            // md5 of model name
            md5Name := utils.MD5(modelURL)

            // check if file exists
            if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
                err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
                if err != nil {
                    return err
                }
            }

            cc := cl.configs[i]
            c := &cc
            c.PredictionOptions.Model = md5Name
            cl.configs[i] = *c
        }
        if cl.configs[i].Name != "" {
            glamText(fmt.Sprintf("**Model name**: _%s_", cl.configs[i].Name))
        }
        if cl.configs[i].Description != "" {
            //glamText("**Description**")
            glamText(cl.configs[i].Description)
        }
        if cl.configs[i].Usage != "" {
            //glamText("**Usage**")
            glamText(cl.configs[i].Usage)
        }
    }
    return nil
}

// LoadBackendConfigsFromPath reads all the configurations of the models from a path
// (non-recursive)
func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
    cm.Lock()
    defer cm.Unlock()
    entries, err := os.ReadDir(path)
    if err != nil {
        return err
    }
    files := make([]fs.FileInfo, 0, len(entries))
    for _, entry := range entries {
        info, err := entry.Info()
        if err != nil {
            return err
        }
        files = append(files, info)
    }
    for _, file := range files {
        // Skip templates, YAML and .keep files
        if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
            continue
        }
        c, err := ReadBackendConfig(filepath.Join(path, file.Name()), opts...)
        if err == nil {
            cm.configs[c.Name] = *c
        }
    }

    return nil
}
509 core/config/backend_config_loader.go Normal file
@@ -0,0 +1,509 @@
package config

import (
    "encoding/json"
    "errors"
    "fmt"
    "io/fs"
    "os"
    "path/filepath"
    "sort"
    "strings"
    "sync"

    "github.com/charmbracelet/glamour"
    "github.com/go-skynet/LocalAI/core/schema"
    "github.com/go-skynet/LocalAI/pkg/downloader"
    "github.com/go-skynet/LocalAI/pkg/grammar"
    "github.com/go-skynet/LocalAI/pkg/utils"
    "github.com/rs/zerolog/log"
    "gopkg.in/yaml.v2"
)

type BackendConfigLoader struct {
    configs map[string]BackendConfig
    sync.Mutex
}

type ConfigLoaderOptions struct {
    debug bool
    threads, ctxSize int
    f16 bool
}

func LoadOptionDebug(debug bool) ConfigLoaderOption {
    return func(o *ConfigLoaderOptions) {
        o.debug = debug
    }
}

func LoadOptionThreads(threads int) ConfigLoaderOption {
    return func(o *ConfigLoaderOptions) {
        o.threads = threads
    }
}

func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
    return func(o *ConfigLoaderOptions) {
        o.ctxSize = ctxSize
    }
}

func LoadOptionF16(f16 bool) ConfigLoaderOption {
    return func(o *ConfigLoaderOptions) {
        o.f16 = f16
    }
}

type ConfigLoaderOption func(*ConfigLoaderOptions)

func (lo *ConfigLoaderOptions) Apply(options ...ConfigLoaderOption) {
    for _, l := range options {
        l(lo)
    }
}
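These loader options follow the functional-options pattern and are consumed through Apply. A short sketch of composing them with the loader constructed in the rest of this file (the YAML path is a placeholder):

// Sketch only: NewBackendConfigLoader and LoadBackendConfig are defined just below.
cl := NewBackendConfigLoader()
if err := cl.LoadBackendConfig("/models/my-model.yaml", // hypothetical config path
    LoadOptionDebug(true),
    LoadOptionThreads(4),
    LoadOptionContextSize(1024),
    LoadOptionF16(false),
); err != nil {
    log.Error().Err(err).Msg("loading model config")
}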
func NewBackendConfigLoader() *BackendConfigLoader {
    return &BackendConfigLoader{
        configs: make(map[string]BackendConfig),
    }
}

func (bcl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
    bcl.Lock()
    defer bcl.Unlock()
    c, err := readBackendConfig(file, opts...)
    if err != nil {
        return fmt.Errorf("cannot read config file: %w", err)
    }

    bcl.configs[c.Name] = *c
    return nil
}

func (bcl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
    bcl.Lock()
    defer bcl.Unlock()
    v, exists := bcl.configs[m]
    return v, exists
}

func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
    bcl.Lock()
    defer bcl.Unlock()
    var res []BackendConfig
    for _, v := range bcl.configs {
        res = append(res, v)
    }
    sort.SliceStable(res, func(i, j int) bool {
        return res[i].Name < res[j].Name
    })
    return res
}

func (bcl *BackendConfigLoader) ListBackendConfigs() []string {
    bcl.Lock()
    defer bcl.Unlock()
    var res []string
    for k := range bcl.configs {
        res = append(res, k)
    }
    return res
}

// Preload prepares models that are not local, e.g. URLs or Hugging Face repositories
func (bcl *BackendConfigLoader) Preload(modelPath string) error {
    bcl.Lock()
    defer bcl.Unlock()

    status := func(fileName, current, total string, percent float64) {
        utils.DisplayDownloadFunction(fileName, current, total, percent)
    }

    log.Info().Msgf("Preloading models from %s", modelPath)

    renderMode := "dark"
    if os.Getenv("COLOR") != "" {
        renderMode = os.Getenv("COLOR")
    }

    glamText := func(t string) {
        out, err := glamour.Render(t, renderMode)
        if err == nil && os.Getenv("NO_COLOR") == "" {
            fmt.Println(out)
        } else {
            fmt.Println(t)
        }
    }

    for i, config := range bcl.configs {

        // Download files and verify their SHA
        for _, file := range config.DownloadFiles {
            log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)

            if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
                return err
            }
            // Create file path
            filePath := filepath.Join(modelPath, file.Filename)

            if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
                return err
            }
        }

        modelURL := config.PredictionOptions.Model
        modelURL = downloader.ConvertURL(modelURL)

        if downloader.LooksLikeURL(modelURL) {
            // md5 of model name
            md5Name := utils.MD5(modelURL)

            // check if file exists
            if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
                err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
                if err != nil {
                    return err
                }
            }

            cc := bcl.configs[i]
            c := &cc
            c.PredictionOptions.Model = md5Name
            bcl.configs[i] = *c
        }
        if bcl.configs[i].Name != "" {
            glamText(fmt.Sprintf("**Model name**: _%s_", bcl.configs[i].Name))
        }
        if bcl.configs[i].Description != "" {
            //glamText("**Description**")
            glamText(bcl.configs[i].Description)
        }
        if bcl.configs[i].Usage != "" {
            //glamText("**Usage**")
            glamText(bcl.configs[i].Usage)
        }
    }
    return nil
}

func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
    bcl.Lock()
    defer bcl.Unlock()
    entries, err := os.ReadDir(path)
    if err != nil {
        return err
    }
    files := make([]fs.FileInfo, 0, len(entries))
    for _, entry := range entries {
        info, err := entry.Info()
        if err != nil {
            return err
        }
        files = append(files, info)
    }
    for _, file := range files {
        // Skip anything that is not a YAML config file (templates, .keep files, etc.)
        if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
            continue
        }
        c, err := readBackendConfig(filepath.Join(path, file.Name()), opts...)
        if err == nil {
            bcl.configs[c.Name] = *c
        }
    }

    return nil
}

func (bcl *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
    bcl.Lock()
    defer bcl.Unlock()
    c, err := readBackendConfigFile(file, opts...)
    if err != nil {
        return fmt.Errorf("cannot load config file: %w", err)
    }

    for _, cc := range c {
        bcl.configs[cc.Name] = *cc
    }
    return nil
}

//////////

// Load a config file for a model
func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName string, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {

    // Load a config file if present after the model name
    cfg := &BackendConfig{
        PredictionOptions: schema.PredictionOptions{
            Model: modelName,
        },
    }

    cfgExisting, exists := bcl.GetBackendConfig(modelName)
    if exists {
        cfg = &cfgExisting
    } else {
        // Load a config file if present after the model name
        modelConfig := filepath.Join(modelPath, modelName+".yaml")
        if _, err := os.Stat(modelConfig); err == nil {
            if err := bcl.LoadBackendConfig(modelConfig); err != nil {
                return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
            }
            cfgExisting, exists = bcl.GetBackendConfig(modelName)
            if exists {
                cfg = &cfgExisting
            }
        }
    }

    cfg.SetDefaults(opts...)
    return cfg, nil
}

func readBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
    c := &[]*BackendConfig{}
    f, err := os.ReadFile(file)
    if err != nil {
        return nil, fmt.Errorf("cannot read config file: %w", err)
    }
    if err := yaml.Unmarshal(f, c); err != nil {
        return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
    }

    for _, cc := range *c {
        cc.SetDefaults(opts...)
    }

    return *c, nil
}

func readBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
    c := &BackendConfig{}
    f, err := os.ReadFile(file)
    if err != nil {
        return nil, fmt.Errorf("cannot read config file: %w", err)
    }
    if err := yaml.Unmarshal(f, c); err != nil {
        return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
    }

    c.SetDefaults(opts...)
    return c, nil
}

func (bcl *BackendConfigLoader) LoadBackendConfigForModelAndOpenAIRequest(modelFile string, input *schema.OpenAIRequest, appConfig *ApplicationConfig) (*BackendConfig, *schema.OpenAIRequest, error) {
    cfg, err := bcl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
        LoadOptionContextSize(appConfig.ContextSize),
        LoadOptionDebug(appConfig.Debug),
        LoadOptionF16(appConfig.F16),
        LoadOptionThreads(appConfig.Threads),
    )

    // Set the parameters for the language model prediction
    updateBackendConfigFromOpenAIRequest(cfg, input)

    return cfg, input, err
}

func updateBackendConfigFromOpenAIRequest(bc *BackendConfig, request *schema.OpenAIRequest) {
    if request.Echo {
        bc.Echo = request.Echo
    }
    if request.TopK != nil && *request.TopK != 0 {
        bc.TopK = request.TopK
    }
    if request.TopP != nil && *request.TopP != 0 {
        bc.TopP = request.TopP
    }

    if request.Backend != "" {
        bc.Backend = request.Backend
    }

    if request.ClipSkip != 0 {
        bc.Diffusers.ClipSkip = request.ClipSkip
    }

    if request.ModelBaseName != "" {
        bc.AutoGPTQ.ModelBaseName = request.ModelBaseName
    }

    if request.NegativePromptScale != 0 {
        bc.NegativePromptScale = request.NegativePromptScale
    }

    if request.UseFastTokenizer {
        bc.UseFastTokenizer = request.UseFastTokenizer
    }

    if request.NegativePrompt != "" {
        bc.NegativePrompt = request.NegativePrompt
    }

    if request.RopeFreqBase != 0 {
        bc.RopeFreqBase = request.RopeFreqBase
    }

    if request.RopeFreqScale != 0 {
        bc.RopeFreqScale = request.RopeFreqScale
    }

    if request.Grammar != "" {
        bc.Grammar = request.Grammar
    }

    if request.Temperature != nil && *request.Temperature != 0 {
        bc.Temperature = request.Temperature
    }

    if request.Maxtokens != nil && *request.Maxtokens != 0 {
        bc.Maxtokens = request.Maxtokens
    }

    switch stop := request.Stop.(type) {
    case string:
        if stop != "" {
            bc.StopWords = append(bc.StopWords, stop)
        }
    case []interface{}:
        for _, pp := range stop {
            if s, ok := pp.(string); ok {
                bc.StopWords = append(bc.StopWords, s)
            }
        }
    }

    if len(request.Tools) > 0 {
        for _, tool := range request.Tools {
            request.Functions = append(request.Functions, tool.Function)
        }
    }

    if request.ToolsChoice != nil {
        var toolChoice grammar.Tool
        switch content := request.ToolsChoice.(type) {
        case string:
            _ = json.Unmarshal([]byte(content), &toolChoice)
        case map[string]interface{}:
            dat, _ := json.Marshal(content)
            _ = json.Unmarshal(dat, &toolChoice)
        }
        request.FunctionCall = map[string]interface{}{
            "name": toolChoice.Function.Name,
        }
    }

    // Decode each request's message content
    index := 0
    for i, m := range request.Messages {
        switch content := m.Content.(type) {
        case string:
            request.Messages[i].StringContent = content
        case []interface{}:
            dat, _ := json.Marshal(content)
            c := []schema.Content{}
            json.Unmarshal(dat, &c)
            for _, pp := range c {
                if pp.Type == "text" {
                    request.Messages[i].StringContent = pp.Text
                } else if pp.Type == "image_url" {
                    // Detect if pp.ImageURL is a URL; if it is, download the image and encode it in base64:
                    base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
                    if err == nil {
                        request.Messages[i].StringImages = append(request.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
                        // set a placeholder for each image
                        request.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + request.Messages[i].StringContent
                        index++
                    } else {
                        fmt.Print("Failed encoding image", err)
                    }
                }
            }
        }
    }

    if request.RepeatPenalty != 0 {
        bc.RepeatPenalty = request.RepeatPenalty
    }

    if request.FrequencyPenalty != 0 {
        bc.FrequencyPenalty = request.FrequencyPenalty
    }

    if request.PresencePenalty != 0 {
        bc.PresencePenalty = request.PresencePenalty
    }

    if request.Keep != 0 {
        bc.Keep = request.Keep
    }

    if request.Batch != 0 {
        bc.Batch = request.Batch
    }

    if request.IgnoreEOS {
        bc.IgnoreEOS = request.IgnoreEOS
    }

    if request.Seed != nil {
        bc.Seed = request.Seed
    }

    if request.TypicalP != nil {
        bc.TypicalP = request.TypicalP
    }

    switch inputs := request.Input.(type) {
    case string:
        if inputs != "" {
            bc.InputStrings = append(bc.InputStrings, inputs)
        }
    case []interface{}:
        for _, pp := range inputs {
            switch i := pp.(type) {
            case string:
                bc.InputStrings = append(bc.InputStrings, i)
            case []interface{}:
                tokens := []int{}
                for _, ii := range i {
                    tokens = append(tokens, int(ii.(float64)))
                }
                bc.InputToken = append(bc.InputToken, tokens)
            }
        }
    }

    // Can be either a string or an object
    switch fnc := request.FunctionCall.(type) {
    case string:
        if fnc != "" {
            bc.SetFunctionCallString(fnc)
        }
    case map[string]interface{}:
        var name string
        n, exists := fnc["name"]
        if exists {
            nn, e := n.(string)
            if e {
                name = nn
            }
        }
        bc.SetFunctionCallNameString(name)
    }

    switch p := request.Prompt.(type) {
    case string:
        bc.PromptStrings = append(bc.PromptStrings, p)
    case []interface{}:
        for _, pp := range p {
            if s, ok := pp.(string); ok {
                bc.PromptStrings = append(bc.PromptStrings, s)
            }
        }
    }
}
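updateBackendConfigFromOpenAIRequest above repeatedly normalizes OpenAI request fields that can arrive as either a JSON string or an array (stop, input, prompt). A small self-contained sketch of that shape-switch, with illustrative payloads:

package main

import (
    "encoding/json"
    "fmt"
)

// stopWords mirrors the request.Stop handling above: a bare string and an
// array of strings both collapse into a []string.
func stopWords(raw []byte) []string {
    var req struct {
        Stop interface{} `json:"stop"`
    }
    _ = json.Unmarshal(raw, &req)

    var words []string
    switch stop := req.Stop.(type) {
    case string:
        if stop != "" {
            words = append(words, stop)
        }
    case []interface{}:
        for _, pp := range stop {
            if s, ok := pp.(string); ok {
                words = append(words, s)
            }
        }
    }
    return words
}

func main() {
    fmt.Println(stopWords([]byte(`{"stop": "###"}`)))          // [###]
    fmt.Println(stopWords([]byte(`{"stop": ["###", "EOF"]}`))) // [### EOF]
}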
6 core/config/exports_test.go Normal file
@@ -0,0 +1,6 @@
package config

// This file re-exports private functions to be used directly in unit tests.
// Since this file's name ends in _test.go, theoretically these should not be exposed past the tests.

var ReadBackendConfigFile = readBackendConfigFile
197 core/http/api.go
@@ -1,23 +1,20 @@
package http
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
"github.com/go-skynet/LocalAI/core"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/gofiber/swagger" // swagger handler
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/elevenlabs"
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/core/services"
|
||||
"github.com/go-skynet/LocalAI/internal"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/gofiber/fiber/v2/middleware/cors"
|
||||
@@ -55,13 +52,12 @@ func readAuthHeader(c *fiber.Ctx) string {
|
||||
// @securityDefinitions.apikey BearerAuth
|
||||
// @in header
|
||||
// @name Authorization
|
||||
|
||||
func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*fiber.App, error) {
|
||||
func App(application *core.Application) (*fiber.App, error) {
|
||||
// Return errors as JSON responses
|
||||
app := fiber.New(fiber.Config{
|
||||
Views: renderEngine(),
|
||||
BodyLimit: appConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
|
||||
DisableStartupMessage: appConfig.DisableMessage,
|
||||
BodyLimit: application.ApplicationConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
|
||||
DisableStartupMessage: application.ApplicationConfig.DisableMessage,
|
||||
// Override default error handler
|
||||
ErrorHandler: func(ctx *fiber.Ctx, err error) error {
|
||||
// Status code defaults to 500
|
||||
@@ -82,7 +78,7 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
},
|
||||
})
|
||||
|
||||
if appConfig.Debug {
|
||||
if application.ApplicationConfig.Debug {
|
||||
app.Use(logger.New(logger.Config{
|
||||
Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
|
||||
}))
|
||||
@@ -90,7 +86,7 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
|
||||
// Default middleware config
|
||||
|
||||
if !appConfig.Debug {
|
||||
if !application.ApplicationConfig.Debug {
|
||||
app.Use(recover.New())
|
||||
}
|
||||
|
||||
@@ -108,25 +104,7 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
|
||||
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
|
||||
auth := func(c *fiber.Ctx) error {
|
||||
if len(appConfig.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
// Check for api_keys.json file
|
||||
fileContent, err := os.ReadFile("api_keys.json")
|
||||
if err == nil {
|
||||
// Parse JSON content from the file
|
||||
var fileKeys []string
|
||||
err := json.Unmarshal(fileContent, &fileKeys)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusInternalServerError).JSON(fiber.Map{"message": "Error parsing api_keys.json"})
|
||||
}
|
||||
|
||||
// Add file keys to options.ApiKeys
|
||||
appConfig.ApiKeys = append(appConfig.ApiKeys, fileKeys...)
|
||||
}
|
||||
|
||||
if len(appConfig.ApiKeys) == 0 {
|
||||
if len(application.ApplicationConfig.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
@@ -142,7 +120,7 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
}
|
||||
|
||||
apiKey := authHeaderParts[1]
|
||||
for _, key := range appConfig.ApiKeys {
|
||||
for _, key := range application.ApplicationConfig.ApiKeys {
|
||||
if apiKey == key {
|
||||
return c.Next()
|
||||
}
|
||||
@@ -151,20 +129,22 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
|
||||
}
|
||||
|
||||
if appConfig.CORS {
|
||||
if application.ApplicationConfig.CORS {
|
||||
var c func(ctx *fiber.Ctx) error
|
||||
if appConfig.CORSAllowOrigins == "" {
|
||||
if application.ApplicationConfig.CORSAllowOrigins == "" {
|
||||
c = cors.New()
|
||||
} else {
|
||||
c = cors.New(cors.Config{AllowOrigins: appConfig.CORSAllowOrigins})
|
||||
c = cors.New(cors.Config{AllowOrigins: application.ApplicationConfig.CORSAllowOrigins})
|
||||
}
|
||||
|
||||
app.Use(c)
|
||||
}
|
||||
|
||||
fiberContextExtractor := fiberContext.NewFiberContextExtractor(application.ModelLoader, application.ApplicationConfig)
|
||||
|
||||
// LocalAI API endpoints
|
||||
galleryService := services.NewGalleryService(appConfig.ModelPath)
|
||||
galleryService.Start(appConfig.Context, cl)
|
||||
galleryService := services.NewGalleryService(application.ApplicationConfig.ModelPath)
|
||||
galleryService.Start(application.ApplicationConfig.Context, application.BackendConfigLoader)
|
||||
|
||||
app.Get("/version", auth, func(c *fiber.Ctx) error {
|
||||
return c.JSON(struct {
|
||||
@@ -172,29 +152,17 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
}{Version: internal.PrintableVersion()})
|
||||
})
|
||||
|
||||
// Make sure directories exists
|
||||
os.MkdirAll(appConfig.ImageDir, 0755)
|
||||
os.MkdirAll(appConfig.AudioDir, 0755)
|
||||
os.MkdirAll(appConfig.UploadDir, 0755)
|
||||
os.MkdirAll(appConfig.ConfigsDir, 0755)
|
||||
os.MkdirAll(appConfig.ModelPath, 0755)
|
||||
|
||||
// Load config jsons
|
||||
utils.LoadConfig(appConfig.UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles)
|
||||
utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants)
|
||||
utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles)
|
||||
|
||||
app.Get("/swagger/*", swagger.HandlerDefault) // default
|
||||
|
||||
welcomeRoute(
|
||||
app,
|
||||
cl,
|
||||
ml,
|
||||
appConfig,
|
||||
application.BackendConfigLoader,
|
||||
application.ModelLoader,
|
||||
application.ApplicationConfig,
|
||||
auth,
|
||||
)
|
||||
|
||||
modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
|
||||
modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(application.ApplicationConfig.Galleries, application.ApplicationConfig.ModelPath, galleryService)
|
||||
app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
|
||||
app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
|
||||
app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
|
||||
@@ -203,83 +171,85 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
|
||||
app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())
|
||||
|
||||
app.Post("/tts", auth, localai.TTSEndpoint(cl, ml, appConfig))
|
||||
|
||||
// Elevenlabs
|
||||
app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
|
||||
|
||||
// Stores
|
||||
sl := model.NewModelLoader("")
|
||||
app.Post("/stores/set", auth, localai.StoresSetEndpoint(sl, appConfig))
|
||||
app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(sl, appConfig))
|
||||
app.Post("/stores/get", auth, localai.StoresGetEndpoint(sl, appConfig))
|
||||
app.Post("/stores/find", auth, localai.StoresFindEndpoint(sl, appConfig))
|
||||
storeLoader := model.NewModelLoader("") // TODO: Investigate if this should be migrated to application and reused. Should the path be configurable? Merging for now.
|
||||
app.Post("/stores/set", auth, localai.StoresSetEndpoint(storeLoader, application.ApplicationConfig))
|
||||
app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(storeLoader, application.ApplicationConfig))
|
||||
app.Post("/stores/get", auth, localai.StoresGetEndpoint(storeLoader, application.ApplicationConfig))
|
||||
app.Post("/stores/find", auth, localai.StoresFindEndpoint(storeLoader, application.ApplicationConfig))
|
||||
|
||||
// openAI compatible API endpoint
|
||||
// openAI compatible API endpoints
|
||||
|
||||
// chat
|
||||
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
|
||||
app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
app.Post("/chat/completions", auth, openai.ChatEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
|
||||
// edit
|
||||
app.Post("/v1/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
|
||||
app.Post("/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/edits", auth, openai.EditEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
app.Post("/edits", auth, openai.EditEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
|
||||
// assistant
|
||||
app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
|
||||
app.Get("/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Post("/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
|
||||
app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
|
||||
// TODO: Refactor this to the new style eventually
|
||||
app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/assistants", auth, openai.ListAssistantsEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/assistants", auth, openai.CreateAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
|
||||
// files
|
||||
app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
|
||||
app.Post("/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
|
||||
app.Get("/v1/files", auth, openai.ListFilesEndpoint(cl, appConfig))
|
||||
app.Get("/files", auth, openai.ListFilesEndpoint(cl, appConfig))
|
||||
app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
|
||||
app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
|
||||
app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
|
||||
app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
|
||||
app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
|
||||
app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
|
||||
app.Post("/v1/files", auth, openai.UploadFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Post("/files", auth, openai.UploadFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/files", auth, openai.ListFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/files", auth, openai.ListFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(application.BackendConfigLoader, application.ApplicationConfig))

// completion
app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/v1/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))
app.Post("/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))

// embeddings
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))

// audio
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, ml, appConfig))
app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(cl, ml, appConfig))
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(fiberContextExtractor, application.TranscriptionBackendService))
app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))

// images
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, ml, appConfig))
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(fiberContextExtractor, application.ImageGenerationBackendService))

if appConfig.ImageDir != "" {
    app.Static("/generated-images", appConfig.ImageDir)
// Elevenlabs
app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))

// LocalAI TTS?
app.Post("/tts", auth, localai.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))

if application.ApplicationConfig.ImageDir != "" {
    app.Static("/generated-images", application.ApplicationConfig.ImageDir)
}

if appConfig.AudioDir != "" {
    app.Static("/generated-audio", appConfig.AudioDir)
if application.ApplicationConfig.AudioDir != "" {
    app.Static("/generated-audio", application.ApplicationConfig.AudioDir)
}

ok := func(c *fiber.Ctx) error {
@@ -291,13 +261,12 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
app.Get("/readyz", ok)

// Experimental Backend Statistics Module
backendMonitor := services.NewBackendMonitor(cl, ml, appConfig) // Split out for now
app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(backendMonitor))
app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(backendMonitor))
app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(application.BackendMonitorService))
app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(application.BackendMonitorService))

// models
app.Get("/v1/models", auth, openai.ListModelsEndpoint(cl, ml))
app.Get("/models", auth, openai.ListModelsEndpoint(cl, ml))
app.Get("/v1/models", auth, openai.ListModelsEndpoint(application.ListModelsService))
app.Get("/models", auth, openai.ListModelsEndpoint(application.ListModelsService))

app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())
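The pattern across these hunks: endpoint constructors stop taking the (cl, ml, appConfig) triple and instead receive the specific service hanging off one application object. A minimal sketch of that wiring style, with hypothetical stand-in types (only the fiber API here is real):

package main

import (
	"github.com/gofiber/fiber/v2"
)

// Application is a hypothetical stand-in for core.Application: one container
// that owns the long-lived services, instead of threading the config loader,
// model loader, and app config into every endpoint constructor separately.
type Application struct {
	TTS *TTSService
}

type TTSService struct{}

func (s *TTSService) Speak(text string) string { return "audio-bytes-for:" + text }

// TTSEndpoint mirrors the constructor style in the diff: it receives only the
// service it needs and returns a fiber handler that closes over it.
func TTSEndpoint(tts *TTSService) fiber.Handler {
	return func(c *fiber.Ctx) error {
		return c.SendString(tts.Speak(c.Query("text")))
	}
}

func main() {
	app := fiber.New()
	application := &Application{TTS: &TTSService{}}
	app.Post("/tts", TTSEndpoint(application.TTS))
	_ = app.Listen("127.0.0.1:8080")
}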
@@ -12,7 +12,9 @@ import (
	"os"
	"path/filepath"
	"runtime"
	"strings"

	"github.com/go-skynet/LocalAI/core"
	"github.com/go-skynet/LocalAI/core/config"
	. "github.com/go-skynet/LocalAI/core/http"
	"github.com/go-skynet/LocalAI/core/schema"
@@ -43,6 +45,7 @@ Can you help rephrasing sentences?
type modelApplyRequest struct {
	ID        string                 `json:"id"`
	URL       string                 `json:"url"`
	ConfigURL string                 `json:"config_url"`
	Name      string                 `json:"name"`
	Overrides map[string]interface{} `json:"overrides"`
}
@@ -204,9 +207,7 @@ var _ = Describe("API test", func() {
	var cancel context.CancelFunc
	var tmpdir string
	var modelDir string
	var bcl *config.BackendConfigLoader
	var ml *model.ModelLoader
	var applicationConfig *config.ApplicationConfig
	var application *core.Application

	commonOpts := []config.AppOption{
		config.WithDebug(true),
@@ -251,7 +252,7 @@ var _ = Describe("API test", func() {
		},
	}

	bcl, ml, applicationConfig, err = startup.Startup(
	application, err = startup.Startup(
		append(commonOpts,
			config.WithContext(c),
			config.WithGalleries(galleries),
@@ -260,7 +261,7 @@ var _ = Describe("API test", func() {
			config.WithBackendAssetsOutput(backendAssetsDir))...)
	Expect(err).ToNot(HaveOccurred())

	app, err = App(bcl, ml, applicationConfig)
	app, err = App(application)
	Expect(err).ToNot(HaveOccurred())

	go app.Listen("127.0.0.1:9090")
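The test setup change is the same everywhere in this file: startup.Startup now returns a single application object and App derives the HTTP layer from it. A compilable stand-in sketch of that contract (the real functions live in LocalAI's startup and http packages and return a *fiber.App):

package main

import (
	"context"
	"fmt"
)

// Stand-ins for startup.Startup and App as they appear after the diff.
type Application struct{ Ctx context.Context }

func Startup(ctx context.Context) (*Application, error) {
	return &Application{Ctx: ctx}, nil // was: returned (bcl, ml, applicationConfig, err)
}

func App(application *Application) (string, error) {
	return "http server wired from application", nil // in LocalAI this is a *fiber.App
}

func main() {
	c, cancel := context.WithCancel(context.Background())
	defer cancel()

	application, err := Startup(c)
	if err != nil {
		panic(err)
	}
	app, err := App(application) // was: App(bcl, ml, applicationConfig)
	if err != nil {
		panic(err)
	}
	fmt.Println(app)
}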
@@ -366,6 +367,29 @@ var _ = Describe("API test", func() {
	Expect(err).ToNot(HaveOccurred())
	Expect(content["backend"]).To(Equal("llama"))
})
It("apply models from config", func() {
	response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
		ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
	})

	Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))

	uuid := response["uuid"].(string)

	Eventually(func() bool {
		response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
		return response["processed"].(bool)
	}, "360s", "10s").Should(Equal(true))

	Eventually(func() []string {
		models, _ := client.ListModels(context.TODO())
		modelList := []string{}
		for _, m := range models.Models {
			modelList = append(modelList, m.ID)
		}
		return modelList
	}, "360s", "10s").Should(ContainElements("hermes-2-pro-mistral"))
})
It("apply models without overrides", func() {
	response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
		URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
@@ -450,11 +474,11 @@ var _ = Describe("API test", func() {
	})
	Expect(err).ToNot(HaveOccurred())
	Expect(len(resp2.Choices)).To(Equal(1))
	Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
	Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
	Expect(resp2.Choices[0].Message.ToolCalls[0].Function).ToNot(BeNil())
	Expect(resp2.Choices[0].Message.ToolCalls[0].Function.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.ToolCalls[0].Function.Name)

	var res map[string]string
	err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
	err = json.Unmarshal([]byte(resp2.Choices[0].Message.ToolCalls[0].Function.Arguments), &res)
	Expect(err).ToNot(HaveOccurred())
	Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
	Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
@@ -463,9 +487,9 @@ var _ = Describe("API test", func() {
})

It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
	if runtime.GOOS != "linux" {
		Skip("test supported only on linux")
	}
	// if runtime.GOOS != "linux" {
	// Skip("test supported only on linux")
	// }
	modelName := "codellama"
	response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
		URL: "github:go-skynet/model-gallery/codellama-7b-instruct.yaml",
@@ -480,7 +504,7 @@ var _ = Describe("API test", func() {
	Eventually(func() bool {
		response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
		return response["processed"].(bool)
	}, "360s", "10s").Should(Equal(true))
	}, "480s", "10s").Should(Equal(true))

	By("testing chat")
	resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: modelName, Messages: []openai.ChatCompletionMessage{
@@ -527,11 +551,13 @@ var _ = Describe("API test", func() {
	})
	Expect(err).ToNot(HaveOccurred())
	Expect(len(resp2.Choices)).To(Equal(1))
	Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
	Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
	fmt.Printf("\n--- %+v\n\n", resp2.Choices[0].Message)
	Expect(resp2.Choices[0].Message.ToolCalls).ToNot(BeNil())
	Expect(resp2.Choices[0].Message.ToolCalls[0]).ToNot(BeNil())
	Expect(resp2.Choices[0].Message.ToolCalls[0].Function.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.ToolCalls[0].Function.Name)

	var res map[string]string
	err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
	err = json.Unmarshal([]byte(resp2.Choices[0].Message.ToolCalls[0].Function.Arguments), &res)
	Expect(err).ToNot(HaveOccurred())
	Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
	Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
@@ -585,7 +611,7 @@ var _ = Describe("API test", func() {
	},
}

	bcl, ml, applicationConfig, err = startup.Startup(
	application, err = startup.Startup(
		append(commonOpts,
			config.WithContext(c),
			config.WithAudioDir(tmpdir),
@@ -596,7 +622,7 @@ var _ = Describe("API test", func() {
			config.WithBackendAssetsOutput(tmpdir))...,
	)
	Expect(err).ToNot(HaveOccurred())
	app, err = App(bcl, ml, applicationConfig)
	app, err = App(application)
	Expect(err).ToNot(HaveOccurred())

	go app.Listen("127.0.0.1:9090")
@@ -700,14 +726,14 @@ var _ = Describe("API test", func() {

	var err error

	bcl, ml, applicationConfig, err = startup.Startup(
	application, err = startup.Startup(
		append(commonOpts,
			config.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
			config.WithContext(c),
			config.WithModelPath(modelPath),
		)...)
	Expect(err).ToNot(HaveOccurred())
	app, err = App(bcl, ml, applicationConfig)
	app, err = App(application)
	Expect(err).ToNot(HaveOccurred())
	go app.Listen("127.0.0.1:9090")

@@ -737,6 +763,11 @@ var _ = Describe("API test", func() {
	Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
})
It("can generate completions via ggml", func() {
	bt, ok := os.LookupEnv("BUILD_TYPE")
	if ok && strings.ToLower(bt) == "metal" {
		Skip("GGML + Metal is known flaky, skip test temporarily")
	}

	resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel.ggml", Prompt: testPrompt})
	Expect(err).ToNot(HaveOccurred())
	Expect(len(resp.Choices)).To(Equal(1))
@@ -744,6 +775,11 @@ var _ = Describe("API test", func() {
})

It("can generate chat completions via ggml", func() {
	bt, ok := os.LookupEnv("BUILD_TYPE")
	if ok && strings.ToLower(bt) == "metal" {
		Skip("GGML + Metal is known flaky, skip test temporarily")
	}

	resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel.ggml", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
	Expect(err).ToNot(HaveOccurred())
	Expect(len(resp.Choices)).To(Equal(1))
@@ -751,6 +787,11 @@ var _ = Describe("API test", func() {
})

It("can generate completions from model configs", func() {
	bt, ok := os.LookupEnv("BUILD_TYPE")
	if ok && strings.ToLower(bt) == "metal" {
		Skip("GGML + Metal is known flaky, skip test temporarily")
	}

	resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "gpt4all", Prompt: testPrompt})
	Expect(err).ToNot(HaveOccurred())
	Expect(len(resp.Choices)).To(Equal(1))
@@ -758,6 +799,11 @@ var _ = Describe("API test", func() {
})

It("can generate chat completions from model configs", func() {
	bt, ok := os.LookupEnv("BUILD_TYPE")
	if ok && strings.ToLower(bt) == "metal" {
		Skip("GGML + Metal is known flaky, skip test temporarily")
	}

	resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
	Expect(err).ToNot(HaveOccurred())
	Expect(len(resp.Choices)).To(Equal(1))
@@ -844,9 +890,9 @@ var _ = Describe("API test", func() {

Context("backends", func() {
	It("runs rwkv completion", func() {
		if runtime.GOOS != "linux" {
			Skip("test supported only on linux")
		}
		// if runtime.GOOS != "linux" {
		// Skip("test supported only on linux")
		// }
		resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"})
		Expect(err).ToNot(HaveOccurred())
		Expect(len(resp.Choices) > 0).To(BeTrue())
@@ -867,17 +913,20 @@ var _ = Describe("API test", func() {
		}

		Expect(err).ToNot(HaveOccurred())
		text += response.Choices[0].Text
		tokens++

		if len(response.Choices) > 0 {
			text += response.Choices[0].Text
			tokens++
		}
	}
	Expect(text).ToNot(BeEmpty())
	Expect(text).To(ContainSubstring("five"))
	Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
})
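The fix in the rwkv streaming test above is worth isolating: a streamed chunk can arrive with an empty Choices slice, so indexing Choices[0] unconditionally can panic. A runnable sketch of the guard the diff adds (stand-in types, same logic):

package main

import "fmt"

type Choice struct{ Text string }
type Chunk struct{ Choices []Choice }

func main() {
	stream := []Chunk{
		{Choices: []Choice{{Text: "five"}}},
		{}, // empty chunk: would panic without the guard
	}

	text, tokens := "", 0
	for _, response := range stream {
		if len(response.Choices) > 0 { // the guard added by the diff
			text += response.Choices[0].Text
			tokens++
		}
	}
	fmt.Println(text, tokens) // five 1
}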
It("runs rwkv chat completion", func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
// if runtime.GOOS != "linux" {
|
||||
// Skip("test supported only on linux")
|
||||
// }
|
||||
resp, err := client.CreateChatCompletion(context.TODO(),
|
||||
openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
@@ -986,14 +1035,14 @@ var _ = Describe("API test", func() {
|
||||
c, cancel = context.WithCancel(context.Background())
|
||||
|
||||
var err error
|
||||
bcl, ml, applicationConfig, err = startup.Startup(
|
||||
application, err = startup.Startup(
|
||||
append(commonOpts,
|
||||
config.WithContext(c),
|
||||
config.WithModelPath(modelPath),
|
||||
config.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
app, err = App(bcl, ml, applicationConfig)
|
||||
app, err = App(application)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
@@ -1017,18 +1066,33 @@ var _ = Describe("API test", func() {
|
||||
}
|
||||
})
|
||||
It("can generate chat completions from config file (list1)", func() {
|
||||
bt, ok := os.LookupEnv("BUILD_TYPE")
|
||||
if ok && strings.ToLower(bt) == "metal" {
|
||||
Skip("GGML + Metal is known flaky, skip test temporarily")
|
||||
}
|
||||
|
||||
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices)).To(Equal(1))
|
||||
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
|
||||
})
|
||||
It("can generate chat completions from config file (list2)", func() {
|
||||
bt, ok := os.LookupEnv("BUILD_TYPE")
|
||||
if ok && strings.ToLower(bt) == "metal" {
|
||||
Skip("GGML + Metal is known flaky, skip test temporarily")
|
||||
}
|
||||
|
||||
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices)).To(Equal(1))
|
||||
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
|
||||
})
|
||||
It("can generate edit completions from config file", func() {
|
||||
bt, ok := os.LookupEnv("BUILD_TYPE")
|
||||
if ok && strings.ToLower(bt) == "metal" {
|
||||
Skip("GGML + Metal is known flaky, skip test temporarily")
|
||||
}
|
||||
|
||||
request := openaigo.EditCreateRequestBody{
|
||||
Model: "list2",
|
||||
Instruction: "foo",
|
||||
|
||||
@@ -1,43 +1,88 @@
package fiberContext

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"

	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/schema"
	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/gofiber/fiber/v2"
	"github.com/rs/zerolog/log"
)

type FiberContextExtractor struct {
	ml        *model.ModelLoader
	appConfig *config.ApplicationConfig
}

func NewFiberContextExtractor(ml *model.ModelLoader, appConfig *config.ApplicationConfig) *FiberContextExtractor {
	return &FiberContextExtractor{
		ml:        ml,
		appConfig: appConfig,
	}
}

// ModelFromContext returns the model from the context
// If no model is specified, it will take the first available
// Takes a model string as input which should be the one received from the user request.
// It returns the model name resolved from the context and an error if any.
func ModelFromContext(ctx *fiber.Ctx, loader *model.ModelLoader, modelInput string, firstModel bool) (string, error) {
	if ctx.Params("model") != "" {
		modelInput = ctx.Params("model")
func (fce *FiberContextExtractor) ModelFromContext(ctx *fiber.Ctx, modelInput string, firstModel bool) (string, error) {
	ctxPM := ctx.Params("model")
	if ctxPM != "" {
		log.Debug().Msgf("[FCE] Overriding param modelInput %q with ctx.Params value %q", modelInput, ctxPM)
		modelInput = ctxPM
	}

	// Set model from bearer token, if available
	bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
	bearer := strings.TrimPrefix(ctx.Get("authorization"), "Bearer ")
	bearerExists := bearer != "" && fce.ml.ExistsInModelPath(bearer)

	// If no model was specified, take the first available
	if modelInput == "" && !bearerExists && firstModel {
		models, _ := loader.ListModels()
		models, _ := fce.ml.ListModels()
		if len(models) > 0 {
			modelInput = models[0]
			log.Debug().Msgf("No model specified, using: %s", modelInput)
			log.Debug().Msgf("[FCE] No model specified, using first available: %s", modelInput)
		} else {
			log.Debug().Msgf("No model specified, returning error")
			return "", fmt.Errorf("no model specified")
			log.Warn().Msgf("[FCE] No model specified, none available")
			return "", fmt.Errorf("[fce] no model specified, none available")
		}
	}

	// If a model is found in bearer token takes precedence
	if bearerExists {
		log.Debug().Msgf("Using model from bearer token: %s", bearer)
		log.Debug().Msgf("[FCE] Using model from bearer token: %s", bearer)
		modelInput = bearer
	}

	if modelInput == "" {
		log.Warn().Msg("[FCE] modelInput is empty")
	}
	return modelInput, nil
}
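The TrimLeft-to-TrimPrefix swap above is a real bug fix, not a cosmetic one: strings.TrimLeft treats "Bearer " as a set of characters and keeps stripping any of them, so a model name beginning with one of those letters gets mangled, while strings.TrimPrefix removes the literal prefix exactly once. A runnable demonstration using a model name from the tests:

package main

import (
	"fmt"
	"strings"
)

func main() {
	header := "Bearer rwkv_test"

	// TrimLeft strips the cutset {'B','e','a','r',' '} greedily: the leading
	// 'r' of "rwkv_test" is in the set, so it is eaten too.
	fmt.Println(strings.TrimLeft(header, "Bearer "))   // "wkv_test"

	// TrimPrefix removes the exact prefix once and leaves the rest intact.
	fmt.Println(strings.TrimPrefix(header, "Bearer ")) // "rwkv_test"
}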
// TODO: Do we still need the first return value?
func (fce *FiberContextExtractor) OpenAIRequestFromContext(c *fiber.Ctx, firstModel bool) (string, *schema.OpenAIRequest, error) {
	input := new(schema.OpenAIRequest)

	// Get input data from the request body
	if err := c.BodyParser(input); err != nil {
		return "", nil, fmt.Errorf("failed parsing request body: %w", err)
	}

	received, _ := json.Marshal(input)

	ctx, cancel := context.WithCancel(fce.appConfig.Context)
	input.Context = ctx
	input.Cancel = cancel

	log.Debug().Msgf("Request received: %s", string(received))

	var err error
	input.Model, err = fce.ModelFromContext(c, input.Model, firstModel)

	return input.Model, input, err
}
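A sketch of how an endpoint consumes the extractor, condensed from the signatures above into a program that compiles on its own; OpenAIRequest and the extractor body here are trimmed stand-ins (the real one also attaches the cancellable context):

package main

import (
	"fmt"

	"github.com/gofiber/fiber/v2"
)

type OpenAIRequest struct {
	Model string `json:"model"`
}

type FiberContextExtractor struct{}

func (fce *FiberContextExtractor) OpenAIRequestFromContext(c *fiber.Ctx, firstModel bool) (string, *OpenAIRequest, error) {
	input := new(OpenAIRequest)
	if err := c.BodyParser(input); err != nil {
		return "", nil, fmt.Errorf("failed parsing request body: %w", err)
	}
	if pm := c.Params("model"); pm != "" {
		input.Model = pm // the route param overrides the body value, as in the diff
	}
	return input.Model, input, nil
}

// Endpoint constructors now take the extractor plus a service, as in the diff.
func CompletionEndpoint(fce *FiberContextExtractor) fiber.Handler {
	return func(c *fiber.Ctx) error {
		_, request, err := fce.OpenAIRequestFromContext(c, false)
		if err != nil {
			return err
		}
		return c.JSON(fiber.Map{"model": request.Model})
	}
}

func main() {
	app := fiber.New()
	fce := &FiberContextExtractor{}
	app.Post("/v1/engines/:model/completions", CompletionEndpoint(fce))
	_ = app.Listen("127.0.0.1:8080")
}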
@@ -2,9 +2,7 @@ package elevenlabs

import (
	"github.com/go-skynet/LocalAI/core/backend"
	"github.com/go-skynet/LocalAI/core/config"
	fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
	"github.com/go-skynet/LocalAI/pkg/model"

	"github.com/go-skynet/LocalAI/core/schema"
	"github.com/gofiber/fiber/v2"
@@ -17,7 +15,7 @@ import (
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/text-to-speech/{voice-id} [post]
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
func TTSEndpoint(fce *fiberContext.FiberContextExtractor, ttsbs *backend.TextToSpeechBackendService) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {

		input := new(schema.ElevenLabsTTSRequest)
@@ -28,34 +26,21 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
			return err
		}

		modelFile, err := fiberContext.ModelFromContext(c, ml, input.ModelID, false)
		var err error
		input.ModelID, err = fce.ModelFromContext(c, input.ModelID, false)
		if err != nil {
			modelFile = input.ModelID
			log.Warn().Msgf("Model not found in context: %s", input.ModelID)
		}

		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
			config.LoadOptionDebug(appConfig.Debug),
			config.LoadOptionThreads(appConfig.Threads),
			config.LoadOptionContextSize(appConfig.ContextSize),
			config.LoadOptionF16(appConfig.F16),
		)
		if err != nil {
			modelFile = input.ModelID
			log.Warn().Msgf("Model not found in context: %s", input.ModelID)
		} else {
			if input.ModelID != "" {
				modelFile = input.ModelID
			} else {
				modelFile = cfg.Model
			}
		responseChannel := ttsbs.TextToAudioFile(&schema.TTSRequest{
			Model: input.ModelID,
			Voice: voiceID,
			Input: input.Text,
		})
		rawValue := <-responseChannel
		if rawValue.Error != nil {
			return rawValue.Error
		}
		log.Debug().Msgf("Request for model: %s", modelFile)

		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
		if err != nil {
			return err
		}
		return c.Download(filePath)
		return c.Download(*rawValue.Value)
	}
}
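The new backend services return a channel carrying a single result-or-error value; the handler blocks on it and unwraps, as the hunk above shows. A generic, runnable stand-in for that pattern (the concrete LocalAI type names may differ):

package main

import "fmt"

// ErrorOr is an assumed name for the value-or-error envelope the channels carry.
type ErrorOr[T any] struct {
	Value T
	Error error
}

type TextToSpeechBackendService struct{}

// TextToAudioFile starts generation and immediately returns the result channel.
func (s *TextToSpeechBackendService) TextToAudioFile(text string) <-chan ErrorOr[*string] {
	out := make(chan ErrorOr[*string], 1)
	go func() {
		path := "/tmp/generated/audio/out.wav" // pretend we synthesized audio for `text`
		_ = text
		out <- ErrorOr[*string]{Value: &path}
		close(out)
	}()
	return out
}

func main() {
	ttsbs := &TextToSpeechBackendService{}
	responseChannel := ttsbs.TextToAudioFile("hello")

	rawValue := <-responseChannel // block until the backend finishes
	if rawValue.Error != nil {
		panic(rawValue.Error)
	}
	fmt.Println(*rawValue.Value) // the handler then returns c.Download(*rawValue.Value)
}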
@@ -6,7 +6,7 @@ import (
	"github.com/gofiber/fiber/v2"
)

func BackendMonitorEndpoint(bm services.BackendMonitor) func(c *fiber.Ctx) error {
func BackendMonitorEndpoint(bm *services.BackendMonitorService) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {

		input := new(schema.BackendMonitorRequest)
@@ -23,7 +23,7 @@ func BackendMonitorEndpoint(bm services.BackendMonitor) func(c *fiber.Ctx) error
	}
}

func BackendShutdownEndpoint(bm services.BackendMonitor) func(c *fiber.Ctx) error {
func BackendShutdownEndpoint(bm *services.BackendMonitorService) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {
		input := new(schema.BackendMonitorRequest)
		// Get input data from the request body

@@ -19,7 +19,8 @@ type ModelGalleryEndpointService struct {
}

type GalleryModel struct {
	ID string `json:"id"`
	ID        string `json:"id"`
	ConfigURL string `json:"config_url"`
	gallery.GalleryModel
}

@@ -64,6 +65,7 @@ func (mgs *ModelGalleryEndpointService) ApplyModelGalleryEndpoint() func(c *fibe
	Id:          uuid.String(),
	GalleryName: input.ID,
	Galleries:   mgs.galleries,
	ConfigURL:   input.ConfigURL,
}
return c.JSON(struct {
	ID string `json:"uuid"`
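With ConfigURL added to GalleryModel and threaded through ApplyModelGalleryEndpoint, a model can be applied straight from a YAML config, exactly as the new test does. A minimal client sketch against the endpoints the tests use (error handling trimmed for brevity):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body, _ := json.Marshal(map[string]string{
		"config_url": "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
	})
	resp, err := http.Post("http://127.0.0.1:9090/models/apply", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out struct {
		UUID string `json:"uuid"`
	}
	_ = json.NewDecoder(resp.Body).Decode(&out)
	// Poll http://127.0.0.1:9090/models/jobs/<uuid> until "processed" is true,
	// as the Eventually blocks in the tests do.
	fmt.Println("job:", out.UUID)
}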
@@ -2,9 +2,7 @@ package localai

import (
	"github.com/go-skynet/LocalAI/core/backend"
	"github.com/go-skynet/LocalAI/core/config"
	fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
	"github.com/go-skynet/LocalAI/pkg/model"

	"github.com/go-skynet/LocalAI/core/schema"
	"github.com/gofiber/fiber/v2"
@@ -16,45 +14,26 @@ import (
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/audio/speech [post]
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
func TTSEndpoint(fce *fiberContext.FiberContextExtractor, ttsbs *backend.TextToSpeechBackendService) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {

		var err error
		input := new(schema.TTSRequest)

		// Get input data from the request body
		if err := c.BodyParser(input); err != nil {
		if err = c.BodyParser(input); err != nil {
			return err
		}

		modelFile, err := fiberContext.ModelFromContext(c, ml, input.Model, false)
		input.Model, err = fce.ModelFromContext(c, input.Model, false)
		if err != nil {
			modelFile = input.Model
			log.Warn().Msgf("Model not found in context: %s", input.Model)
		}

		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
			config.LoadOptionDebug(appConfig.Debug),
			config.LoadOptionThreads(appConfig.Threads),
			config.LoadOptionContextSize(appConfig.ContextSize),
			config.LoadOptionF16(appConfig.F16),
		)

		if err != nil {
			modelFile = input.Model
			log.Warn().Msgf("Model not found in context: %s", input.Model)
		} else {
			modelFile = cfg.Model
		responseChannel := ttsbs.TextToAudioFile(input)
		rawValue := <-responseChannel
		if rawValue.Error != nil {
			return rawValue.Error
		}
		log.Debug().Msgf("Request for model: %s", modelFile)

		if input.Backend != "" {
			cfg.Backend = input.Backend
		}

		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
		if err != nil {
			return err
		}
		return c.Download(filePath)
		return c.Download(*rawValue.Value)
	}
}

@@ -339,7 +339,7 @@ func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
		}
	}

	return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find "))
	return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistantID %q", assistantID))
	}
}
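The refactored TTS endpoints still hand the generated file back as a download. A client sketch under stated assumptions: the request field names follow the schema.TTSRequest usage visible above (Model, Input, Voice), but the exact JSON tags and the example model name are assumptions:

package main

import (
	"bytes"
	"encoding/json"
	"io"
	"net/http"
	"os"
)

func main() {
	body, _ := json.Marshal(map[string]string{
		"model": "voice-en-us", // hypothetical model name for illustration
		"input": "Hello from LocalAI",
		"voice": "default",
	})
	resp, err := http.Post("http://127.0.0.1:8080/tts", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	f, _ := os.Create("out.wav")
	defer f.Close()
	_, _ = io.Copy(f, resp.Body) // the handler ends with c.Download(*rawValue.Value)
}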
@@ -5,17 +5,11 @@ import (
	"bytes"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	"github.com/go-skynet/LocalAI/core/backend"
	"github.com/go-skynet/LocalAI/core/config"
	fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
	"github.com/go-skynet/LocalAI/core/schema"
	"github.com/go-skynet/LocalAI/pkg/grammar"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/utils"
	"github.com/go-skynet/LocalAI/core/services"
	"github.com/gofiber/fiber/v2"
	"github.com/google/uuid"
	"github.com/rs/zerolog/log"
	"github.com/valyala/fasthttp"
)
@@ -25,408 +19,82 @@ import (
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/chat/completions [post]
func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
	emptyMessage := ""
	id := uuid.New().String()
	created := int(time.Now().Unix())

	process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
		initialMessage := schema.OpenAIResponse{
			ID:      id,
			Created: created,
			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
			Object:  "chat.completion.chunk",
		}
		responses <- initialMessage

		ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
			resp := schema.OpenAIResponse{
				ID:      id,
				Created: created,
				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
				Object:  "chat.completion.chunk",
				Usage: schema.OpenAIUsage{
					PromptTokens:     usage.Prompt,
					CompletionTokens: usage.Completion,
					TotalTokens:      usage.Prompt + usage.Completion,
				},
			}

			responses <- resp
			return true
		})
		close(responses)
	}
	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
		result := ""
		_, tokenUsage, _ := ComputeChoices(req, prompt, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
			result += s
			// TODO: Change generated BNF grammar to be compliant with the schema so we can
			// stream the result token by token here.
			return true
		})

		results := parseFunctionCall(result, config.FunctionsConfig.ParallelCalls)
		noActionToRun := len(results) > 0 && results[0].name == noAction

		switch {
		case noActionToRun:
			initialMessage := schema.OpenAIResponse{
				ID:      id,
				Created: created,
				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
				Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
				Object:  "chat.completion.chunk",
			}
			responses <- initialMessage

			result, err := handleQuestion(config, req, ml, startupOptions, results[0].arguments, prompt)
			if err != nil {
				log.Error().Err(err).Msg("error handling question")
				return
			}

			resp := schema.OpenAIResponse{
				ID:      id,
				Created: created,
				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
				Choices: []schema.Choice{{Delta: &schema.Message{Content: &result}, Index: 0}},
				Object:  "chat.completion.chunk",
				Usage: schema.OpenAIUsage{
					PromptTokens:     tokenUsage.Prompt,
					CompletionTokens: tokenUsage.Completion,
					TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
				},
			}

			responses <- resp

		default:
			for i, ss := range results {
				name, args := ss.name, ss.arguments

				initialMessage := schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
					Choices: []schema.Choice{{
						Delta: &schema.Message{
							Role: "assistant",
							ToolCalls: []schema.ToolCall{
								{
									Index: i,
									ID:    id,
									Type:  "function",
									FunctionCall: schema.FunctionCall{
										Name: name,
									},
								},
							},
						}}},
					Object: "chat.completion.chunk",
				}
				responses <- initialMessage

				responses <- schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
					Choices: []schema.Choice{{
						Delta: &schema.Message{
							Role: "assistant",
							ToolCalls: []schema.ToolCall{
								{
									Index: i,
									ID:    id,
									Type:  "function",
									FunctionCall: schema.FunctionCall{
										Arguments: args,
									},
								},
							},
						}}},
					Object: "chat.completion.chunk",
				}
			}
		}

		close(responses)
	}
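The `process` helper being removed above encodes the OpenAI streaming contract: first a role-only delta, then one content delta per token, all with Object "chat.completion.chunk". A runnable sketch of those two chunk shapes with trimmed stand-in types (field names follow the schema usage above; the JSON tags are assumptions):

package main

import (
	"encoding/json"
	"fmt"
)

type Message struct {
	Role    string  `json:"role,omitempty"`
	Content *string `json:"content,omitempty"`
}
type Choice struct {
	Delta *Message `json:"delta"`
	Index int      `json:"index"`
}
type Chunk struct {
	Object  string   `json:"object"`
	Choices []Choice `json:"choices"`
}

func main() {
	empty, tok := "", "Hello"
	initial := Chunk{Object: "chat.completion.chunk",
		Choices: []Choice{{Delta: &Message{Role: "assistant", Content: &empty}}}}
	next := Chunk{Object: "chat.completion.chunk",
		Choices: []Choice{{Delta: &Message{Content: &tok}, Index: 0}}}

	for _, c := range []Chunk{initial, next} {
		b, _ := json.Marshal(c)
		fmt.Println(string(b)) // role-only chunk first, then per-token content chunks
	}
}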
func ChatEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.OpenAIService) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {
		processFunctions := false
		funcs := grammar.Functions{}
		modelFile, input, err := readRequest(c, ml, startupOptions, true)
		_, request, err := fce.OpenAIRequestFromContext(c, false)
		if err != nil {
			return fmt.Errorf("failed reading parameters from request:%w", err)
			return fmt.Errorf("failed reading parameters from request: %w", err)
		}

		config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, startupOptions.Debug, startupOptions.Threads, startupOptions.ContextSize, startupOptions.F16)
		traceID, finalResultChannel, _, tokenChannel, err := oais.Chat(request, false, request.Stream)
		if err != nil {
			return fmt.Errorf("failed reading parameters from request:%w", err)
		}
		log.Debug().Msgf("Configuration read: %+v", config)

		// Allow the user to set custom actions via config file
		// to be "embedded" in each model
		noActionName := "answer"
		noActionDescription := "use this action to answer without performing any action"

		if config.FunctionsConfig.NoActionFunctionName != "" {
			noActionName = config.FunctionsConfig.NoActionFunctionName
		}
		if config.FunctionsConfig.NoActionDescriptionName != "" {
			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
			return err
		}

		if input.ResponseFormat.Type == "json_object" {
			input.Grammar = grammar.JSONBNF
		}
		if request.Stream {

			config.Grammar = input.Grammar
			log.Debug().Msgf("Chat Stream request received")

			// process functions if we have any defined or if we have a function call string
			if len(input.Functions) > 0 && config.ShouldUseFunctions() {
				log.Debug().Msgf("Response needs to process functions")

				processFunctions = true

				noActionGrammar := grammar.Function{
					Name:        noActionName,
					Description: noActionDescription,
					Parameters: map[string]interface{}{
						"properties": map[string]interface{}{
							"message": map[string]interface{}{
								"type":        "string",
								"description": "The message to reply the user with",
							}},
					},
				}

				// Append the no action function
				funcs = append(funcs, input.Functions...)
				if !config.FunctionsConfig.DisableNoAction {
					funcs = append(funcs, noActionGrammar)
				}

				// Force picking one of the functions by the request
				if config.FunctionToCall() != "" {
					funcs = funcs.Select(config.FunctionToCall())
				}

				// Update input grammar
				jsStruct := funcs.ToJSONStructure()
				config.Grammar = jsStruct.Grammar("", config.FunctionsConfig.ParallelCalls)
			} else if input.JSONFunctionGrammarObject != nil {
				config.Grammar = input.JSONFunctionGrammarObject.Grammar("", config.FunctionsConfig.ParallelCalls)
			}

			// functions are not supported in stream mode (yet?)
			toStream := input.Stream
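The new entry point above delegates everything to services.OpenAIService.Chat, which returns a trace ID plus channels: per-token chunks for streaming and a final aggregated response. A runnable stand-in for that contract (types and the extra bool parameters are assumptions based on the call site visible in the diff):

package main

import "fmt"

type ErrorOr[T any] struct {
	Value T
	Error error
}

// Chat mimics the shape of oais.Chat: a trace ID, a final-result channel,
// and a token channel that only fires when streaming is requested.
func Chat(prompt string, stream bool) (string, <-chan ErrorOr[string], <-chan ErrorOr[string]) {
	final := make(chan ErrorOr[string], 1)
	tokens := make(chan ErrorOr[string])
	go func() {
		full := ""
		for _, t := range []string{"Hi", " there"} {
			if stream {
				tokens <- ErrorOr[string]{Value: t}
			}
			full += t
		}
		close(tokens)
		final <- ErrorOr[string]{Value: full}
	}()
	return "trace-123", final, tokens
}

func main() {
	_, finalResultChannel, tokenChannel := Chat("hello", true)
	for ev := range tokenChannel { // streaming path: drain token chunks
		fmt.Print(ev.Value)
	}
	fmt.Println()
	fmt.Println((<-finalResultChannel).Value) // non-streaming path: one final value
}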
log.Debug().Msgf("Parameters: %+v", config)
|
||||
|
||||
var predInput string
|
||||
|
||||
suppressConfigSystemPrompt := false
|
||||
mess := []string{}
|
||||
for messageIndex, i := range input.Messages {
|
||||
var content string
|
||||
role := i.Role
|
||||
|
||||
// if function call, we might want to customize the role so we can display better that the "assistant called a json action"
|
||||
// if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request
|
||||
if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" {
|
||||
roleFn := "assistant_function_call"
|
||||
r := config.Roles[roleFn]
|
||||
if r != "" {
|
||||
role = roleFn
|
||||
}
|
||||
}
|
||||
r := config.Roles[role]
|
||||
contentExists := i.Content != nil && i.StringContent != ""
|
||||
|
||||
fcall := i.FunctionCall
|
||||
if len(i.ToolCalls) > 0 {
|
||||
fcall = i.ToolCalls
|
||||
}
|
||||
|
||||
// First attempt to populate content via a chat message specific template
|
||||
if config.TemplateConfig.ChatMessage != "" {
|
||||
chatMessageData := model.ChatMessageTemplateData{
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
Role: r,
|
||||
RoleName: role,
|
||||
Content: i.StringContent,
|
||||
FunctionCall: fcall,
|
||||
FunctionName: i.Name,
|
||||
LastMessage: messageIndex == (len(input.Messages) - 1),
|
||||
Function: config.Grammar != "" && (messageIndex == (len(input.Messages) - 1)),
|
||||
MessageIndex: messageIndex,
|
||||
}
|
||||
templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Interface("message", chatMessageData).Str("template", config.TemplateConfig.ChatMessage).Msg("error processing message with template, skipping")
|
||||
} else {
|
||||
if templatedChatMessage == "" {
|
||||
log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
|
||||
continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
|
||||
}
|
||||
log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
|
||||
content = templatedChatMessage
|
||||
}
|
||||
}
|
||||
|
||||
marshalAnyRole := func(f any) {
|
||||
j, err := json.Marshal(f)
|
||||
if err == nil {
|
||||
if contentExists {
|
||||
content += "\n" + fmt.Sprint(r, " ", string(j))
|
||||
} else {
|
||||
content = fmt.Sprint(r, " ", string(j))
|
||||
}
|
||||
}
|
||||
}
|
||||
marshalAny := func(f any) {
|
||||
j, err := json.Marshal(f)
|
||||
if err == nil {
|
||||
if contentExists {
|
||||
content += "\n" + string(j)
|
||||
} else {
|
||||
content = string(j)
|
||||
}
|
||||
}
|
||||
}
|
||||
// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
|
||||
if content == "" {
|
||||
if r != "" {
|
||||
if contentExists {
|
||||
content = fmt.Sprint(r, i.StringContent)
|
||||
}
|
||||
|
||||
if i.FunctionCall != nil {
|
||||
marshalAnyRole(i.FunctionCall)
|
||||
}
|
||||
if i.ToolCalls != nil {
|
||||
marshalAnyRole(i.ToolCalls)
|
||||
}
|
||||
} else {
|
||||
if contentExists {
|
||||
content = fmt.Sprint(i.StringContent)
|
||||
}
|
||||
if i.FunctionCall != nil {
|
||||
marshalAny(i.FunctionCall)
|
||||
}
|
||||
if i.ToolCalls != nil {
|
||||
marshalAny(i.ToolCalls)
|
||||
}
|
||||
}
|
||||
// Special Handling: System. We care if it was printed at all, not the r branch, so check seperately
|
||||
if contentExists && role == "system" {
|
||||
suppressConfigSystemPrompt = true
|
||||
}
|
||||
}
|
||||
|
||||
mess = append(mess, content)
|
||||
}
|
||||
|
||||
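A sketch of how a per-message template consumes the ChatMessageTemplateData fields listed above, using Go's text/template as the model layer does; the template text itself is an invented example, not one of LocalAI's shipped templates:

package main

import (
	"os"
	"text/template"
)

type ChatMessageTemplateData struct {
	SystemPrompt string
	Role         string
	RoleName     string
	Content      string
	LastMessage  bool
	MessageIndex int
}

func main() {
	// Hypothetical ChatML-style per-message template.
	tmpl := template.Must(template.New("chatMessage").Parse(
		"<|im_start|>{{.RoleName}}\n{{.Content}}<|im_end|>\n"))

	data := ChatMessageTemplateData{
		RoleName: "user",
		Content:  "Can you count up to five?",
	}
	_ = tmpl.Execute(os.Stdout, data) // each rendered message is appended to `mess`
}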
		predInput = strings.Join(mess, "\n")
		log.Debug().Msgf("Prompt (before templating): %s", predInput)

		if toStream {
			log.Debug().Msgf("Stream request received")
			c.Context().SetContentType("text/event-stream")
			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
			// c.Set("Content-Type", "text/event-stream")
			//
			c.Set("Cache-Control", "no-cache")
			c.Set("Connection", "keep-alive")
			c.Set("Transfer-Encoding", "chunked")
		}

		templateFile := ""

		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
		if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
			templateFile = config.Model
		}

		if config.TemplateConfig.Chat != "" && !processFunctions {
			templateFile = config.TemplateConfig.Chat
		}

		if config.TemplateConfig.Functions != "" && processFunctions {
			templateFile = config.TemplateConfig.Functions
		}

		if templateFile != "" {
			templatedInput, err := ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
				SystemPrompt:         config.SystemPrompt,
				SuppressSystemPrompt: suppressConfigSystemPrompt,
				Input:                predInput,
				Functions:            funcs,
			})
			if err == nil {
				predInput = templatedInput
				log.Debug().Msgf("Template found, input modified to: %s", predInput)
			} else {
				log.Debug().Msgf("Template failed loading: %s", err.Error())
			}
		}

		log.Debug().Msgf("Prompt (after templating): %s", predInput)
		if processFunctions {
			log.Debug().Msgf("Grammar: %+v", config.Grammar)
		}
		switch {
		case toStream:
			responses := make(chan schema.OpenAIResponse)

			if !processFunctions {
				go process(predInput, input, config, ml, responses)
			} else {
				go processTools(noActionName, predInput, input, config, ml, responses)
			}

			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
				usage := &schema.OpenAIUsage{}
				toolsCalled := false
				for ev := range responses {
					usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
					if len(ev.Choices[0].Delta.ToolCalls) > 0 {
				for ev := range tokenChannel {
					if ev.Error != nil {
						log.Debug().Err(ev.Error).Msg("chat streaming responseChannel error")
						request.Cancel()
						break
					}
					usage = &ev.Value.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it

					if len(ev.Value.Choices[0].Delta.ToolCalls) > 0 {
						toolsCalled = true
					}
					var buf bytes.Buffer
					enc := json.NewEncoder(&buf)
					enc.Encode(ev)
					log.Debug().Msgf("Sending chunk: %s", buf.String())
					if ev.Error != nil {
						log.Debug().Err(ev.Error).Msg("[ChatEndpoint] error to debug during tokenChannel handler")
						enc.Encode(ev.Error)
					} else {
						enc.Encode(ev.Value)
					}
					log.Debug().Msgf("chat streaming sending chunk: %s", buf.String())
					_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
					if err != nil {
						log.Debug().Msgf("Sending chunk failed: %v", err)
						input.Cancel()
						log.Debug().Err(err).Msgf("Sending chunk failed")
						request.Cancel()
						break
					}
					err = w.Flush()
					if err != nil {
						log.Debug().Msg("error while flushing, closing connection")
						request.Cancel()
						break
					}
					w.Flush()
				}

				finishReason := "stop"
				if toolsCalled {
					finishReason = "tool_calls"
				} else if toolsCalled && len(input.Tools) == 0 {
				} else if toolsCalled && len(request.Tools) == 0 {
					finishReason = "function_call"
				}

				resp := &schema.OpenAIResponse{
					ID:      id,
					Created: created,
					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
					ID:      traceID.ID,
					Created: traceID.Created,
					Model:   request.Model, // we have to return what the user sent here, due to OpenAI spec.
					Choices: []schema.Choice{
						{
							FinishReason: finishReason,
							Index:        0,
							Delta:        &schema.Message{Content: &emptyMessage},
							Delta:        &schema.Message{Content: ""},
						}},
					Object: "chat.completion.chunk",
					Usage:  *usage,
@@ -437,202 +105,21 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
				w.WriteString("data: [DONE]\n\n")
				w.Flush()
			}))

			return nil
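The wire format this stream writer emits, isolated as a runnable sketch: each JSON chunk on its own "data:" line, flushed per chunk, a final chunk carrying the finish_reason, then the literal [DONE] sentinel that OpenAI clients wait for (the JSON payloads below are abbreviated examples):

package main

import (
	"bufio"
	"fmt"
	"os"
)

func main() {
	w := bufio.NewWriter(os.Stdout)
	defer w.Flush()

	chunks := []string{
		`{"object":"chat.completion.chunk","choices":[{"delta":{"content":"Hi"}}]}`,
		`{"object":"chat.completion.chunk","choices":[{"finish_reason":"stop","delta":{"content":""}}]}`,
	}
	for _, c := range chunks {
		fmt.Fprintf(w, "data: %v\n", c) // mirrors the handler's Fprintf
		w.Flush()                       // flush per chunk so the client sees it immediately
	}
	w.WriteString("data: [DONE]\n\n") // sentinel that terminates the stream
}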
		// no streaming mode
		default:
			result, tokenUsage, err := ComputeChoices(input, predInput, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
				if !processFunctions {
					// no function is called, just reply and use stop as finish reason
					*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
					return
				}

				results := parseFunctionCall(s, config.FunctionsConfig.ParallelCalls)
				noActionsToRun := len(results) > 0 && results[0].name == noActionName

				switch {
				case noActionsToRun:
					result, err := handleQuestion(config, input, ml, startupOptions, results[0].arguments, predInput)
					if err != nil {
						log.Error().Err(err).Msg("error handling question")
						return
					}
					*c = append(*c, schema.Choice{
						Message: &schema.Message{Role: "assistant", Content: &result}})
				default:
					toolChoice := schema.Choice{
						Message: &schema.Message{
							Role: "assistant",
						},
					}

					if len(input.Tools) > 0 {
						toolChoice.FinishReason = "tool_calls"
					}

					for _, ss := range results {
						name, args := ss.name, ss.arguments
						if len(input.Tools) > 0 {
							// If we are using tools, we condense the function calls into
							// a single response choice with all the tools
							toolChoice.Message.ToolCalls = append(toolChoice.Message.ToolCalls,
								schema.ToolCall{
									ID:   id,
									Type: "function",
									FunctionCall: schema.FunctionCall{
										Name:      name,
										Arguments: args,
									},
								},
							)
						} else {
							// otherwise we return more choices directly
							*c = append(*c, schema.Choice{
								FinishReason: "function_call",
								Message: &schema.Message{
									Role: "assistant",
									FunctionCall: map[string]interface{}{
										"name":      name,
										"arguments": args,
									},
								},
							})
						}
					}

					if len(input.Tools) > 0 {
						// we need to append our result if we are using tools
						*c = append(*c, toolChoice)
					}
				}

			}, nil)
			if err != nil {
				return err
			}

			resp := &schema.OpenAIResponse{
				ID:      id,
				Created: created,
				Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
				Choices: result,
				Object:  "chat.completion",
				Usage: schema.OpenAIUsage{
					PromptTokens:     tokenUsage.Prompt,
					CompletionTokens: tokenUsage.Completion,
					TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
				},
			}
			respData, _ := json.Marshal(resp)
			log.Debug().Msgf("Response: %s", respData)

			// Return the prediction in the response body
			return c.JSON(resp)
		}

		// TODO is this proper to have exclusive from Stream, or do we need to issue both responses?
		rawResponse := <-finalResultChannel

		if rawResponse.Error != nil {
			return rawResponse.Error
		}

		jsonResult, _ := json.Marshal(rawResponse.Value)
		log.Debug().Str("jsonResult", string(jsonResult)).Msg("Chat Final Response")

		// Return the prediction in the response body
		return c.JSON(rawResponse.Value)
	}
}
func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, args, prompt string) (string, error) {
	log.Debug().Msgf("nothing to do, computing a reply")

	// If there is a message that the LLM already sends as part of the JSON reply, use it
	arguments := map[string]interface{}{}
	json.Unmarshal([]byte(args), &arguments)
	m, exists := arguments["message"]
	if exists {
		switch message := m.(type) {
		case string:
			if message != "" {
				log.Debug().Msgf("Reply received from LLM: %s", message)
				message = backend.Finetune(*config, prompt, message)
				log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)

				return message, nil
			}
		}
	}

	log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
	// Otherwise ask the LLM to understand the JSON output and the context, and return a message
	// Note: This costs (in term of CPU/GPU) another computation
	config.Grammar = ""
	images := []string{}
	for _, m := range input.Messages {
		images = append(images, m.StringImages...)
	}

	predFunc, err := backend.ModelInference(input.Context, prompt, images, ml, *config, o, nil)
	if err != nil {
		log.Error().Err(err).Msg("model inference failed")
		return "", err
	}

	prediction, err := predFunc()
	if err != nil {
		log.Error().Err(err).Msg("prediction failed")
		return "", err
	}
	return backend.Finetune(*config, prompt, prediction.Response), nil
}

type funcCallResults struct {
	name      string
	arguments string
}
func parseFunctionCall(llmresult string, multipleResults bool) []funcCallResults {
	results := []funcCallResults{}

	// TODO: use generics to avoid this code duplication
	if multipleResults {
		ss := []map[string]interface{}{}
		s := utils.EscapeNewLines(llmresult)
		json.Unmarshal([]byte(s), &ss)
		log.Debug().Msgf("Function return: %s %+v", s, ss)

		for _, s := range ss {
			func_name, ok := s["function"]
			if !ok {
				continue
			}
			args, ok := s["arguments"]
			if !ok {
				continue
			}
			d, _ := json.Marshal(args)
			funcName, ok := func_name.(string)
			if !ok {
				continue
			}
			results = append(results, funcCallResults{name: funcName, arguments: string(d)})
		}
	} else {
		// As we have to change the result before processing, we can't stream the answer token-by-token (yet?)
		ss := map[string]interface{}{}
		// This prevent newlines to break JSON parsing for clients
		s := utils.EscapeNewLines(llmresult)
		json.Unmarshal([]byte(s), &ss)
		log.Debug().Msgf("Function return: %s %+v", s, ss)

		// The grammar defines the function name as "function", while OpenAI returns "name"
		func_name, ok := ss["function"]
		if !ok {
			return results
		}
		// Similarly, while here arguments is a map[string]interface{}, OpenAI actually want a stringified object
		args, ok := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
		if !ok {
			return results
		}
		d, _ := json.Marshal(args)
		funcName, ok := func_name.(string)
		if !ok {
			return results
		}
		results = append(results, funcCallResults{name: funcName, arguments: string(d)})
	}

	return results
}
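What parseFunctionCall is doing, shown end to end: the grammar-constrained model emits {"function": ..., "arguments": {...}}, and the endpoint re-marshals "arguments" because the OpenAI API wants it as a JSON string. A trimmed, runnable single-result version (the sample payload mirrors the weather-tool tests above, including their "celcius" spelling):

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	llmresult := `{"function":"get_current_weather","arguments":{"location":"San Francisco","unit":"celcius"}}`

	ss := map[string]interface{}{}
	_ = json.Unmarshal([]byte(llmresult), &ss)

	name, _ := ss["function"].(string)
	d, _ := json.Marshal(ss["arguments"]) // stringify the object for the OpenAI response

	fmt.Println(name)      // get_current_weather
	fmt.Println(string(d)) // {"location":"San Francisco","unit":"celcius"}
}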