fix(llama.cpp): fix eos without cache

fix(whisper.cpp): Add stubs and -lcuda
deps(whisper.cpp): update, fix cublas build
2026-05-21 15:15:40 -04:00 · 2024-03-18 12:14:16 +01:00 · 2024-03-18 12:13:39 +01:00 · 2024-03-16 10:38:57 +01:00 · 2024-03-15 23:51:03 +00:00 · 2024-03-15 18:14:23 +01:00
295 changed files with 11391 additions and 9289 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,3 +3,4 @@ models
 examples/chatbot-ui/models
 examples/rwkv/models
 examples/**/models
+Dockerfile
--- a/.env
+++ b/.env
@@ -18,7 +18,7 @@

 ## Default path for models
 #
-MODELS_PATH=/models
+# MODELS_PATH=/models

 ## Enable debug mode
 # DEBUG=true
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -2,9 +2,7 @@
 name: Bug report
 about: Create a report to help us improve
 title: ''
-labels: bug
-assignees: mudler
-
+labels: bug, unconfirmed, up-for-grabs
 ---

 <!-- Thanks for helping us to improve LocalAI! We welcome all bug reports. Please fill out each area of the template so we can better help you. Comments like this will be hidden when you post but you can delete them if you wish. -->
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -2,9 +2,7 @@
 name: Feature request
 about: Suggest an idea for this project
 title: ''
-labels: enhancement
-assignees: mudler
-
+labels: enhancement, up-for-grabs
 ---

 <!-- Thanks for helping us to improve LocalAI! We welcome all feature requests. Please fill out each area of the template so we can better help you. Comments like this will be hidden when you post but you can delete them if you wish. -->
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -0,0 +1,116 @@
+---
+name: 'build container images tests'
+
+on:
+  pull_request:
+
+concurrency:
+  group: ci-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  extras-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      # Pushing with all jobs in parallel
+      # eats the bandwidth of all the nodes
+      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f16-ffmpeg'  # leading '-' as in image.yml
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+  core-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f16-ffmpeg-core'  # leading '-' as in image.yml
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -2,7 +2,6 @@
 name: 'build container images'

 on:
-  pull_request:
  push:
    branches:
      - master
@@ -14,7 +13,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  extras-image-build:
+  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
@@ -26,6 +25,7 @@ jobs:
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -37,6 +37,7 @@ jobs:
      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
      matrix:
        include:
+          # Extra images
          - build-type: ''
            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
@@ -45,6 +46,7 @@ jobs:
            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
          - build-type: ''
            platforms: 'linux/amd64'
            tag-latest: 'false'
@@ -52,6 +54,7 @@ jobs:
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -61,6 +64,7 @@ jobs:
            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -70,6 +74,7 @@ jobs:
            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -79,6 +84,7 @@ jobs:
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -88,6 +94,7 @@ jobs:
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
          - build-type: ''
            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
@@ -95,7 +102,90 @@ jobs:
            tag-suffix: ''
            ffmpeg: ''
            image-type: 'extras'
+            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            runs-on: 'arc-runner-set'
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f16-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f32'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f32-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+          # Core images
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f16-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f32'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f32-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f16-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f32'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f32-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            runs-on: 'arc-runner-set'
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            runs-on: 'arc-runner-set'
+  
  core-image-build:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -108,6 +198,7 @@ jobs:
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -122,6 +213,7 @@ jobs:
            tag-suffix: '-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
+            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
          - build-type: 'cublas'
            cuda-major-version: "11"
@@ -131,6 +223,7 @@ jobs:
            tag-suffix: '-cublas-cuda11-core'
            ffmpeg: ''
            image-type: 'core'
+            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
          - build-type: 'cublas'
            cuda-major-version: "12"
@@ -140,6 +233,7 @@ jobs:
            tag-suffix: '-cublas-cuda12-core'
            ffmpeg: ''
            image-type: 'core'
+            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
          - build-type: 'cublas'
            cuda-major-version: "11"
@@ -150,6 +244,7 @@ jobs:
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
@@ -159,3 +254,4 @@ jobs:
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -4,6 +4,11 @@ name: 'build container images (reusable)'
 on:
  workflow_call:
    inputs:
+      base-image:
+        description: 'Base image for the build (forwarded to the Dockerfile as BASE_IMAGE)'
+        required: false
+        default: 'ubuntu:22.04'  # same as the Dockerfile ARG default; '' would blank out FROM
+        type: string
      build-type:
        description: 'Build type'
        default: ''
@@ -64,42 +69,47 @@ jobs:
          && sudo apt-get install -y git
      - name: Checkout
        uses: actions/checkout@v4
-      # - name: Release space from worker
-      #   run: |
-      #     echo "Listing top largest packages"
-      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-      #     head -n 30 <<< "${pkgs}"
-      #     echo
-      #     df -h
-      #     echo
-      #     sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-      #     sudo apt-get remove --auto-remove android-sdk-platform-tools || true
-      #     sudo apt-get purge --auto-remove android-sdk-platform-tools || true
-      #     sudo rm -rf /usr/local/lib/android
-      #     sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
-      #     sudo rm -rf /usr/share/dotnet
-      #     sudo apt-get remove -y '^mono-.*' || true
-      #     sudo apt-get remove -y '^ghc-.*' || true
-      #     sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
-      #     sudo apt-get remove -y 'php.*' || true
-      #     sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
-      #     sudo apt-get remove -y '^google-.*' || true
-      #     sudo apt-get remove -y azure-cli || true
-      #     sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
-      #     sudo apt-get remove -y '^gfortran-.*' || true
-      #     sudo apt-get remove -y microsoft-edge-stable || true
-      #     sudo apt-get remove -y firefox || true
-      #     sudo apt-get remove -y powershell || true
-      #     sudo apt-get remove -y r-base-core || true
-      #     sudo apt-get autoremove -y
-      #     sudo apt-get clean
-      #     echo
-      #     echo "Listing top largest packages"
-      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-      #     head -n 30 <<< "${pkgs}"
-      #     echo
-      #     sudo rm -rfv build || true
-      #     df -h
+      - name: Release space from worker
+        if: inputs.runs-on == 'ubuntu-latest'
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get remove -y microsoft-edge-stable || true
+          sudo apt-get remove -y firefox || true
+          sudo apt-get remove -y powershell || true
+          sudo apt-get remove -y r-base-core || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf "/usr/local/share/boost" || true
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+          df -h
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
@@ -149,6 +159,7 @@ jobs:
            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
            FFMPEG=${{ inputs.ffmpeg }}
            IMAGE_TYPE=${{ inputs.image-type }}
+            BASE_IMAGE=${{ inputs.base-image }}
          context: .
          file: ./Dockerfile
          platforms: ${{ inputs.platforms }}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -20,6 +20,10 @@ jobs:
            defines: '-DLLAMA_AVX2=OFF'
          - build: 'avx512'
            defines: '-DLLAMA_AVX512=ON'
+          - build: 'cuda12'
+            defines: ''
+          - build: 'cuda11'
+            defines: ''
    runs-on: ubuntu-latest
    steps:
      - name: Clone
@@ -33,19 +37,47 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-
+      - name: Install CUDA Dependencies
+        if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
+        run: |
+          if [ "${{ matrix.build }}" == "cuda12" ]; then
+            export CUDA_VERSION=12-3
+          else
+            export CUDA_VERSION=11-7
+          fi
+          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v3
+        with:
+          path: grpc
+          key: ${{ runner.os }}-grpc-v1.58.0  # bump together with the -b tag in "Build grpc"
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-                -DgRPC_BUILD_TESTS=OFF \
-                ../.. && sudo make -j12 install
-
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make -j12
+      - name: Install gRPC
+        run: |
+          cd grpc && cd cmake/build && sudo make -j12 install
      - name: Build
        id: build
        env:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
-          STATIC=true make dist
+          if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
+            export BUILD_TYPE=cublas
+            export PATH=/usr/local/cuda/bin:$PATH
+            make dist
+          else
+            STATIC=true make dist
+          fi
      - uses: actions/upload-artifact@v3
        with:
          name: ${{ matrix.build }}
@@ -57,6 +89,35 @@ jobs:
          files: |
            release/*

+  build-stablediffusion:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
+      - name: Dependencies
+        run: |
+          sudo apt-get update && sudo apt-get install -y --no-install-recommends libopencv-dev
+          sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+      - name: Build stablediffusion
+        run: |
+          make backend-assets/grpc/stablediffusion
+          mkdir -p release && cp backend-assets/grpc/stablediffusion release
+      - uses: actions/upload-artifact@v3
+        with:
+          name: stablediffusion
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v1
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+
  build-macOS:
    strategy:
      matrix:
@@ -97,4 +158,4 @@ jobs:
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
-            release/*
+            release/*
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -164,34 +164,74 @@ jobs:

           

-  tests-bark:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+  # tests-bark:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Release space from worker
+  #       run: |
+  #           echo "Listing top largest packages"
+  #           pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+  #           head -n 30 <<< "${pkgs}"
+  #           echo
+  #           df -h
+  #           echo
+  #           sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+  #           sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+  #           sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+  #           sudo rm -rf /usr/local/lib/android
+  #           sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+  #           sudo rm -rf /usr/share/dotnet
+  #           sudo apt-get remove -y '^mono-.*' || true
+  #           sudo apt-get remove -y '^ghc-.*' || true
+  #           sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+  #           sudo apt-get remove -y 'php.*' || true
+  #           sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+  #           sudo apt-get remove -y '^google-.*' || true
+  #           sudo apt-get remove -y azure-cli || true
+  #           sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+  #           sudo apt-get remove -y '^gfortran-.*' || true
+  #           sudo apt-get remove -y microsoft-edge-stable || true
+  #           sudo apt-get remove -y firefox || true
+  #           sudo apt-get remove -y powershell || true
+  #           sudo apt-get remove -y r-base-core || true
+  #           sudo apt-get autoremove -y
+  #           sudo apt-get clean
+  #           echo
+  #           echo "Listing top largest packages"
+  #           pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+  #           head -n 30 <<< "${pkgs}"
+  #           echo
+  #           sudo rm -rfv build || true
+  #           sudo rm -rf /usr/share/dotnet || true
+  #           sudo rm -rf /opt/ghc || true
+  #           sudo rm -rf "/usr/local/share/boost" || true
+  #           sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+  #           df -h
+  #     - name: Clone
+  #       uses: actions/checkout@v4
+  #       with: 
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential ffmpeg
+  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+  #            sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+  #             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+  #            sudo apt-get update && \
+  #            sudo apt-get install -y conda
+  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
          
-          sudo rm -rfv /usr/bin/conda || true
+  #         sudo rm -rfv /usr/bin/conda || true

-      - name: Test bark
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/bark
-           make -C backend/python/bark test
+  #     - name: Test bark
+  #       run: |
+  #          export PATH=$PATH:/opt/conda/bin
+  #          make -C backend/python/bark
+  #          make -C backend/python/bark test

           
  # Below tests needs GPU. Commented out for now
@@ -274,4 +314,4 @@ jobs:
        run: |
           export PATH=$PATH:/opt/conda/bin
           make -C backend/python/coqui
-           make -C backend/python/coqui test
+           make -C backend/python/coqui test
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -86,11 +86,22 @@ jobs:
          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
-
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v3
+        with:
+          path: grpc
+          key: ${{ runner.os }}-grpc-v1.58.0  # bump together with the -b tag in "Build grpc"
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-                -DgRPC_BUILD_TESTS=OFF \
-                ../.. && sudo make -j12 install
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make -j12
+      - name: Install gRPC
+        run: |
+          cd grpc && cd cmake/build && sudo make -j12 install
      - name: Test
        run: |
          GO_TAGS="stablediffusion tts" make test
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ local-ai
 !charts/*
 # prevent above rules from omitting the api/localai folder
 !api/localai
+!core/**/localai

 # Ignore models
 models/*
@@ -34,6 +35,7 @@ release/
 .idea

 # Generated during build
-backend-assets/
+backend-assets/*
+!backend-assets/.keep
 prepare
 /ggml-metal.metal
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "docs/themes/hugo-theme-relearn"]
 	path = docs/themes/hugo-theme-relearn
 	url = https://github.com/McShelby/hugo-theme-relearn.git
+[submodule "docs/themes/lotusdocs"]
+	path = docs/themes/lotusdocs
+	url = https://github.com/colinwilson/lotusdocs
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,12 @@
-ARG GO_VERSION=1.21-bullseye
 ARG IMAGE_TYPE=extras
+ARG BASE_IMAGE=ubuntu:22.04
+
 # extras or core
+FROM ${BASE_IMAGE} as requirements-core

+USER root

-FROM golang:$GO_VERSION as requirements-core
-
+ARG GO_VERSION=1.21.7
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
 ARG CUDA_MINOR_VERSION=7
@@ -12,15 +14,17 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
+ENV DEBIAN_FRONTEND=noninteractive
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"

-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
-
-ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tinydream tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
+    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean

+# Install Go
+RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
+ENV PATH $PATH:/usr/local/go/bin

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -32,15 +36,19 @@ RUN echo "Target Variant: $TARGETVARIANT"
 # CuBLAS requirements
 RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    apt-get install -y software-properties-common && \
-    apt-add-repository contrib && \
-    curl -O https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
-    dpkg -i cuda-keyring_1.0-1_all.deb && \
-    rm -f cuda-keyring_1.0-1_all.deb && \
+    curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
+    dpkg -i cuda-keyring_1.1-1_all.deb && \
+    rm -f cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
    ; fi
+
+# Cuda
 ENV PATH /usr/local/cuda/bin:${PATH}

+# HipBLAS requirements
+ENV PATH /opt/rocm/bin:${PATH}
+
 # OpenBLAS requirements and stable diffusion
 RUN apt-get install -y \
    libopenblas-dev \
@@ -64,12 +72,18 @@ RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmo
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
    apt-get update && \
-    apt-get install -y conda
+    apt-get install -y conda && apt-get clean

 ENV PATH="/root/.cargo/bin:${PATH}"
+RUN apt-get install -y python3-pip && apt-get clean
 RUN pip install --upgrade pip
+
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN apt-get install -y espeak-ng espeak
+RUN apt-get install -y espeak-ng espeak && apt-get clean
+
+RUN if [ ! -e /usr/bin/python ]; then \
+	  ln -s /usr/bin/python3 /usr/bin/python \
+    ; fi

 ###################################
 ###################################
@@ -91,6 +105,13 @@ COPY . .
 COPY .git .
 RUN make prepare

+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
+    apt-get update && \
+    apt-get install -y libclblast-dev && \
+    apt-get clean \
+    ; fi
+
 # stablediffusion does not tolerate a newer version of abseil, build it first
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

@@ -127,10 +148,18 @@ ARG CUDA_MAJOR_VERSION=11
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
+ENV PIP_CACHE_PURGE=true

 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
-    apt-get install -y ffmpeg \
+    apt-get install -y ffmpeg && apt-get clean \
+    ; fi
+
+# Add OpenCL
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
+    apt-get update && \
+    apt-get install -y libclblast1 && \
+    apt-get clean \
    ; fi

 WORKDIR /build
@@ -157,41 +186,47 @@ COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/

 ## Duplicated from Makefile to avoid having a big layer that's hard to push
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
+	 make -C backend/python/autogptq \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
+	 make -C backend/python/bark \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
+	 make -C backend/python/diffusers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
+	 make -C backend/python/vllm \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
+	 make -C backend/python/mamba \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
+	 make -C backend/python/sentencetransformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
+	 make -C backend/python/transformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
+	 make -C backend/python/vall-e-x \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
+	 make -C backend/python/exllama \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
+     make -C backend/python/exllama2 \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
+	 make -C backend/python/petals \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/coqui \
+	 make -C backend/python/transformers-musicgen \
    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	 make -C backend/python/coqui \
+    ; fi
+
+# Make sure the models directory exists
+RUN mkdir -p /build/models

 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Ettore Di Giacinto
+Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/Makefile
+++ b/Makefile
@@ -4,34 +4,31 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
+GOLLAMA_VERSION?=6a8041ef6b46d4712afc3ae791d1c2d73da0ad1c

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

-CPPLLAMA_VERSION?=cb1e2818e0e12ec99f7236ec5d4f3ffd8bcc2f4a
+CPPLLAMA_VERSION?=4755afd1cbd40d93c017e5b98c39796f52345314

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8

-# go-ggml-transformers version
-GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a
-
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
-RWKV_VERSION?=633c5a3485c403cb2520693dc0991a25dace9f0f
+RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
+WHISPER_CPP_VERSION?=a56f435fd475afd7edf02bfbf9f8c77f527198c2

 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

 # go-piper version
-PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07
+PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759

 # stablediffusion version
-STABLEDIFFUSION_VERSION?=902db5f066fd137697e3b69d0fa10d4782bd2c2f
+STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485

 # tinydream version
 TINYDREAM_VERSION?=772a9c0d9aaf768290e63cca3c904fe69faf677a
@@ -47,6 +44,8 @@ BUILD_ID?=git

 TEST_DIR=/tmp/test

+TEST_FLAKES?=5
+
 RANDOM := $(shell bash -c 'echo $$RANDOM')

 VERSION?=$(shell git describe --always --tags || echo "dev" )
@@ -92,14 +91,19 @@ ifeq ($(BUILD_TYPE),openblas)
 	export WHISPER_OPENBLAS=1
 endif

+
 ifeq ($(BUILD_TYPE),cublas)
-	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
+	CGO_LDFLAGS+=-lcublas -lcudart -lculibos -lcublasLt -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
+# required by whisper.cpp
 	export WHISPER_CUBLAS=1
+	CGO_LDFLAGS+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
 endif

 ifeq ($(BUILD_TYPE),hipblas)
 	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
@@ -108,7 +112,7 @@ ifeq ($(BUILD_TYPE),hipblas)
 	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
-	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link
+	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif

 ifeq ($(BUILD_TYPE),metal)
@@ -140,19 +144,33 @@ endif
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
-	PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/sources/go-piper/piper/src/cpp -I$(shell pwd)/sources/go-piper/piper/build/fi/include -I$(shell pwd)/sources/go-piper/piper/build/pi/include -I$(shell pwd)/sources/go-piper/piper/build/si/include
- 	PIPER_CGO_LDFLAGS+=-L$(shell pwd)/sources/go-piper/piper/build/fi/lib -L$(shell pwd)/sources/go-piper/piper/build/pi/lib -L$(shell pwd)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
+	PIPER_CGO_CXXFLAGS+=-I$(CURDIR)/sources/go-piper/piper/src/cpp -I$(CURDIR)/sources/go-piper/piper/build/fi/include -I$(CURDIR)/sources/go-piper/piper/build/pi/include -I$(CURDIR)/sources/go-piper/piper/build/si/include
+	PIPER_CGO_LDFLAGS+=-L$(CURDIR)/sources/go-piper/piper/build/fi/lib -L$(CURDIR)/sources/go-piper/piper/build/pi/lib -L$(CURDIR)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-ggml backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
+ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
+ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
+ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
+ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
+ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
+
 GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
+TEST_PATHS?=./api/... ./pkg/... ./core/...

 # If empty, then we build all
 ifeq ($(GRPC_BACKENDS),)
 	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
 endif

+ifeq ($(BUILD_API_ONLY),true)
+	GRPC_BACKENDS=
+endif
+
 .PHONY: all test build vendor

 all: help
@@ -213,14 +231,6 @@ backend-assets/espeak-ng-data: sources/go-piper
 sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

-## CEREBRAS GPT
-sources/go-ggml-transformers:
-	git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp sources/go-ggml-transformers
-	cd sources/go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-ggml-transformers/libtransformers.a: sources/go-ggml-transformers
-	$(MAKE) -C sources/go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a
-
 sources/whisper.cpp:
 	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
 	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
@@ -246,21 +256,20 @@ sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main

 backend/cpp/llama/llama.cpp:
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp	
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp

-get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/go-ggml-transformers sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
+get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
 	touch $@

 replace:
-	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/sources/go-ggml-transformers
-	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/sources/go-rwkv
-	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/sources/whisper.cpp
-	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(shell pwd)/sources/whisper.cpp/bindings/go
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/sources/go-bert
-	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/sources/go-stable-diffusion
-	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(shell pwd)/sources/go-tiny-dream
-	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/sources/go-piper
+	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
+	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
+	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
+	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
+	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
+	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -272,7 +281,6 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C sources/go-llama clean
 	$(MAKE) -C sources/go-llama-ggml clean
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
-	$(MAKE) -C sources/go-ggml-transformers clean
 	$(MAKE) -C sources/go-rwkv clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
@@ -290,19 +298,17 @@ clean: ## Remove build related file
 	rm -rf ./sources
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
-	rm -rf ./backend/cpp/grpc/grpc_repo
-	rm -rf ./backend/cpp/grpc/build
-	rm -rf ./backend/cpp/grpc/installed_packages
+	rm -rf backend-assets
+	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/cpp/llama clean

 ## Build:

-build: grpcs prepare ## Build the project
+build: backend-assets grpcs prepare ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
 	$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
-
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

 dist: build
@@ -319,7 +325,7 @@ run: prepare ## run local-ai
 test-models/testmodel:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
+	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -328,7 +334,7 @@ test-models/testmodel:
 	cp tests/models_fixtures/* test-models

 prepare-test: grpcs
-	cp -rf backend-assets api
+	cp -rf backend-assets core/http
 	cp tests/models_fixtures/* test-models

 test: prepare test-models/testmodel grpcs
@@ -336,7 +342,7 @@ test: prepare test-models/testmodel grpcs
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts 5 --fail-fast -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
@@ -365,23 +371,23 @@ teardown-e2e:

 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)

 test-stablediffusion: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)

 test-container:
 	docker build --target requirements -t local-ai-test-container .
@@ -417,6 +423,7 @@ protogen-python:
 	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
 	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
 	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/mamba/ --grpc_python_out=backend/python/mamba/ backend/backend.proto
 	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto

 ## GRPC
@@ -427,6 +434,7 @@ prepare-extra-conda-environments:
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
 	$(MAKE) -C backend/python/vllm
+	$(MAKE) -C backend/python/mamba
 	$(MAKE) -C backend/python/sentencetransformers
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/transformers-musicgen
@@ -443,17 +451,20 @@ test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
 	$(MAKE) -C backend/python/diffusers test

+backend-assets:
+	mkdir -p backend-assets
+ifeq ($(BUILD_API_ONLY),true)
+	touch backend-assets/keep
+endif
+
 backend-assets/grpc:
 	mkdir -p backend-assets/grpc

 backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/sources/go-llama
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-llama LIBRARY_PATH=$(shell pwd)/sources/go-llama \
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
 # TODO: every binary should have its own folder instead, so can have different  implementations
-ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
-endif

 ## BACKEND CPP LLAMA START
 # Sets the variables in case it has to build the gRPC locally.
@@ -467,72 +478,40 @@ ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \

 backend/cpp/llama/grpc-server:
 ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
-	backend/cpp/grpc/script/build_grpc.sh ${INSTALLED_PACKAGES}
+	$(MAKE) -C backend/cpp/grpc build
 	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
 	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
-	export PATH=${PATH}:${INSTALLED_PACKAGES}/bin && \
+	export PATH="${INSTALLED_PACKAGES}/bin:${PATH}" && \
 	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
 else
 	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server			
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
 endif
 ## BACKEND CPP LLAMA END
-		
+
 ##
 backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
 	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
+	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif

 backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/sources/go-llama-ggml
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-llama-ggml LIBRARY_PATH=$(shell pwd)/sources/go-llama-ggml \
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/

 backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang/ \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/

-backend-assets/grpc/dolly: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./backend/go/llm/dolly/
-
-backend-assets/grpc/gpt2: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./backend/go/llm/gpt2/
-
-backend-assets/grpc/gptj: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./backend/go/llm/gptj/
-
-backend-assets/grpc/gptneox: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./backend/go/llm/gptneox/
-
-backend-assets/grpc/mpt: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./backend/go/llm/mpt/
-
-backend-assets/grpc/replit: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./backend/go/llm/replit/
-
-backend-assets/grpc/falcon-ggml: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./backend/go/llm/falcon-ggml/
-
-backend-assets/grpc/starcoder: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./backend/go/llm/starcoder/
-
 backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-rwkv LIBRARY_PATH=$(shell pwd)/sources/go-rwkv \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv

 backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-bert LIBRARY_PATH=$(shell pwd)/sources/go-bert \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/

 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
@@ -540,21 +519,48 @@ backend-assets/grpc/langchain-huggingface: backend-assets/grpc

 backend-assets/grpc/stablediffusion: backend-assets/grpc
 	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
+		$(MAKE) sources/go-stable-diffusion; \
 		$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
-		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/sources/go-stable-diffusion/ \
+		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-stable-diffusion/ LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
 		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion; \
 	fi

 backend-assets/grpc/tinydream: backend-assets/grpc sources/go-tiny-dream/libtinydream.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-tiny-dream \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-tiny-dream \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream

 backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
-	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/sources/go-piper \
+	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/

 backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/whisper.cpp LIBRARY_PATH=$(shell pwd)/sources/whisper.cpp \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/

 grpcs: prepare $(GRPC_BACKENDS)
+
+DOCKER_IMAGE?=local-ai
+IMAGE_TYPE?=core
+BASE_IMAGE?=ubuntu:22.04
+
+docker:
+	docker build \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="$(GO_TAGS)" \
+		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
+		-t $(DOCKER_IMAGE) .
+
+docker-image-intel:
+	docker build \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="none" \
+		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
+
+docker-image-intel-xpu:
+	docker build \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="none" \
+		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
--- a/README.md
+++ b/README.md
@@ -20,6 +20,9 @@
 </a>
 </p>

+[<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker">](https://hub.docker.com/r/localai/localai)
+[<img src="https://img.shields.io/badge/quay.io-images-important.svg?">](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
+
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
 > [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
@@ -40,21 +43,35 @@

 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

+- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726
+- Upload file API: https://github.com/mudler/LocalAI/pull/1703
+- Tools API support: https://github.com/mudler/LocalAI/pull/1715
+- LLaVa 1.6: https://github.com/mudler/LocalAI/pull/1714
+- ROCm container images: https://github.com/mudler/LocalAI/pull/1595
+- Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
+- Deprecation of old backends: https://github.com/mudler/LocalAI/issues/1651
+- Mamba support: https://github.com/mudler/LocalAI/pull/1589
 - Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
 - 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
-- Inline templates: https://github.com/mudler/LocalAI/pull/1452
-- Mixtral: https://github.com/mudler/LocalAI/pull/1449
 - Img2vid https://github.com/mudler/LocalAI/pull/1442
-- Musicgen https://github.com/mudler/LocalAI/pull/1387

 Hot topics (looking for contributors):
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
 - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
+- Assistant API: https://github.com/mudler/LocalAI/issues/1273
+- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
+- Vulkan: https://github.com/mudler/LocalAI/issues/1647

 If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

 ## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)

+For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide. For those in a hurry, here's a straightforward one-liner to launch a LocalAI instance with [phi-2](https://huggingface.co/microsoft/phi-2) using `docker`:
+
+```
+docker run -ti -p 8080:8080 localai/localai:v2.9.0-ffmpeg-core phi-2
+```
+
 ## 🚀 [Features](https://localai.io/features/)

 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -92,17 +109,20 @@ Other:
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
 - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
 - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
+  

 ### 🔗 Resources

-- 🆕 New! [LLM finetuning guide](https://localai.io/advanced/fine-tuning/)
+- 🆕 New! [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
 - [How to build locally](https://localai.io/basics/build/index.html)
 - [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
-- [Projects integrating LocalAI](https://localai.io/integrations/)
-- [How tos section](https://localai.io/howtos/) (curated by our community)
+- [Projects integrating LocalAI](https://localai.io/docs/integrations/)
+- [How tos section](https://io.midori-ai.xyz/howtos/) (curated by our community)

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

+- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/ai/answers/tiZMDoZzZV6TLxgDXNBnFE/deploying-helm-charts-on-aws-eks)
+- [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
 - [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
 - [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
 - [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
@@ -162,7 +182,6 @@ LocalAI couldn't have been built without the help of great software already avai
 - https://github.com/ggerganov/whisper.cpp
 - https://github.com/saharNooby/rwkv.cpp
 - https://github.com/rhasspy/piper
-- https://github.com/cmp-nct/ggllm.cpp

 ## 🤗 Contributors

--- a/SECURITY.md
+++ b/SECURITY.md
@@ -0,0 +1,42 @@
+# Security Policy
+
+## Introduction
+
+At LocalAI, we take the security of our software seriously. We understand the importance of protecting our community from vulnerabilities and are committed to ensuring the safety and security of our users.
+
+## Supported Versions
+
+We provide support and updates for certain versions of our software. The following table outlines which versions are currently supported with security updates:
+
+| Version | Supported          |
+| ------- | ------------------ |
+| >= 2.0  | :white_check_mark: |
+| < 2.0   | :x:                |
+
+Please ensure that you are using a supported version to receive the latest security updates.
+
+## Reporting a Vulnerability
+
+We encourage the responsible disclosure of any security vulnerabilities. If you believe you've found a security issue in our software, we kindly ask you to follow the steps below to report it to us:
+
+1. **Email Us:** Send an email to [security@localai.io](mailto:security@localai.io) with a detailed report. Please do not disclose the vulnerability publicly or to any third parties before it has been addressed by us.
+
+2. **Expect a Response:** We aim to acknowledge receipt of vulnerability reports within 48 hours. Our security team will review your report and work closely with you to understand the impact and ensure a thorough investigation.
+
+3. **Collaboration:** If the vulnerability is accepted, we will work with you and our community to address the issue promptly. We'll keep you informed throughout the resolution process and may request additional information or collaboration.
+
+4. **Disclosure:** Once the vulnerability has been resolved, we encourage a coordinated disclosure. We believe in transparency and will work with you to ensure that our community is informed in a responsible manner.
+
+## Use of Third-Party Platforms
+
+As a Free and Open Source Software (FOSS) organization, we do not offer monetary bounties. However, researchers who wish to report vulnerabilities can also do so via [Huntr](https://huntr.dev/bounties), a platform that recognizes contributions to open source security.
+
+## Contact
+
+For any security-related inquiries beyond vulnerability reporting, please contact us at [security@localai.io](mailto:security@localai.io).
+
+## Acknowledgments
+
+We appreciate the efforts of those who contribute to the security of our project. Your responsible disclosure is invaluable to the safety and integrity of LocalAI.
+
+Thank you for helping us keep LocalAI secure.
--- a/api/api.go
+++ b/api/api.go
@@ -1,302 +0,0 @@
-package api
-
-import (
-	"encoding/json"
-	"errors"
-	"fmt"
-	"os"
-	"path/filepath"
-	"strings"
-
-	config "github.com/go-skynet/LocalAI/api/config"
-	"github.com/go-skynet/LocalAI/api/localai"
-	"github.com/go-skynet/LocalAI/api/openai"
-	"github.com/go-skynet/LocalAI/api/options"
-	"github.com/go-skynet/LocalAI/api/schema"
-	"github.com/go-skynet/LocalAI/internal"
-	"github.com/go-skynet/LocalAI/metrics"
-	"github.com/go-skynet/LocalAI/pkg/assets"
-	"github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/utils"
-
-	"github.com/gofiber/fiber/v2"
-	"github.com/gofiber/fiber/v2/middleware/cors"
-	"github.com/gofiber/fiber/v2/middleware/logger"
-	"github.com/gofiber/fiber/v2/middleware/recover"
-	"github.com/rs/zerolog"
-	"github.com/rs/zerolog/log"
-)
-
-func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader, error) {
-	options := options.NewOptions(opts...)
-
-	zerolog.SetGlobalLevel(zerolog.InfoLevel)
-	if options.Debug {
-		zerolog.SetGlobalLevel(zerolog.DebugLevel)
-	}
-
-	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
-	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
-
-	modelPath := options.Loader.ModelPath
-	if len(options.ModelsURL) > 0 {
-		for _, url := range options.ModelsURL {
-			if utils.LooksLikeURL(url) {
-				// md5 of model name
-				md5Name := utils.MD5(url)
-
-				// check if file exists
-				if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
-					err := utils.DownloadFile(url, filepath.Join(modelPath, md5Name)+".yaml", "", func(fileName, current, total string, percent float64) {
-						utils.DisplayDownloadFunction(fileName, current, total, percent)
-					})
-					if err != nil {
-						log.Error().Msgf("error loading model: %s", err.Error())
-					}
-				}
-			}
-		}
-	}
-
-	cl := config.NewConfigLoader()
-	if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
-		log.Error().Msgf("error loading config files: %s", err.Error())
-	}
-
-	if options.ConfigFile != "" {
-		if err := cl.LoadConfigFile(options.ConfigFile); err != nil {
-			log.Error().Msgf("error loading config file: %s", err.Error())
-		}
-	}
-
-	if err := cl.Preload(options.Loader.ModelPath); err != nil {
-		log.Error().Msgf("error downloading models: %s", err.Error())
-	}
-
-	if options.PreloadJSONModels != "" {
-		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
-			return nil, nil, err
-		}
-	}
-
-	if options.PreloadModelsFromPath != "" {
-		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
-			return nil, nil, err
-		}
-	}
-
-	if options.Debug {
-		for _, v := range cl.ListConfigs() {
-			cfg, _ := cl.GetConfig(v)
-			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
-		}
-	}
-
-	if options.AssetsDestination != "" {
-		// Extract files from the embedded FS
-		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
-		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
-		if err != nil {
-			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
-		}
-	}
-
-	// turn off any process that was started by GRPC if the context is canceled
-	go func() {
-		<-options.Context.Done()
-		log.Debug().Msgf("Context canceled, shutting down")
-		options.Loader.StopAllGRPC()
-	}()
-
-	if options.WatchDog {
-		wd := model.NewWatchDog(
-			options.Loader,
-			options.WatchDogBusyTimeout,
-			options.WatchDogIdleTimeout,
-			options.WatchDogBusy,
-			options.WatchDogIdle)
-		options.Loader.SetWatchDog(wd)
-		go wd.Run()
-		go func() {
-			<-options.Context.Done()
-			log.Debug().Msgf("Context canceled, shutting down")
-			wd.Shutdown()
-		}()
-	}
-
-	return options, cl, nil
-}
-
-func App(opts ...options.AppOption) (*fiber.App, error) {
-
-	options, cl, err := Startup(opts...)
-	if err != nil {
-		return nil, fmt.Errorf("failed basic startup tasks with error %s", err.Error())
-	}
-
-	// Return errors as JSON responses
-	app := fiber.New(fiber.Config{
-		BodyLimit:             options.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
-		DisableStartupMessage: options.DisableMessage,
-		// Override default error handler
-		ErrorHandler: func(ctx *fiber.Ctx, err error) error {
-			// Status code defaults to 500
-			code := fiber.StatusInternalServerError
-
-			// Retrieve the custom status code if it's a *fiber.Error
-			var e *fiber.Error
-			if errors.As(err, &e) {
-				code = e.Code
-			}
-
-			// Send custom error page
-			return ctx.Status(code).JSON(
-				schema.ErrorResponse{
-					Error: &schema.APIError{Message: err.Error(), Code: code},
-				},
-			)
-		},
-	})
-
-	if options.Debug {
-		app.Use(logger.New(logger.Config{
-			Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
-		}))
-	}
-
-	// Default middleware config
-	app.Use(recover.New())
-	if options.Metrics != nil {
-		app.Use(metrics.APIMiddleware(options.Metrics))
-	}
-
-	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
-	auth := func(c *fiber.Ctx) error {
-		if len(options.ApiKeys) == 0 {
-			return c.Next()
-		}
-
-		// Check for api_keys.json file
-		fileContent, err := os.ReadFile("api_keys.json")
-		if err == nil {
-			// Parse JSON content from the file
-			var fileKeys []string
-			err := json.Unmarshal(fileContent, &fileKeys)
-			if err != nil {
-				return c.Status(fiber.StatusInternalServerError).JSON(fiber.Map{"message": "Error parsing api_keys.json"})
-			}
-
-			// Add file keys to options.ApiKeys
-			options.ApiKeys = append(options.ApiKeys, fileKeys...)
-		}
-
-		if len(options.ApiKeys) == 0 {
-			return c.Next()
-		}
-
-		authHeader := c.Get("Authorization")
-		if authHeader == "" {
-			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
-		}
-		authHeaderParts := strings.Split(authHeader, " ")
-		if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
-			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
-		}
-
-		apiKey := authHeaderParts[1]
-		for _, key := range options.ApiKeys {
-			if apiKey == key {
-				return c.Next()
-			}
-		}
-
-		return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
-
-	}
-
-	if options.CORS {
-		var c func(ctx *fiber.Ctx) error
-		if options.CORSAllowOrigins == "" {
-			c = cors.New()
-		} else {
-			c = cors.New(cors.Config{AllowOrigins: options.CORSAllowOrigins})
-		}
-
-		app.Use(c)
-	}
-
-	// LocalAI API endpoints
-	galleryService := localai.NewGalleryService(options.Loader.ModelPath)
-	galleryService.Start(options.Context, cl)
-
-	app.Get("/version", auth, func(c *fiber.Ctx) error {
-		return c.JSON(struct {
-			Version string `json:"version"`
-		}{Version: internal.PrintableVersion()})
-	})
-
-	modelGalleryService := localai.CreateModelGalleryService(options.Galleries, options.Loader.ModelPath, galleryService)
-	app.Post("/models/apply", auth, modelGalleryService.ApplyModelGalleryEndpoint())
-	app.Get("/models/available", auth, modelGalleryService.ListModelFromGalleryEndpoint())
-	app.Get("/models/galleries", auth, modelGalleryService.ListModelGalleriesEndpoint())
-	app.Post("/models/galleries", auth, modelGalleryService.AddModelGalleryEndpoint())
-	app.Delete("/models/galleries", auth, modelGalleryService.RemoveModelGalleryEndpoint())
-	app.Get("/models/jobs/:uuid", auth, modelGalleryService.GetOpStatusEndpoint())
-	app.Get("/models/jobs", auth, modelGalleryService.GetAllStatusEndpoint())
-
-	// openAI compatible API endpoint
-
-	// chat
-	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, options))
-	app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, options))
-
-	// edit
-	app.Post("/v1/edits", auth, openai.EditEndpoint(cl, options))
-	app.Post("/edits", auth, openai.EditEndpoint(cl, options))
-
-	// completion
-	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, options))
-	app.Post("/completions", auth, openai.CompletionEndpoint(cl, options))
-	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, options))
-
-	// embeddings
-	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
-	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
-	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
-
-	// audio
-	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, options))
-	app.Post("/tts", auth, localai.TTSEndpoint(cl, options))
-
-	// images
-	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, options))
-
-	if options.ImageDir != "" {
-		app.Static("/generated-images", options.ImageDir)
-	}
-
-	if options.AudioDir != "" {
-		app.Static("/generated-audio", options.AudioDir)
-	}
-
-	ok := func(c *fiber.Ctx) error {
-		return c.SendStatus(200)
-	}
-
-	// Kubernetes health checks
-	app.Get("/healthz", ok)
-	app.Get("/readyz", ok)
-
-	// Experimental Backend Statistics Module
-	backendMonitor := localai.NewBackendMonitor(cl, options) // Split out for now
-	app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitor))
-	app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitor))
-
-	// models
-	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
-	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
-
-	app.Get("/metrics", metrics.MetricsHandler())
-
-	return app, nil
-}
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -1,61 +0,0 @@
-package backend
-
-import (
-	config "github.com/go-skynet/LocalAI/api/config"
-	"github.com/go-skynet/LocalAI/api/options"
-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-)
-
-func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
-
-	opts := modelOpts(c, o, []model.Option{
-		model.WithBackendString(c.Backend),
-		model.WithAssetDir(o.AssetsDestination),
-		model.WithThreads(uint32(c.Threads)),
-		model.WithContext(o.Context),
-		model.WithModel(c.Model),
-		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          c.CUDA || c.Diffusers.CUDA,
-			SchedulerType: c.Diffusers.SchedulerType,
-			PipelineType:  c.Diffusers.PipelineType,
-			CFGScale:      c.Diffusers.CFGScale,
-			LoraAdapter:   c.LoraAdapter,
-			LoraScale:     c.LoraScale,
-			LoraBase:      c.LoraBase,
-			IMG2IMG:       c.Diffusers.IMG2IMG,
-			CLIPModel:     c.Diffusers.ClipModel,
-			CLIPSubfolder: c.Diffusers.ClipSubFolder,
-			CLIPSkip:      int32(c.Diffusers.ClipSkip),
-			ControlNet:    c.Diffusers.ControlNet,
-		}),
-	})
-
-	inferenceModel, err := loader.BackendLoader(
-		opts...,
-	)
-	if err != nil {
-		return nil, err
-	}
-
-	fn := func() error {
-		_, err := inferenceModel.GenerateImage(
-			o.Context,
-			&proto.GenerateImageRequest{
-				Height:           int32(height),
-				Width:            int32(width),
-				Mode:             int32(mode),
-				Step:             int32(step),
-				Seed:             int32(seed),
-				CLIPSkip:         int32(c.Diffusers.ClipSkip),
-				PositivePrompt:   positive_prompt,
-				NegativePrompt:   negative_prompt,
-				Dst:              dst,
-				Src:              src,
-				EnableParameters: c.Diffusers.EnableParameters,
-			})
-		return err
-	}
-
-	return fn, nil
-}
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -1,127 +0,0 @@
-package backend
-
-import (
-	"os"
-	"path/filepath"
-
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-
-	config "github.com/go-skynet/LocalAI/api/config"
-	"github.com/go-skynet/LocalAI/api/options"
-)
-
-func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.Option {
-	if o.SingleBackend {
-		opts = append(opts, model.WithSingleActiveBackend())
-	}
-
-	if o.ParallelBackendRequests {
-		opts = append(opts, model.EnableParallelRequests)
-	}
-
-	if c.GRPC.Attempts != 0 {
-		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
-	}
-
-	if c.GRPC.AttemptsSleepTime != 0 {
-		opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
-
-	return opts
-}
-
-func gRPCModelOpts(c config.Config) *pb.ModelOptions {
-	b := 512
-	if c.Batch != 0 {
-		b = c.Batch
-	}
-
-	return &pb.ModelOptions{
-		ContextSize:    int32(c.ContextSize),
-		Seed:           int32(c.Seed),
-		NBatch:         int32(b),
-		NoMulMatQ:      c.NoMulMatQ,
-		CUDA:           c.CUDA, // diffusers, transformers
-		DraftModel:     c.DraftModel,
-		AudioPath:      c.VallE.AudioPath,
-		Quantization:   c.Quantization,
-		MMProj:         c.MMProj,
-		YarnExtFactor:  c.YarnExtFactor,
-		YarnAttnFactor: c.YarnAttnFactor,
-		YarnBetaFast:   c.YarnBetaFast,
-		YarnBetaSlow:   c.YarnBetaSlow,
-		LoraAdapter:    c.LoraAdapter,
-		LoraBase:       c.LoraBase,
-		LoraScale:      c.LoraScale,
-		NGQA:           c.NGQA,
-		RMSNormEps:     c.RMSNormEps,
-		F16Memory:      c.F16,
-		MLock:          c.MMlock,
-		RopeFreqBase:   c.RopeFreqBase,
-		RopeFreqScale:  c.RopeFreqScale,
-		NUMA:           c.NUMA,
-		Embeddings:     c.Embeddings,
-		LowVRAM:        c.LowVRAM,
-		NGPULayers:     int32(c.NGPULayers),
-		MMap:           c.MMap,
-		MainGPU:        c.MainGPU,
-		Threads:        int32(c.Threads),
-		TensorSplit:    c.TensorSplit,
-		// AutoGPTQ
-		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
-		Device:           c.AutoGPTQ.Device,
-		UseTriton:        c.AutoGPTQ.Triton,
-		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
-		// RWKV
-		Tokenizer: c.Tokenizer,
-	}
-}
-
-func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
-	promptCachePath := ""
-	if c.PromptCachePath != "" {
-		p := filepath.Join(modelPath, c.PromptCachePath)
-		os.MkdirAll(filepath.Dir(p), 0755)
-		promptCachePath = p
-	}
-	return &pb.PredictOptions{
-		Temperature:         float32(c.Temperature),
-		TopP:                float32(c.TopP),
-		NDraft:              c.NDraft,
-		TopK:                int32(c.TopK),
-		Tokens:              int32(c.Maxtokens),
-		Threads:             int32(c.Threads),
-		PromptCacheAll:      c.PromptCacheAll,
-		PromptCacheRO:       c.PromptCacheRO,
-		PromptCachePath:     promptCachePath,
-		F16KV:               c.F16,
-		DebugMode:           c.Debug,
-		Grammar:             c.Grammar,
-		NegativePromptScale: c.NegativePromptScale,
-		RopeFreqBase:        c.RopeFreqBase,
-		RopeFreqScale:       c.RopeFreqScale,
-		NegativePrompt:      c.NegativePrompt,
-		Mirostat:            int32(c.LLMConfig.Mirostat),
-		MirostatETA:         float32(c.LLMConfig.MirostatETA),
-		MirostatTAU:         float32(c.LLMConfig.MirostatTAU),
-		Debug:               c.Debug,
-		StopPrompts:         c.StopWords,
-		Repeat:              int32(c.RepeatPenalty),
-		NKeep:               int32(c.Keep),
-		Batch:               int32(c.Batch),
-		IgnoreEOS:           c.IgnoreEOS,
-		Seed:                int32(c.Seed),
-		FrequencyPenalty:    float32(c.FrequencyPenalty),
-		MLock:               c.MMlock,
-		MMap:                c.MMap,
-		MainGPU:             c.MainGPU,
-		TensorSplit:         c.TensorSplit,
-		TailFreeSamplingZ:   float32(c.TFZ),
-		TypicalP:            float32(c.TypicalP),
-	}
-}
--- a/api/backend/transcript.go
+++ b/api/backend/transcript.go
@@ -1,39 +0,0 @@
-package backend
-
-import (
-	"context"
-	"fmt"
-
-	config "github.com/go-skynet/LocalAI/api/config"
-	"github.com/go-skynet/LocalAI/api/schema"
-
-	"github.com/go-skynet/LocalAI/api/options"
-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-)
-
-func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*schema.Result, error) {
-
-	opts := modelOpts(c, o, []model.Option{
-		model.WithBackendString(model.WhisperBackend),
-		model.WithModel(c.Model),
-		model.WithContext(o.Context),
-		model.WithThreads(uint32(c.Threads)),
-		model.WithAssetDir(o.AssetsDestination),
-	})
-
-	whisperModel, err := o.Loader.BackendLoader(opts...)
-	if err != nil {
-		return nil, err
-	}
-
-	if whisperModel == nil {
-		return nil, fmt.Errorf("could not load whisper model")
-	}
-
-	return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
-		Dst:      audio,
-		Language: language,
-		Threads:  uint32(c.Threads),
-	})
-}
--- a/api/backend/tts.go
+++ b/api/backend/tts.go
@@ -1,79 +0,0 @@
-package backend
-
-import (
-	"context"
-	"fmt"
-	"os"
-	"path/filepath"
-
-	api_config "github.com/go-skynet/LocalAI/api/config"
-	"github.com/go-skynet/LocalAI/api/options"
-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/utils"
-)
-
-func generateUniqueFileName(dir, baseName, ext string) string {
-	counter := 1
-	fileName := baseName + ext
-
-	for {
-		filePath := filepath.Join(dir, fileName)
-		_, err := os.Stat(filePath)
-		if os.IsNotExist(err) {
-			return fileName
-		}
-
-		counter++
-		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
-	}
-}
-
-func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option) (string, *proto.Result, error) {
-	bb := backend
-	if bb == "" {
-		bb = model.PiperBackend
-	}
-	opts := modelOpts(api_config.Config{}, o, []model.Option{
-		model.WithBackendString(bb),
-		model.WithModel(modelFile),
-		model.WithContext(o.Context),
-		model.WithAssetDir(o.AssetsDestination),
-	})
-	piperModel, err := o.Loader.BackendLoader(opts...)
-	if err != nil {
-		return "", nil, err
-	}
-
-	if piperModel == nil {
-		return "", nil, fmt.Errorf("could not load piper model")
-	}
-
-	if err := os.MkdirAll(o.AudioDir, 0755); err != nil {
-		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
-	}
-
-	fileName := generateUniqueFileName(o.AudioDir, "piper", ".wav")
-	filePath := filepath.Join(o.AudioDir, fileName)
-
-	// If the model file is not empty, we pass it joined with the model path
-	modelPath := ""
-	if modelFile != "" {
-		if bb != model.TransformersMusicGen {
-			modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
-			if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
-				return "", nil, err
-			}
-		} else {
-			modelPath = modelFile
-		}
-	}
-
-	res, err := piperModel.TTS(context.Background(), &proto.TTSRequest{
-		Text:  text,
-		Model: modelPath,
-		Dst:   filePath,
-	})
-
-	return filePath, res, err
-}
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -1,359 +0,0 @@
-package api_config
-
-import (
-	"errors"
-	"fmt"
-	"io/fs"
-	"os"
-	"path/filepath"
-	"strings"
-	"sync"
-
-	"github.com/go-skynet/LocalAI/pkg/utils"
-	"github.com/rs/zerolog/log"
-	"gopkg.in/yaml.v3"
-)
-
-type Config struct {
-	PredictionOptions `yaml:"parameters"`
-	Name              string `yaml:"name"`
-
-	F16            bool              `yaml:"f16"`
-	Threads        int               `yaml:"threads"`
-	Debug          bool              `yaml:"debug"`
-	Roles          map[string]string `yaml:"roles"`
-	Embeddings     bool              `yaml:"embeddings"`
-	Backend        string            `yaml:"backend"`
-	TemplateConfig TemplateConfig    `yaml:"template"`
-
-	PromptStrings, InputStrings                []string `yaml:"-"`
-	InputToken                                 [][]int  `yaml:"-"`
-	functionCallString, functionCallNameString string   `yaml:"-"`
-
-	FunctionsConfig Functions `yaml:"function"`
-
-	FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
-	// LLM configs (GPT4ALL, Llama.cpp, ...)
-	LLMConfig `yaml:",inline"`
-
-	// AutoGPTQ specifics
-	AutoGPTQ AutoGPTQ `yaml:"autogptq"`
-
-	// Diffusers
-	Diffusers Diffusers `yaml:"diffusers"`
-	Step      int       `yaml:"step"`
-
-	// GRPC Options
-	GRPC GRPC `yaml:"grpc"`
-
-	// Vall-e-x
-	VallE VallE `yaml:"vall-e"`
-
-	// CUDA
-	// Explicitly enable CUDA or not (some backends might need it)
-	CUDA bool `yaml:"cuda"`
-
-	DownloadFiles []File `yaml:"download_files"`
-}
-
-type File struct {
-	Filename string `yaml:"filename" json:"filename"`
-	SHA256   string `yaml:"sha256" json:"sha256"`
-	URI      string `yaml:"uri" json:"uri"`
-}
-
-type VallE struct {
-	AudioPath string `yaml:"audio_path"`
-}
-
-type FeatureFlag map[string]*bool
-
-func (ff FeatureFlag) Enabled(s string) bool {
-	v, exist := ff[s]
-	return exist && v != nil && *v
-}
-
-type GRPC struct {
-	Attempts          int `yaml:"attempts"`
-	AttemptsSleepTime int `yaml:"attempts_sleep_time"`
-}
-
-type Diffusers struct {
-	CUDA             bool    `yaml:"cuda"`
-	PipelineType     string  `yaml:"pipeline_type"`
-	SchedulerType    string  `yaml:"scheduler_type"`
-	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
-	CFGScale         float32 `yaml:"cfg_scale"`         // Classifier-Free Guidance Scale
-	IMG2IMG          bool    `yaml:"img2img"`           // Image to Image Diffuser
-	ClipSkip         int     `yaml:"clip_skip"`         // Skip every N frames
-	ClipModel        string  `yaml:"clip_model"`        // Clip model to use
-	ClipSubFolder    string  `yaml:"clip_subfolder"`    // Subfolder to use for clip model
-	ControlNet       string  `yaml:"control_net"`
-}
-
-type LLMConfig struct {
-	SystemPrompt    string   `yaml:"system_prompt"`
-	TensorSplit     string   `yaml:"tensor_split"`
-	MainGPU         string   `yaml:"main_gpu"`
-	RMSNormEps      float32  `yaml:"rms_norm_eps"`
-	NGQA            int32    `yaml:"ngqa"`
-	PromptCachePath string   `yaml:"prompt_cache_path"`
-	PromptCacheAll  bool     `yaml:"prompt_cache_all"`
-	PromptCacheRO   bool     `yaml:"prompt_cache_ro"`
-	MirostatETA     float64  `yaml:"mirostat_eta"`
-	MirostatTAU     float64  `yaml:"mirostat_tau"`
-	Mirostat        int      `yaml:"mirostat"`
-	NGPULayers      int      `yaml:"gpu_layers"`
-	MMap            bool     `yaml:"mmap"`
-	MMlock          bool     `yaml:"mmlock"`
-	LowVRAM         bool     `yaml:"low_vram"`
-	Grammar         string   `yaml:"grammar"`
-	StopWords       []string `yaml:"stopwords"`
-	Cutstrings      []string `yaml:"cutstrings"`
-	TrimSpace       []string `yaml:"trimspace"`
-	TrimSuffix      []string `yaml:"trimsuffix"`
-
-	ContextSize  int     `yaml:"context_size"`
-	NUMA         bool    `yaml:"numa"`
-	LoraAdapter  string  `yaml:"lora_adapter"`
-	LoraBase     string  `yaml:"lora_base"`
-	LoraScale    float32 `yaml:"lora_scale"`
-	NoMulMatQ    bool    `yaml:"no_mulmatq"`
-	DraftModel   string  `yaml:"draft_model"`
-	NDraft       int32   `yaml:"n_draft"`
-	Quantization string  `yaml:"quantization"`
-	MMProj       string  `yaml:"mmproj"`
-
-	RopeScaling    string  `yaml:"rope_scaling"`
-	YarnExtFactor  float32 `yaml:"yarn_ext_factor"`
-	YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
-	YarnBetaFast   float32 `yaml:"yarn_beta_fast"`
-	YarnBetaSlow   float32 `yaml:"yarn_beta_slow"`
-}
-
-type AutoGPTQ struct {
-	ModelBaseName    string `yaml:"model_base_name"`
-	Device           string `yaml:"device"`
-	Triton           bool   `yaml:"triton"`
-	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
-}
-
-type Functions struct {
-	DisableNoAction         bool   `yaml:"disable_no_action"`
-	NoActionFunctionName    string `yaml:"no_action_function_name"`
-	NoActionDescriptionName string `yaml:"no_action_description_name"`
-}
-
-type TemplateConfig struct {
-	Chat        string `yaml:"chat"`
-	ChatMessage string `yaml:"chat_message"`
-	Completion  string `yaml:"completion"`
-	Edit        string `yaml:"edit"`
-	Functions   string `yaml:"function"`
-}
-
-type ConfigLoader struct {
-	configs map[string]Config
-	sync.Mutex
-}
-
-func (c *Config) SetFunctionCallString(s string) {
-	c.functionCallString = s
-}
-
-func (c *Config) SetFunctionCallNameString(s string) {
-	c.functionCallNameString = s
-}
-
-func (c *Config) ShouldUseFunctions() bool {
-	return ((c.functionCallString != "none" || c.functionCallString == "") || c.ShouldCallSpecificFunction())
-}
-
-func (c *Config) ShouldCallSpecificFunction() bool {
-	return len(c.functionCallNameString) > 0
-}
-
-func (c *Config) FunctionToCall() string {
-	return c.functionCallNameString
-}
-
-func defaultPredictOptions(modelFile string) PredictionOptions {
-	return PredictionOptions{
-		TopP:        0.7,
-		TopK:        80,
-		Maxtokens:   512,
-		Temperature: 0.9,
-		Model:       modelFile,
-	}
-}
-
-func DefaultConfig(modelFile string) *Config {
-	return &Config{
-		PredictionOptions: defaultPredictOptions(modelFile),
-	}
-}
-
-func NewConfigLoader() *ConfigLoader {
-	return &ConfigLoader{
-		configs: make(map[string]Config),
-	}
-}
-func ReadConfigFile(file string) ([]*Config, error) {
-	c := &[]*Config{}
-	f, err := os.ReadFile(file)
-	if err != nil {
-		return nil, fmt.Errorf("cannot read config file: %w", err)
-	}
-	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
-	}
-
-	return *c, nil
-}
-
-func ReadConfig(file string) (*Config, error) {
-	c := &Config{}
-	f, err := os.ReadFile(file)
-	if err != nil {
-		return nil, fmt.Errorf("cannot read config file: %w", err)
-	}
-	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
-	}
-
-	return c, nil
-}
-
-func (cm *ConfigLoader) LoadConfigFile(file string) error {
-	cm.Lock()
-	defer cm.Unlock()
-	c, err := ReadConfigFile(file)
-	if err != nil {
-		return fmt.Errorf("cannot load config file: %w", err)
-	}
-
-	for _, cc := range c {
-		cm.configs[cc.Name] = *cc
-	}
-	return nil
-}
-
-func (cm *ConfigLoader) LoadConfig(file string) error {
-	cm.Lock()
-	defer cm.Unlock()
-	c, err := ReadConfig(file)
-	if err != nil {
-		return fmt.Errorf("cannot read config file: %w", err)
-	}
-
-	cm.configs[c.Name] = *c
-	return nil
-}
-
-func (cm *ConfigLoader) GetConfig(m string) (Config, bool) {
-	cm.Lock()
-	defer cm.Unlock()
-	v, exists := cm.configs[m]
-	return v, exists
-}
-
-func (cm *ConfigLoader) GetAllConfigs() []Config {
-	cm.Lock()
-	defer cm.Unlock()
-	var res []Config
-	for _, v := range cm.configs {
-		res = append(res, v)
-	}
-	return res
-}
-
-func (cm *ConfigLoader) ListConfigs() []string {
-	cm.Lock()
-	defer cm.Unlock()
-	var res []string
-	for k := range cm.configs {
-		res = append(res, k)
-	}
-	return res
-}
-
-// Preload prepare models if they are not local but url or huggingface repositories
-func (cm *ConfigLoader) Preload(modelPath string) error {
-	cm.Lock()
-	defer cm.Unlock()
-
-	status := func(fileName, current, total string, percent float64) {
-		utils.DisplayDownloadFunction(fileName, current, total, percent)
-	}
-
-	log.Info().Msgf("Preloading models from %s", modelPath)
-
-	for i, config := range cm.configs {
-
-		// Download files and verify their SHA
-		for _, file := range config.DownloadFiles {
-			log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)
-
-			if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
-				return err
-			}
-			// Create file path
-			filePath := filepath.Join(modelPath, file.Filename)
-
-			if err := utils.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
-				return err
-			}
-		}
-
-		modelURL := config.PredictionOptions.Model
-		modelURL = utils.ConvertURL(modelURL)
-
-		if utils.LooksLikeURL(modelURL) {
-			// md5 of model name
-			md5Name := utils.MD5(modelURL)
-
-			// check if file exists
-			if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
-				err := utils.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
-				if err != nil {
-					return err
-				}
-			}
-
-			cc := cm.configs[i]
-			c := &cc
-			c.PredictionOptions.Model = md5Name
-			cm.configs[i] = *c
-		}
-	}
-	return nil
-}
-
-func (cm *ConfigLoader) LoadConfigs(path string) error {
-	cm.Lock()
-	defer cm.Unlock()
-	entries, err := os.ReadDir(path)
-	if err != nil {
-		return err
-	}
-	files := make([]fs.FileInfo, 0, len(entries))
-	for _, entry := range entries {
-		info, err := entry.Info()
-		if err != nil {
-			return err
-		}
-		files = append(files, info)
-	}
-	for _, file := range files {
-		// Skip templates, YAML and .keep files
-		if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
-			continue
-		}
-		c, err := ReadConfig(filepath.Join(path, file.Name()))
-		if err == nil {
-			cm.configs[c.Name] = *c
-		}
-	}
-
-	return nil
-}
--- a/api/localai/backend_monitor.go
+++ b/api/localai/backend_monitor.go
@@ -1,162 +0,0 @@
-package localai
-
-import (
-	"context"
-	"fmt"
-	"strings"
-
-	config "github.com/go-skynet/LocalAI/api/config"
-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	"github.com/go-skynet/LocalAI/api/options"
-	"github.com/gofiber/fiber/v2"
-	"github.com/rs/zerolog/log"
-
-	gopsutil "github.com/shirou/gopsutil/v3/process"
-)
-
-type BackendMonitorRequest struct {
-	Model string `json:"model" yaml:"model"`
-}
-
-type BackendMonitorResponse struct {
-	MemoryInfo    *gopsutil.MemoryInfoStat
-	MemoryPercent float32
-	CPUPercent    float64
-}
-
-type BackendMonitor struct {
-	configLoader *config.ConfigLoader
-	options      *options.Option // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
-}
-
-func NewBackendMonitor(configLoader *config.ConfigLoader, options *options.Option) BackendMonitor {
-	return BackendMonitor{
-		configLoader: configLoader,
-		options:      options,
-	}
-}
-
-func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*BackendMonitorResponse, error) {
-	config, exists := bm.configLoader.GetConfig(model)
-	var backend string
-	if exists {
-		backend = config.Model
-	} else {
-		// Last ditch effort: use it raw, see if a backend happens to match.
-		backend = model
-	}
-
-	if !strings.HasSuffix(backend, ".bin") {
-		backend = fmt.Sprintf("%s.bin", backend)
-	}
-
-	pid, err := bm.options.Loader.GetGRPCPID(backend)
-
-	if err != nil {
-		log.Error().Msgf("model %s : failed to find pid %+v", model, err)
-		return nil, err
-	}
-
-	// Name is slightly frightening but this does _not_ create a new process, rather it looks up an existing process by PID.
-	backendProcess, err := gopsutil.NewProcess(int32(pid))
-
-	if err != nil {
-		log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
-		return nil, err
-	}
-
-	memInfo, err := backendProcess.MemoryInfo()
-
-	if err != nil {
-		log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
-		return nil, err
-	}
-
-	memPercent, err := backendProcess.MemoryPercent()
-	if err != nil {
-		log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
-		return nil, err
-	}
-
-	cpuPercent, err := backendProcess.CPUPercent()
-	if err != nil {
-		log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
-		return nil, err
-	}
-
-	return &BackendMonitorResponse{
-		MemoryInfo:    memInfo,
-		MemoryPercent: memPercent,
-		CPUPercent:    cpuPercent,
-	}, nil
-}
-
-func (bm BackendMonitor) getModelLoaderIDFromCtx(c *fiber.Ctx) (string, error) {
-	input := new(BackendMonitorRequest)
-	// Get input data from the request body
-	if err := c.BodyParser(input); err != nil {
-		return "", err
-	}
-
-	config, exists := bm.configLoader.GetConfig(input.Model)
-	var backendId string
-	if exists {
-		backendId = config.Model
-	} else {
-		// Last ditch effort: use it raw, see if a backend happens to match.
-		backendId = input.Model
-	}
-
-	if !strings.HasSuffix(backendId, ".bin") {
-		backendId = fmt.Sprintf("%s.bin", backendId)
-	}
-
-	return backendId, nil
-}
-
-func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-
-		backendId, err := bm.getModelLoaderIDFromCtx(c)
-		if err != nil {
-			return err
-		}
-
-		model := bm.options.Loader.CheckIsLoaded(backendId)
-		if model == "" {
-			return fmt.Errorf("backend %s is not currently loaded", backendId)
-		}
-
-		status, rpcErr := model.GRPC(false, nil).Status(context.TODO())
-		if rpcErr != nil {
-			log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
-			val, slbErr := bm.SampleLocalBackendProcess(backendId)
-			if slbErr != nil {
-				return fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
-			}
-			return c.JSON(proto.StatusResponse{
-				State: proto.StatusResponse_ERROR,
-				Memory: &proto.MemoryUsageData{
-					Total: val.MemoryInfo.VMS,
-					Breakdown: map[string]uint64{
-						"gopsutil-RSS": val.MemoryInfo.RSS,
-					},
-				},
-			})
-		}
-
-		return c.JSON(status)
-	}
-}
-
-func BackendShutdownEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		backendId, err := bm.getModelLoaderIDFromCtx(c)
-		if err != nil {
-			return err
-		}
-
-		return bm.options.Loader.ShutdownModel(backendId)
-	}
-}
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -1,326 +0,0 @@
-package localai
-
-import (
-	"context"
-	"fmt"
-	"os"
-	"slices"
-	"strings"
-	"sync"
-
-	json "github.com/json-iterator/go"
-	"gopkg.in/yaml.v3"
-
-	config "github.com/go-skynet/LocalAI/api/config"
-	"github.com/go-skynet/LocalAI/pkg/gallery"
-	"github.com/go-skynet/LocalAI/pkg/utils"
-
-	"github.com/gofiber/fiber/v2"
-	"github.com/google/uuid"
-	"github.com/rs/zerolog/log"
-)
-
-type galleryOp struct {
-	req         gallery.GalleryModel
-	id          string
-	galleries   []gallery.Gallery
-	galleryName string
-}
-
-type galleryOpStatus struct {
-	FileName           string  `json:"file_name"`
-	Error              error   `json:"error"`
-	Processed          bool    `json:"processed"`
-	Message            string  `json:"message"`
-	Progress           float64 `json:"progress"`
-	TotalFileSize      string  `json:"file_size"`
-	DownloadedFileSize string  `json:"downloaded_size"`
-}
-
-type galleryApplier struct {
-	modelPath string
-	sync.Mutex
-	C        chan galleryOp
-	statuses map[string]*galleryOpStatus
-}
-
-func NewGalleryService(modelPath string) *galleryApplier {
-	return &galleryApplier{
-		modelPath: modelPath,
-		C:         make(chan galleryOp),
-		statuses:  make(map[string]*galleryOpStatus),
-	}
-}
-
-func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {
-
-	config, err := gallery.GetGalleryConfigFromURL(req.URL)
-	if err != nil {
-		return err
-	}
-
-	config.Files = append(config.Files, req.AdditionalFiles...)
-
-	return gallery.InstallModel(modelPath, req.Name, &config, req.Overrides, downloadStatus)
-}
-
-func (g *galleryApplier) updateStatus(s string, op *galleryOpStatus) {
-	g.Lock()
-	defer g.Unlock()
-	g.statuses[s] = op
-}
-
-func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
-	g.Lock()
-	defer g.Unlock()
-
-	return g.statuses[s]
-}
-
-func (g *galleryApplier) getAllStatus() map[string]*galleryOpStatus {
-	g.Lock()
-	defer g.Unlock()
-
-	return g.statuses
-}
-
-func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
-	go func() {
-		for {
-			select {
-			case <-c.Done():
-				return
-			case op := <-g.C:
-				utils.ResetDownloadTimers()
-
-				g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: 0})
-
-				// updates the status with an error
-				updateError := func(e error) {
-					g.updateStatus(op.id, &galleryOpStatus{Error: e, Processed: true, Message: "error: " + e.Error()})
-				}
-
-				// displayDownload displays the download progress
-				progressCallback := func(fileName string, current string, total string, percentage float64) {
-					g.updateStatus(op.id, &galleryOpStatus{Message: "processing", FileName: fileName, Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
-					utils.DisplayDownloadFunction(fileName, current, total, percentage)
-				}
-
-				var err error
-				// if the request contains a gallery name, we apply the gallery from the gallery list
-				if op.galleryName != "" {
-					if strings.Contains(op.galleryName, "@") {
-						err = gallery.InstallModelFromGallery(op.galleries, op.galleryName, g.modelPath, op.req, progressCallback)
-					} else {
-						err = gallery.InstallModelFromGalleryByName(op.galleries, op.galleryName, g.modelPath, op.req, progressCallback)
-					}
-				} else {
-					err = prepareModel(g.modelPath, op.req, cm, progressCallback)
-				}
-
-				if err != nil {
-					updateError(err)
-					continue
-				}
-
-				// Reload models
-				err = cm.LoadConfigs(g.modelPath)
-				if err != nil {
-					updateError(err)
-					continue
-				}
-
-				err = cm.Preload(g.modelPath)
-				if err != nil {
-					updateError(err)
-					continue
-				}
-
-				g.updateStatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
-			}
-		}
-	}()
-}
-
-type galleryModel struct {
-	gallery.GalleryModel `yaml:",inline"` // https://github.com/go-yaml/yaml/issues/63
-	ID                   string           `json:"id"`
-}
-
-func processRequests(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery, requests []galleryModel) error {
-	var err error
-	for _, r := range requests {
-		utils.ResetDownloadTimers()
-		if r.ID == "" {
-			err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
-		} else {
-			if strings.Contains(r.ID, "@") {
-				err = gallery.InstallModelFromGallery(
-					galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
-			} else {
-				err = gallery.InstallModelFromGalleryByName(
-					galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
-			}
-		}
-	}
-	return err
-}
-
-func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
-	dat, err := os.ReadFile(s)
-	if err != nil {
-		return err
-	}
-	var requests []galleryModel
-
-	if err := yaml.Unmarshal(dat, &requests); err != nil {
-		return err
-	}
-
-	return processRequests(modelPath, s, cm, galleries, requests)
-}
-
-func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
-	var requests []galleryModel
-	err := json.Unmarshal([]byte(s), &requests)
-	if err != nil {
-		return err
-	}
-
-	return processRequests(modelPath, s, cm, galleries, requests)
-}
-
-/// Endpoint Service
-
-type ModelGalleryService struct {
-	galleries      []gallery.Gallery
-	modelPath      string
-	galleryApplier *galleryApplier
-}
-
-type GalleryModel struct {
-	ID string `json:"id"`
-	gallery.GalleryModel
-}
-
-func CreateModelGalleryService(galleries []gallery.Gallery, modelPath string, galleryApplier *galleryApplier) ModelGalleryService {
-	return ModelGalleryService{
-		galleries:      galleries,
-		modelPath:      modelPath,
-		galleryApplier: galleryApplier,
-	}
-}
-
-func (mgs *ModelGalleryService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		status := mgs.galleryApplier.getStatus(c.Params("uuid"))
-		if status == nil {
-			return fmt.Errorf("could not find any status for ID")
-		}
-		return c.JSON(status)
-	}
-}
-
-func (mgs *ModelGalleryService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		return c.JSON(mgs.galleryApplier.getAllStatus())
-	}
-}
-
-func (mgs *ModelGalleryService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		input := new(GalleryModel)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		uuid, err := uuid.NewUUID()
-		if err != nil {
-			return err
-		}
-		mgs.galleryApplier.C <- galleryOp{
-			req:         input.GalleryModel,
-			id:          uuid.String(),
-			galleryName: input.ID,
-			galleries:   mgs.galleries,
-		}
-		return c.JSON(struct {
-			ID        string `json:"uuid"`
-			StatusURL string `json:"status"`
-		}{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
-	}
-}
-
-func (mgs *ModelGalleryService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
-
-		models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
-		if err != nil {
-			return err
-		}
-		log.Debug().Msgf("Models found from galleries: %+v", models)
-		for _, m := range models {
-			log.Debug().Msgf("Model found from galleries: %+v", m)
-		}
-		dat, err := json.Marshal(models)
-		if err != nil {
-			return err
-		}
-		return c.Send(dat)
-	}
-}
-
-// NOTE: This is different (and much simpler!) than above! This JUST lists the model galleries that have been loaded, not their contents!
-func (mgs *ModelGalleryService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
-		dat, err := json.Marshal(mgs.galleries)
-		if err != nil {
-			return err
-		}
-		return c.Send(dat)
-	}
-}
-
-func (mgs *ModelGalleryService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		input := new(gallery.Gallery)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-		if slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
-			return gallery.Name == input.Name
-		}) {
-			return fmt.Errorf("%s already exists", input.Name)
-		}
-		dat, err := json.Marshal(mgs.galleries)
-		if err != nil {
-			return err
-		}
-		log.Debug().Msgf("Adding %+v to gallery list", *input)
-		mgs.galleries = append(mgs.galleries, *input)
-		return c.Send(dat)
-	}
-}
-
-func (mgs *ModelGalleryService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		input := new(gallery.Gallery)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-		if !slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
-			return gallery.Name == input.Name
-		}) {
-			return fmt.Errorf("%s is not currently registered", input.Name)
-		}
-		mgs.galleries = slices.DeleteFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
-			return gallery.Name == input.Name
-		})
-		return c.Send(nil)
-	}
-}
--- a/api/localai/localai.go
+++ b/api/localai/localai.go
@@ -1,32 +0,0 @@
-package localai
-
-import (
-	"github.com/go-skynet/LocalAI/api/backend"
-	config "github.com/go-skynet/LocalAI/api/config"
-
-	"github.com/go-skynet/LocalAI/api/options"
-	"github.com/gofiber/fiber/v2"
-)
-
-type TTSRequest struct {
-	Model   string `json:"model" yaml:"model"`
-	Input   string `json:"input" yaml:"input"`
-	Backend string `json:"backend" yaml:"backend"`
-}
-
-func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-
-		input := new(TTSRequest)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		filePath, _, err := backend.ModelTTS(input.Backend, input.Input, input.Model, o.Loader, o)
-		if err != nil {
-			return err
-		}
-		return c.Download(filePath)
-	}
-}
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -1,399 +0,0 @@
-package openai
-
-import (
-	"bufio"
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"strings"
-	"time"
-
-	"github.com/go-skynet/LocalAI/api/backend"
-	config "github.com/go-skynet/LocalAI/api/config"
-	"github.com/go-skynet/LocalAI/api/options"
-	"github.com/go-skynet/LocalAI/api/schema"
-	"github.com/go-skynet/LocalAI/pkg/grammar"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/utils"
-	"github.com/gofiber/fiber/v2"
-	"github.com/google/uuid"
-	"github.com/rs/zerolog/log"
-	"github.com/valyala/fasthttp"
-)
-
-func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
-	emptyMessage := ""
-	id := uuid.New().String()
-	created := int(time.Now().Unix())
-
-	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
-		initialMessage := schema.OpenAIResponse{
-			ID:      id,
-			Created: created,
-			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
-			Object:  "chat.completion.chunk",
-		}
-		responses <- initialMessage
-
-		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
-			resp := schema.OpenAIResponse{
-				ID:      id,
-				Created: created,
-				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
-				Object:  "chat.completion.chunk",
-				Usage: schema.OpenAIUsage{
-					PromptTokens:     usage.Prompt,
-					CompletionTokens: usage.Completion,
-					TotalTokens:      usage.Prompt + usage.Completion,
-				},
-			}
-
-			responses <- resp
-			return true
-		})
-		close(responses)
-	}
-	return func(c *fiber.Ctx) error {
-		processFunctions := false
-		funcs := grammar.Functions{}
-		modelFile, input, err := readInput(c, o, true)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-		log.Debug().Msgf("Configuration read: %+v", config)
-
-		// Allow the user to set custom actions via config file
-		// to be "embedded" in each model
-		noActionName := "answer"
-		noActionDescription := "use this action to answer without performing any action"
-
-		if config.FunctionsConfig.NoActionFunctionName != "" {
-			noActionName = config.FunctionsConfig.NoActionFunctionName
-		}
-		if config.FunctionsConfig.NoActionDescriptionName != "" {
-			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
-		}
-
-		if input.ResponseFormat.Type == "json_object" {
-			input.Grammar = grammar.JSONBNF
-		}
-
-		// process functions if we have any defined or if we have a function call string
-		if len(input.Functions) > 0 && config.ShouldUseFunctions() {
-			log.Debug().Msgf("Response needs to process functions")
-
-			processFunctions = true
-
-			noActionGrammar := grammar.Function{
-				Name:        noActionName,
-				Description: noActionDescription,
-				Parameters: map[string]interface{}{
-					"properties": map[string]interface{}{
-						"message": map[string]interface{}{
-							"type":        "string",
-							"description": "The message to reply the user with",
-						}},
-				},
-			}
-
-			// Append the no action function
-			funcs = append(funcs, input.Functions...)
-			if !config.FunctionsConfig.DisableNoAction {
-				funcs = append(funcs, noActionGrammar)
-			}
-
-			// Force picking one of the functions by the request
-			if config.FunctionToCall() != "" {
-				funcs = funcs.Select(config.FunctionToCall())
-			}
-
-			// Update input grammar
-			jsStruct := funcs.ToJSONStructure()
-			config.Grammar = jsStruct.Grammar("")
-		} else if input.JSONFunctionGrammarObject != nil {
-			config.Grammar = input.JSONFunctionGrammarObject.Grammar("")
-		}
-
-		// functions are not supported in stream mode (yet?)
-		toStream := input.Stream && !processFunctions
-
-		log.Debug().Msgf("Parameters: %+v", config)
-
-		var predInput string
-
-		suppressConfigSystemPrompt := false
-		mess := []string{}
-		for messageIndex, i := range input.Messages {
-			var content string
-			role := i.Role
-
-			// if function call, we might want to customize the role so we can display better that the "assistant called a json action"
-			// if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request
-			if i.FunctionCall != nil && i.Role == "assistant" {
-				roleFn := "assistant_function_call"
-				r := config.Roles[roleFn]
-				if r != "" {
-					role = roleFn
-				}
-			}
-			r := config.Roles[role]
-			contentExists := i.Content != nil && i.StringContent != ""
-			// First attempt to populate content via a chat message specific template
-			if config.TemplateConfig.ChatMessage != "" {
-				chatMessageData := model.ChatMessageTemplateData{
-					SystemPrompt: config.SystemPrompt,
-					Role:         r,
-					RoleName:     role,
-					Content:      i.StringContent,
-					MessageIndex: messageIndex,
-				}
-				templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
-				if err != nil {
-					log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, config.TemplateConfig.ChatMessage, err)
-				} else {
-					if templatedChatMessage == "" {
-						log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
-						continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
-					}
-					log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
-					content = templatedChatMessage
-				}
-			}
-			// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
-			if content == "" {
-				if r != "" {
-					if contentExists {
-						content = fmt.Sprint(r, i.StringContent)
-					}
-					if i.FunctionCall != nil {
-						j, err := json.Marshal(i.FunctionCall)
-						if err == nil {
-							if contentExists {
-								content += "\n" + fmt.Sprint(r, " ", string(j))
-							} else {
-								content = fmt.Sprint(r, " ", string(j))
-							}
-						}
-					}
-				} else {
-					if contentExists {
-						content = fmt.Sprint(i.StringContent)
-					}
-					if i.FunctionCall != nil {
-						j, err := json.Marshal(i.FunctionCall)
-						if err == nil {
-							if contentExists {
-								content += "\n" + string(j)
-							} else {
-								content = string(j)
-							}
-						}
-					}
-				}
-				// Special Handling: System. We care if it was printed at all, not the r branch, so check seperately
-				if contentExists && role == "system" {
-					suppressConfigSystemPrompt = true
-				}
-			}
-
-			mess = append(mess, content)
-		}
-
-		predInput = strings.Join(mess, "\n")
-		log.Debug().Msgf("Prompt (before templating): %s", predInput)
-
-		if toStream {
-			log.Debug().Msgf("Stream request received")
-			c.Context().SetContentType("text/event-stream")
-			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
-			//	c.Set("Content-Type", "text/event-stream")
-			c.Set("Cache-Control", "no-cache")
-			c.Set("Connection", "keep-alive")
-			c.Set("Transfer-Encoding", "chunked")
-		}
-
-		templateFile := ""
-
-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
-			templateFile = config.Model
-		}
-
-		if config.TemplateConfig.Chat != "" && !processFunctions {
-			templateFile = config.TemplateConfig.Chat
-		}
-
-		if config.TemplateConfig.Functions != "" && processFunctions {
-			templateFile = config.TemplateConfig.Functions
-		}
-
-		if templateFile != "" {
-			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
-				SystemPrompt:         config.SystemPrompt,
-				SuppressSystemPrompt: suppressConfigSystemPrompt,
-				Input:                predInput,
-				Functions:            funcs,
-			})
-			if err == nil {
-				predInput = templatedInput
-				log.Debug().Msgf("Template found, input modified to: %s", predInput)
-			} else {
-				log.Debug().Msgf("Template failed loading: %s", err.Error())
-			}
-		}
-
-		log.Debug().Msgf("Prompt (after templating): %s", predInput)
-		if processFunctions {
-			log.Debug().Msgf("Grammar: %+v", config.Grammar)
-		}
-
-		if toStream {
-			responses := make(chan schema.OpenAIResponse)
-
-			go process(predInput, input, config, o.Loader, responses)
-
-			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
-
-				usage := &schema.OpenAIUsage{}
-
-				for ev := range responses {
-					usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
-					var buf bytes.Buffer
-					enc := json.NewEncoder(&buf)
-					enc.Encode(ev)
-					log.Debug().Msgf("Sending chunk: %s", buf.String())
-					_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
-					if err != nil {
-						log.Debug().Msgf("Sending chunk failed: %v", err)
-						input.Cancel()
-						break
-					}
-					w.Flush()
-				}
-
-				resp := &schema.OpenAIResponse{
-					ID:      id,
-					Created: created,
-					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []schema.Choice{
-						{
-							FinishReason: "stop",
-							Index:        0,
-							Delta:        &schema.Message{Content: &emptyMessage},
-						}},
-					Object: "chat.completion.chunk",
-					Usage:  *usage,
-				}
-				respData, _ := json.Marshal(resp)
-
-				w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
-				w.WriteString("data: [DONE]\n\n")
-				w.Flush()
-			}))
-			return nil
-		}
-
-		result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) {
-			if processFunctions {
-				// As we have to change the result before processing, we can't stream the answer (yet?)
-				ss := map[string]interface{}{}
-				// This prevent newlines to break JSON parsing for clients
-				s = utils.EscapeNewLines(s)
-				json.Unmarshal([]byte(s), &ss)
-				log.Debug().Msgf("Function return: %s %+v", s, ss)
-
-				// The grammar defines the function name as "function", while OpenAI returns "name"
-				func_name := ss["function"]
-				// Similarly, while here arguments is a map[string]interface{}, OpenAI actually want a stringified object
-				args := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
-				d, _ := json.Marshal(args)
-
-				ss["arguments"] = string(d)
-				ss["name"] = func_name
-
-				// if do nothing, reply with a message
-				if func_name == noActionName {
-					log.Debug().Msgf("nothing to do, computing a reply")
-
-					// If there is a message that the LLM already sends as part of the JSON reply, use it
-					arguments := map[string]interface{}{}
-					json.Unmarshal([]byte(d), &arguments)
-					m, exists := arguments["message"]
-					if exists {
-						switch message := m.(type) {
-						case string:
-							if message != "" {
-								log.Debug().Msgf("Reply received from LLM: %s", message)
-								message = backend.Finetune(*config, predInput, message)
-								log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
-
-								*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}})
-								return
-							}
-						}
-					}
-
-					log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
-					// Otherwise ask the LLM to understand the JSON output and the context, and return a message
-					// Note: This costs (in term of CPU) another computation
-					config.Grammar = ""
-					images := []string{}
-					for _, m := range input.Messages {
-						images = append(images, m.StringImages...)
-					}
-					predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil)
-					if err != nil {
-						log.Error().Msgf("inference error: %s", err.Error())
-						return
-					}
-
-					prediction, err := predFunc()
-					if err != nil {
-						log.Error().Msgf("inference error: %s", err.Error())
-						return
-					}
-
-					fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response)
-					*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}})
-				} else {
-					// otherwise reply with the function call
-					*c = append(*c, schema.Choice{
-						FinishReason: "function_call",
-						Message:      &schema.Message{Role: "assistant", FunctionCall: ss},
-					})
-				}
-
-				return
-			}
-			*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
-		}, nil)
-		if err != nil {
-			return err
-		}
-
-		resp := &schema.OpenAIResponse{
-			ID:      id,
-			Created: created,
-			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: result,
-			Object:  "chat.completion",
-			Usage: schema.OpenAIUsage{
-				PromptTokens:     tokenUsage.Prompt,
-				CompletionTokens: tokenUsage.Completion,
-				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
-			},
-		}
-		respData, _ := json.Marshal(resp)
-		log.Debug().Msgf("Response: %s", respData)
-
-		// Return the prediction in the response body
-		return c.JSON(resp)
-	}
-}
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -112,7 +112,6 @@ message ModelOptions {
  int32 CLIPSkip = 33;
  string ControlNet = 48;

-  // RWKV
  string Tokenizer = 34;

  // LLM (llama.cpp)
@@ -127,6 +126,11 @@ message ModelOptions {

  // vllm
  string Quantization = 40;
+  float  GPUMemoryUtilization = 50;
+  bool   TrustRemoteCode = 51;
+  bool   EnforceEager = 52;
+  int32  SwapSpace = 53;
+  int32  MaxModelLen = 54;

  string MMProj = 41;

@@ -135,6 +139,8 @@ message ModelOptions {
  float YarnAttnFactor = 45;
  float YarnBetaFast = 46;
  float YarnBetaSlow = 47;
+
+  string Type = 49;
 }

 message Result {
@@ -185,6 +191,7 @@ message TTSRequest {
  string text = 1;
  string model = 2;
  string dst = 3;
+  string voice = 4;
 }

 message TokenizationResponse {
--- a/backend/backend_grpc.pb.go
+++ b/backend/backend_grpc.pb.go
@@ -0,0 +1,457 @@
+// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
+// versions:
+// - protoc-gen-go-grpc v1.2.0
+// - protoc             v4.23.4
+// source: backend/backend.proto
+
+package proto
+
+import (
+	context "context"
+	grpc "google.golang.org/grpc"
+	codes "google.golang.org/grpc/codes"
+	status "google.golang.org/grpc/status"
+)
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the grpc package it is being compiled against.
+// Requires gRPC-Go v1.32.0 or later.
+const _ = grpc.SupportPackageIsVersion7
+
+// BackendClient is the client API for Backend service.
+//
+// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
+type BackendClient interface {
+	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
+	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
+	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
+	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error)
+	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
+	GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error)
+	AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error)
+	TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error)
+	TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error)
+	Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error)
+}
+
+type backendClient struct {
+	cc grpc.ClientConnInterface
+}
+
+func NewBackendClient(cc grpc.ClientConnInterface) BackendClient {
+	return &backendClient{cc}
+}
+
+func (c *backendClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
+	out := new(Reply)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Health", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
+	out := new(Reply)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Predict", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/LoadModel", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error) {
+	stream, err := c.cc.NewStream(ctx, &Backend_ServiceDesc.Streams[0], "/backend.Backend/PredictStream", opts...)
+	if err != nil {
+		return nil, err
+	}
+	x := &backendPredictStreamClient{stream}
+	if err := x.ClientStream.SendMsg(in); err != nil {
+		return nil, err
+	}
+	if err := x.ClientStream.CloseSend(); err != nil {
+		return nil, err
+	}
+	return x, nil
+}
+
+type Backend_PredictStreamClient interface {
+	Recv() (*Reply, error)
+	grpc.ClientStream
+}
+
+type backendPredictStreamClient struct {
+	grpc.ClientStream
+}
+
+func (x *backendPredictStreamClient) Recv() (*Reply, error) {
+	m := new(Reply)
+	if err := x.ClientStream.RecvMsg(m); err != nil {
+		return nil, err
+	}
+	return m, nil
+}
+
+func (c *backendClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
+	out := new(EmbeddingResult)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Embedding", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/GenerateImage", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error) {
+	out := new(TranscriptResult)
+	err := c.cc.Invoke(ctx, "/backend.Backend/AudioTranscription", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/TTS", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error) {
+	out := new(TokenizationResponse)
+	err := c.cc.Invoke(ctx, "/backend.Backend/TokenizeString", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error) {
+	out := new(StatusResponse)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Status", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// BackendServer is the server API for Backend service.
+// All implementations must embed UnimplementedBackendServer
+// for forward compatibility
+type BackendServer interface {
+	Health(context.Context, *HealthMessage) (*Reply, error)
+	Predict(context.Context, *PredictOptions) (*Reply, error)
+	LoadModel(context.Context, *ModelOptions) (*Result, error)
+	PredictStream(*PredictOptions, Backend_PredictStreamServer) error
+	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
+	GenerateImage(context.Context, *GenerateImageRequest) (*Result, error)
+	AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error)
+	TTS(context.Context, *TTSRequest) (*Result, error)
+	TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error)
+	Status(context.Context, *HealthMessage) (*StatusResponse, error)
+	mustEmbedUnimplementedBackendServer()
+}
+
+// UnimplementedBackendServer must be embedded to have forward compatible implementations.
+type UnimplementedBackendServer struct {
+}
+
+func (UnimplementedBackendServer) Health(context.Context, *HealthMessage) (*Reply, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
+}
+func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
+}
+func (UnimplementedBackendServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
+}
+func (UnimplementedBackendServer) PredictStream(*PredictOptions, Backend_PredictStreamServer) error {
+	return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
+}
+func (UnimplementedBackendServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
+}
+func (UnimplementedBackendServer) GenerateImage(context.Context, *GenerateImageRequest) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method GenerateImage not implemented")
+}
+func (UnimplementedBackendServer) AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method AudioTranscription not implemented")
+}
+func (UnimplementedBackendServer) TTS(context.Context, *TTSRequest) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method TTS not implemented")
+}
+func (UnimplementedBackendServer) TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method TokenizeString not implemented")
+}
+func (UnimplementedBackendServer) Status(context.Context, *HealthMessage) (*StatusResponse, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Status not implemented")
+}
+func (UnimplementedBackendServer) mustEmbedUnimplementedBackendServer() {}
+
+// UnsafeBackendServer may be embedded to opt out of forward compatibility for this service.
+// Use of this interface is not recommended, as added methods to BackendServer will
+// result in compilation errors.
+type UnsafeBackendServer interface {
+	mustEmbedUnimplementedBackendServer()
+}
+
+func RegisterBackendServer(s grpc.ServiceRegistrar, srv BackendServer) {
+	s.RegisterService(&Backend_ServiceDesc, srv)
+}
+
+func _Backend_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(HealthMessage)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Health(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Health",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Health(ctx, req.(*HealthMessage))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(PredictOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Predict(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Predict",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Predict(ctx, req.(*PredictOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(ModelOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).LoadModel(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/LoadModel",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).LoadModel(ctx, req.(*ModelOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
+	m := new(PredictOptions)
+	if err := stream.RecvMsg(m); err != nil {
+		return err
+	}
+	return srv.(BackendServer).PredictStream(m, &backendPredictStreamServer{stream})
+}
+
+type Backend_PredictStreamServer interface {
+	Send(*Reply) error
+	grpc.ServerStream
+}
+
+type backendPredictStreamServer struct {
+	grpc.ServerStream
+}
+
+func (x *backendPredictStreamServer) Send(m *Reply) error {
+	return x.ServerStream.SendMsg(m)
+}
+
+func _Backend_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(PredictOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Embedding(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Embedding",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Embedding(ctx, req.(*PredictOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_GenerateImage_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(GenerateImageRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).GenerateImage(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/GenerateImage",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).GenerateImage(ctx, req.(*GenerateImageRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_AudioTranscription_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(TranscriptRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).AudioTranscription(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/AudioTranscription",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).AudioTranscription(ctx, req.(*TranscriptRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_TTS_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(TTSRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).TTS(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/TTS",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).TTS(ctx, req.(*TTSRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_TokenizeString_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(PredictOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).TokenizeString(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/TokenizeString",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).TokenizeString(ctx, req.(*PredictOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_Status_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(HealthMessage)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Status(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Status",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Status(ctx, req.(*HealthMessage))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+// Backend_ServiceDesc is the grpc.ServiceDesc for Backend service.
+// It's only intended for direct use with grpc.RegisterService,
+// and not to be introspected or modified (even as a copy)
+var Backend_ServiceDesc = grpc.ServiceDesc{
+	ServiceName: "backend.Backend",
+	HandlerType: (*BackendServer)(nil),
+	Methods: []grpc.MethodDesc{
+		{
+			MethodName: "Health",
+			Handler:    _Backend_Health_Handler,
+		},
+		{
+			MethodName: "Predict",
+			Handler:    _Backend_Predict_Handler,
+		},
+		{
+			MethodName: "LoadModel",
+			Handler:    _Backend_LoadModel_Handler,
+		},
+		{
+			MethodName: "Embedding",
+			Handler:    _Backend_Embedding_Handler,
+		},
+		{
+			MethodName: "GenerateImage",
+			Handler:    _Backend_GenerateImage_Handler,
+		},
+		{
+			MethodName: "AudioTranscription",
+			Handler:    _Backend_AudioTranscription_Handler,
+		},
+		{
+			MethodName: "TTS",
+			Handler:    _Backend_TTS_Handler,
+		},
+		{
+			MethodName: "TokenizeString",
+			Handler:    _Backend_TokenizeString_Handler,
+		},
+		{
+			MethodName: "Status",
+			Handler:    _Backend_Status_Handler,
+		},
+	},
+	Streams: []grpc.StreamDesc{
+		{
+			StreamName:    "PredictStream",
+			Handler:       _Backend_PredictStream_Handler,
+			ServerStreams: true,
+		},
+	},
+	Metadata: "backend/backend.proto",
+}
--- a/backend/cpp/grpc/Makefile
+++ b/backend/cpp/grpc/Makefile
@@ -0,0 +1,66 @@
+# Basic platform detection
+HOST_SYSTEM = $(shell uname | cut -f 1 -d_)
+SYSTEM ?= $(HOST_SYSTEM)
+
+TAG_LIB_GRPC?=v1.59.0
+GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
+GIT_CLONE_DEPTH?=1
+NUM_BUILD_THREADS?=$(shell nproc --ignore=1)
+
+INSTALLED_PACKAGES=installed_packages
+GRPC_REPO=grpc_repo
+GRPC_BUILD=grpc_build
+
+export CMAKE_ARGS?=
+CMAKE_ARGS+=-DCMAKE_BUILD_TYPE=Release
+CMAKE_ARGS+=-DgRPC_INSTALL=ON
+CMAKE_ARGS+=-DEXECUTABLE_OUTPUT_PATH=../$(INSTALLED_PACKAGES)/grpc/bin
+CMAKE_ARGS+=-DLIBRARY_OUTPUT_PATH=../$(INSTALLED_PACKAGES)/grpc/lib
+CMAKE_ARGS+=-DgRPC_BUILD_TESTS=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_CSHARP_EXT=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_CPP_PLUGIN=ON
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF
+CMAKE_ARGS+=-Dprotobuf_WITH_ZLIB=ON
+CMAKE_ARGS+=-DRE2_BUILD_TESTING=OFF
+CMAKE_ARGS+=-DCMAKE_INSTALL_PREFIX=../$(INSTALLED_PACKAGES)
+
+# Windows needs OPENSSL_NO_ASM set. This results in slower crypto performance, but gRPC doesn't build otherwise.
+# May be resolvable, but for now it's set. More info: https://stackoverflow.com/a/75240504/480673
+ifeq ($(SYSTEM),MSYS)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),MINGW64)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),MINGW32)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),CYGWIN)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+
+$(INSTALLED_PACKAGES): $(GRPC_BUILD)
+
+$(GRPC_REPO):
+	git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
+	cd $(GRPC_REPO)/grpc && git submodule update --init --recursive --depth $(GIT_CLONE_DEPTH)
+
+$(GRPC_BUILD): $(GRPC_REPO)
+	mkdir -p $(GRPC_BUILD)
+	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . -- -j $(NUM_BUILD_THREADS) && cmake --build . --target install -- -j $(NUM_BUILD_THREADS)
+
+# build, rebuild and clean are commands rather than files, so declare them phony.
+.PHONY: build rebuild clean
+build: $(INSTALLED_PACKAGES)
+
+rebuild:
+	rm -rf $(GRPC_BUILD)
+	$(MAKE) $(GRPC_BUILD)
+
+clean:
+	rm -rf $(GRPC_BUILD) $(GRPC_REPO) $(INSTALLED_PACKAGES)
--- a/backend/cpp/grpc/script/build_grpc.sh
+++ b/backend/cpp/grpc/script/build_grpc.sh
@@ -1,81 +0,0 @@
-#!/bin/bash
-
-# Builds locally from sources the packages needed by the llama cpp backend.
-
-# Makes sure a few base packages exist.
-# sudo apt-get --no-upgrade -y install g++ gcc binutils cmake git build-essential autoconf libtool pkg-config 
-
-SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-echo "Script directory: $SCRIPT_DIR"
-
-CPP_INSTALLED_PACKAGES_DIR=$1
-if [ -z ${CPP_INSTALLED_PACKAGES_DIR} ]; then 
-    echo "CPP_INSTALLED_PACKAGES_DIR env variable not set. Don't know where to install: failed."; 
-    echo
-    exit -1
-fi
-
-if [ -d "${CPP_INSTALLED_PACKAGES_DIR}" ]; then
-  echo "gRPC installation directory already exists. Nothing to do."
-  exit 0
-fi
-
-# The depth when cloning a git repo. 1 speeds up the clone when the repo history is not needed.
-GIT_CLONE_DEPTH=1
-
-NUM_BUILD_THREADS=$(nproc --ignore=1)
-
-# Google gRPC --------------------------------------------------------------------------------------
-TAG_LIB_GRPC="v1.59.0"
-GIT_REPO_LIB_GRPC="https://github.com/grpc/grpc.git"
-GRPC_REPO_DIR="${SCRIPT_DIR}/../grpc_repo"
-GRPC_BUILD_DIR="${SCRIPT_DIR}/../grpc_build"
-SRC_DIR_LIB_GRPC="${GRPC_REPO_DIR}/grpc"
-
-echo "SRC_DIR_LIB_GRPC: ${SRC_DIR_LIB_GRPC}"
-echo "GRPC_REPO_DIR: ${GRPC_REPO_DIR}"
-echo "GRPC_BUILD_DIR: ${GRPC_BUILD_DIR}"
-
-mkdir -pv ${GRPC_REPO_DIR}
-
-rm   -rf ${GRPC_BUILD_DIR}
-mkdir -pv ${GRPC_BUILD_DIR}
-
-mkdir -pv ${CPP_INSTALLED_PACKAGES_DIR}
-	
-if [ -d "${SRC_DIR_LIB_GRPC}" ]; then
-  echo "gRPC source already exists locally. Not cloned again."
-else  
-  ( cd ${GRPC_REPO_DIR} && \
-    git clone --depth ${GIT_CLONE_DEPTH} -b ${TAG_LIB_GRPC} ${GIT_REPO_LIB_GRPC} && \
-    cd ${SRC_DIR_LIB_GRPC} && \
-    git submodule update --init --recursive --depth ${GIT_CLONE_DEPTH} 
-  )    
-fi
-
-( cd ${GRPC_BUILD_DIR} && \
-  cmake -G "Unix Makefiles" \
-     -DCMAKE_BUILD_TYPE=Release \
-     -DgRPC_INSTALL=ON \
-     -DEXECUTABLE_OUTPUT_PATH=${CPP_INSTALLED_PACKAGES_DIR}/grpc/bin \
-     -DLIBRARY_OUTPUT_PATH=${CPP_INSTALLED_PACKAGES_DIR}/grpc/lib \
-     -DgRPC_BUILD_TESTS=OFF \
-     -DgRPC_BUILD_CSHARP_EXT=OFF \
-     -DgRPC_BUILD_GRPC_CPP_PLUGIN=ON \
-     -DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF \
-     -DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF \
-     -DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF \
-     -DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF \
-     -DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON \
-     -DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF \
-     -Dprotobuf_WITH_ZLIB=ON \
-     -DRE2_BUILD_TESTING=OFF \
-     -DCMAKE_INSTALL_PREFIX=${CPP_INSTALLED_PACKAGES_DIR}/ \
-     ${SRC_DIR_LIB_GRPC}  && \
-  cmake --build .  -- -j ${NUM_BUILD_THREADS} && \
-  cmake --build .  --target install -- -j ${NUM_BUILD_THREADS} 
-)
-
-rm -rf ${GRPC_BUILD_DIR}
-rm -rf ${GRPC_REPO_DIR}
-
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -2,16 +2,20 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(myclip PUBLIC .)
+target_include_directories(myclip PUBLIC ../..)
+target_include_directories(myclip PUBLIC ../../common)
+target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
+# END CLIP hack
+

 set(TARGET grpc-server)
-# END CLIP hack
 set(CMAKE_CXX_STANDARD 17)
 cmake_minimum_required(VERSION 3.15)
 set(TARGET grpc-server)
@@ -70,7 +74,7 @@ add_library(hw_grpc_proto
  ${hw_proto_srcs}
  ${hw_proto_hdrs} )

-add_executable(${TARGET} grpc-server.cpp json.hpp )
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
 target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
  absl::flags_parse
  gRPC::${_REFLECTION}
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -3,6 +3,7 @@ LLAMA_VERSION?=

 CMAKE_ARGS?=
 BUILD_TYPE?=
+ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh

 # If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -11,12 +12,21 @@ ifeq ($(BUILD_TYPE),cublas)
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
-else ifeq ($(BUILD_TYPE),clblast)
+# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
+# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f16)
+	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f32)
+	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif

 llama.cpp:
@@ -31,10 +41,14 @@ llama.cpp/examples/grpc-server:
 	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
 	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
 	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/utils.hpp llama.cpp/examples/grpc-server/
 	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+	cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
+	echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
+	cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
 	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp

 rebuild:
@@ -49,5 +63,10 @@ clean:
 	rm -rf grpc-server

 grpc-server: llama.cpp llama.cpp/examples/grpc-server
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	bash -c "source $(ONEAPI_VARS); \
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release"	
+else
 	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
+endif
 	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -0,0 +1,510 @@
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <condition_variable>
+#include <unordered_map>
+
+#include "json.hpp"
+
+#include "../llava/clip.h"
+
+using json = nlohmann::json;
+
+extern bool server_verbose;
+
+#ifndef SERVER_VERBOSE
+#define SERVER_VERBOSE 1
+#endif
+
+#if SERVER_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...)
+#else
+#define LOG_VERBOSE(MSG, ...)                                            \
+    do                                                                   \
+    {                                                                    \
+        if (server_verbose)                                              \
+        {                                                                \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while (0)
+#endif
+
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
+
+//
+// parallel
+//
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,          // Server is ready and model is loaded
+    SERVER_STATE_ERROR           // An error occurred, load_model failed
+};
+
+enum task_type {
+    TASK_TYPE_COMPLETION,
+    TASK_TYPE_CANCEL,
+    TASK_TYPE_NEXT_RESPONSE
+};
+
+struct task_server {
+    int id = -1;           // -1 = unassigned; llama_server_queue::post() fills it in
+    int target_id = -1;    // explicit default so a fresh task never carries an indeterminate id
+    task_type type = TASK_TYPE_COMPLETION; // explicit default; creators are still expected to set it
+    json data;
+    bool infill_mode = false;
+    bool embedding_mode = false;
+    int multitask_id = -1; // presumably -1 = not part of a multitask; confirm with callers
+};
+
+struct task_result {
+    int id = -1;        // id of the task this result answers (matched by llama_server_response::recv)
+    int multitask_id = -1;
+    bool stop = false;  // explicit default so an unset flag is never read as garbage
+    bool error = false; // explicit default so an unset flag is never read as garbage
+    json result_json;
+};
+
+struct task_multi {
+    int id;
+    std::set<int> subtasks_remaining{};
+    std::vector<task_result> results{};
+};
+
+// TODO: can become bool if we can't find use of more states
+enum slot_state
+{
+    IDLE,
+    PROCESSING,
+};
+
+enum slot_command
+{
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
+
+struct slot_params
+{
+    bool stream       = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+
+    uint32_t seed      = -1; // RNG seed
+    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t  n_predict = -1; // new tokens to predict
+
+    std::vector<std::string> antiprompt;
+
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image
+{
+    int32_t id = -1; // explicit default; previously indeterminate until assigned
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data = nullptr; // null until an image is attached, so null checks are reliable
+
+    std::string prefix_prompt; // prompt text that precedes this image
+};
+
+// completion token output with probabilities
+struct completion_token_output
+{
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    std::vector<token_prob> probs;
+    llama_token tok;
+    std::string text_to_send;
+};
+
+static inline void server_log(const char *level, const char *function, int line,
+                       const char *message, const nlohmann::ordered_json &extra)
+{
+    nlohmann::ordered_json log
+    {
+        {"timestamp", time(nullptr)},
+        {"level",     level},
+        {"function",  function},
+        {"line",      line},
+        {"message",   message},
+    };
+
+    if (!extra.empty())
+    {
+        log.merge_patch(extra);
+    }
+
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    printf("%.*s\n", (int)str.size(), str.data());
+    fflush(stdout);
+}
+
+//
+// server utils
+//
+
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
+}
+
+// Render a list of chat messages as a ChatML prompt. A missing or null "role"
+// falls back to "user" and a missing or null "content" to an empty string.
+inline std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream prompt;
+    for (const json & message : messages) {
+        const std::string role    = json_value(message, "role",    std::string("user"));
+        const std::string content = json_value(message, "content", std::string(""));
+        prompt << "<|im_start|>" << role << '\n'
+               << content << "<|im_end|>\n";
+    }
+    // the prompt ends with an opened assistant turn
+    prompt << "<|im_start|>assistant" << '\n';
+    return prompt.str();
+}
+
+//
+// work queue utils
+//
+
+struct llama_server_queue {
+    int id = 0; // next task id to hand out; incremented under mutex_tasks
+    std::mutex mutex_tasks;
+    // queues
+    std::vector<task_server> queue_tasks;
+    std::vector<task_server> queue_tasks_deferred;
+    std::vector<task_multi> queue_multitasks;
+    std::condition_variable condition_tasks;
+    // callback functions
+    std::function<void(task_server&)> callback_new_task;
+    std::function<void(task_multi&)> callback_finish_multitask;
+    std::function<void(void)> callback_all_task_finished;
+
+    // Add a new task to the end of the queue (assigning an id if it has none) and wake the loop; returns the task id
+    int post(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        if (task.id == -1) {
+            task.id = id++;
+        }
+        queue_tasks.push_back(std::move(task));
+        condition_tasks.notify_one();
+        return task.id; // NOTE(review): reads task after std::move; relies on the int member being left intact
+    }
+
+    // Add a new task, but defer until one slot is available
+    void defer(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        queue_tasks_deferred.push_back(std::move(task));
+    }
+
+    // Get the next id for creating a new task
+    int get_new_id() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return id++;
+    }
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(task_server&)> callback) {
+        callback_new_task = callback;
+    }
+
+    // Register function to process a multitask
+    void on_finish_multitask(std::function<void(task_multi&)> callback) {
+        callback_finish_multitask = callback;
+    }
+
+    // Register the function to be called when the batch of tasks is finished
+    void on_all_tasks_finished(std::function<void(void)> callback) {
+        callback_all_task_finished = callback;
+    }
+
+    // Call when the state of one slot is changed (does not signal condition_tasks)
+    void notify_slot_changed() {
+        // move deferred tasks back to main loop
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        for (auto & task : queue_tasks_deferred) {
+            queue_tasks.push_back(std::move(task));
+        }
+        queue_tasks_deferred.clear();
+    }
+
+    // Start the main loop. This call never returns: drain queue_tasks, finish completed multitasks, then sleep
+    [[noreturn]]
+    void start_loop() {
+        while (true) {
+            // new task arrived
+            LOG_VERBOSE("have new task", {});
+            {
+                while (true)
+                {
+                    std::unique_lock<std::mutex> lock(mutex_tasks);
+                    if (queue_tasks.empty()) {
+                        lock.unlock();
+                        break;
+                    }
+                    task_server task = queue_tasks.front();
+                    queue_tasks.erase(queue_tasks.begin());
+                    lock.unlock(); // release before the callback so it runs without mutex_tasks held
+                    LOG_VERBOSE("callback_new_task", {});
+                    callback_new_task(task);
+                }
+                LOG_VERBOSE("callback_all_task_finished", {});
+                // process and update all the multitasks. NOTE(review): queue_multitasks is walked here without mutex_tasks, unlike add_multitask/update_multitask - confirm this is safe
+                auto queue_iterator = queue_multitasks.begin();
+                while (queue_iterator != queue_multitasks.end())
+                {
+                    if (queue_iterator->subtasks_remaining.empty())
+                    {
+                        // all subtasks done == multitask is done
+                        task_multi current_multitask = *queue_iterator;
+                        callback_finish_multitask(current_multitask);
+                        // remove this multitask
+                        queue_iterator = queue_multitasks.erase(queue_iterator);
+                    }
+                    else
+                    {
+                        ++queue_iterator;
+                    }
+                }
+                // all tasks in the current loop are finished
+                callback_all_task_finished();
+            }
+            LOG_VERBOSE("wait for new task", {});
+            // wait for new task
+            {
+                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (queue_tasks.empty()) {
+                    condition_tasks.wait(lock, [&]{
+                        return !queue_tasks.empty();
+                    });
+                }
+            }
+        }
+    }
+
+    //
+    // functions to manage multitasks
+    //
+
+    // add a multitask by specifying the ids of all its subtasks (a subtask is a task_server)
+    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        task_multi multi;
+        multi.id = multitask_id;
+        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+        queue_multitasks.push_back(multi);
+    }
+
+    // update the remaining subtasks, while appending results to multitask
+    void update_multitask(int multitask_id, int subtask_id, task_result& result)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        for (auto& multitask : queue_multitasks)
+        {
+            if (multitask.id == multitask_id)
+            {
+                multitask.subtasks_remaining.erase(subtask_id);
+                multitask.results.push_back(result);
+            }
+        }
+    }
+};
+
+struct llama_server_response {
+    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
+    callback_multitask_t callback_update_multitask;
+    // ids of all tasks whose result some caller is (or will be) waiting for
+    std::set<int> waiting_task_ids;
+    // the main result queue
+    std::vector<task_result> queue_results;
+    std::mutex mutex_results; // guards waiting_task_ids and queue_results
+    std::condition_variable condition_results;
+
+    // Register task_id so that send() keeps results addressed to it
+    void add_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.insert(task_id);
+    }
+
+    // Stop tracking task_id; send() no longer queues results addressed to it
+    void remove_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.erase(task_id);
+    }
+
+    // Block the calling thread until a result for task_id has been queued,
+    // then remove that result from the queue and return it.
+    task_result recv(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        size_t found = 0;
+        // Sleep until *this* task's result is present. Waking on any non-empty
+        // queue would busy-spin while only other tasks' results are pending.
+        condition_results.wait(lock, [&]{
+            for (found = 0; found < queue_results.size(); found++) {
+                if (queue_results[found].id == task_id) {
+                    return true;
+                }
+            }
+            return false;
+        });
+        LOG_VERBOSE("condition_results unblock", {});
+
+        assert(queue_results[found].multitask_id == -1);
+        task_result res = queue_results[found];
+        queue_results.erase(queue_results.begin() + found);
+        return res;
+    }
+
+    // Register the function to update multitask
+    void on_multitask_update(callback_multitask_t callback) {
+        callback_update_multitask = callback;
+    }
+
+    // Send a new result to a waiting task_id; a result matching no waiting id is discarded
+    void send(task_result result) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        LOG_VERBOSE("send new result", {});
+        for (auto& task_id : waiting_task_ids) {
+            // LOG_TEE("waiting task id %i \n", task_id);
+            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+            if (result.multitask_id == task_id)
+            {
+                LOG_VERBOSE("callback_update_multitask", {});
+                callback_update_multitask(task_id, result.id, result);
+                continue;
+            }
+
+            if (result.id == task_id)
+            {
+                LOG_VERBOSE("queue_results.push_back", {});
+                queue_results.push_back(result);
+                condition_results.notify_all(); // several threads may be blocked in recv(); wake all so the owner is not missed
+                return;
+            }
+        }
+    }
+};
+
+//
+// base64 utils (TODO: move to common in the future)
+//
+
+static const std::string base64_chars =
+             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+             "abcdefghijklmnopqrstuvwxyz"
+             "0123456789+/";
+
+static inline bool is_base64(uint8_t c)
+{
+    return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
+{
+    int i = 0;
+    int j = 0;
+    int in_ = 0;
+
+    int in_len = encoded_string.size();
+
+    uint8_t char_array_4[4];
+    uint8_t char_array_3[3];
+
+    std::vector<uint8_t> ret;
+
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
+    {
+        char_array_4[i++] = encoded_string[in_]; in_++;
+        if (i == 4)
+        {
+            for (i = 0; i <4; i++)
+            {
+                char_array_4[i] = base64_chars.find(char_array_4[i]);
+            }
+
+            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+
+            for (i = 0; (i < 3); i++)
+            {
+                ret.push_back(char_array_3[i]);
+            }
+            i = 0;
+        }
+    }
+
+    if (i)
+    {
+        for (j = i; j <4; j++)
+        {
+            char_array_4[j] = 0;
+        }
+
+        for (j = 0; j <4; j++)
+        {
+            char_array_4[j] = base64_chars.find(char_array_4[j]);
+        }
+
+        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+
+        for (j = 0; (j < i - 1); j++)
+        {
+            ret.push_back(char_array_3[j]);
+        }
+    }
+
+    return ret;
+}
+
+//
+// random string / id
+//
+
+static std::string random_string()
+{
+    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
+    std::string result(32, ' ');
+
+    for (int i = 0; i < 32; ++i) {
+        result[i] = str[generator() % str.size()];
+    }
+
+    return result;
+}
+
+static std::string gen_chatcmplid()
+{
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+    return chatcmplid.str();
+}
--- a/backend/go/llm/dolly/main.go
+++ b/backend/go/llm/dolly/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &transformers.Dolly{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/falcon-ggml/main.go
+++ b/backend/go/llm/falcon-ggml/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &transformers.Falcon{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/gpt2/main.go
+++ b/backend/go/llm/gpt2/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &transformers.GPT2{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/gptj/main.go
+++ b/backend/go/llm/gptj/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &transformers.GPTJ{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/gptneox/main.go
+++ b/backend/go/llm/gptneox/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &transformers.GPTNeoX{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/mpt/main.go
+++ b/backend/go/llm/mpt/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &transformers.MPT{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/replit/main.go
+++ b/backend/go/llm/replit/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &transformers.Replit{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/starcoder/main.go
+++ b/backend/go/llm/starcoder/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &transformers.Starcoder{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/llm/transformers/dolly.go
+++ b/backend/go/llm/transformers/dolly.go
@@ -1,44 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type Dolly struct {
-	base.SingleThread
-
-	dolly *transformers.Dolly
-}
-
-func (llm *Dolly) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewDolly(opts.ModelFile)
-	llm.dolly = model
-	return err
-}
-
-func (llm *Dolly) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.dolly.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *Dolly) PredictStream(opts *pb.PredictOptions, results chan string) error {
-
-	go func() {
-		res, err := llm.dolly.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-
-	return nil
-}
--- a/backend/go/llm/transformers/falcon.go
+++ b/backend/go/llm/transformers/falcon.go
@@ -1,43 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type Falcon struct {
-	base.SingleThread
-
-	falcon *transformers.Falcon
-}
-
-func (llm *Falcon) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewFalcon(opts.ModelFile)
-	llm.falcon = model
-	return err
-}
-
-func (llm *Falcon) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.falcon.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *Falcon) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.falcon.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-
-	return nil
-}
--- a/backend/go/llm/transformers/gpt2.go
+++ b/backend/go/llm/transformers/gpt2.go
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type GPT2 struct {
-	base.SingleThread
-
-	gpt2 *transformers.GPT2
-}
-
-func (llm *GPT2) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.New(opts.ModelFile)
-	llm.gpt2 = model
-	return err
-}
-
-func (llm *GPT2) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.gpt2.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *GPT2) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.gpt2.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
--- a/backend/go/llm/transformers/gptj.go
+++ b/backend/go/llm/transformers/gptj.go
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type GPTJ struct {
-	base.SingleThread
-
-	gptj *transformers.GPTJ
-}
-
-func (llm *GPTJ) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewGPTJ(opts.ModelFile)
-	llm.gptj = model
-	return err
-}
-
-func (llm *GPTJ) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.gptj.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *GPTJ) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.gptj.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
--- a/backend/go/llm/transformers/gptneox.go
+++ b/backend/go/llm/transformers/gptneox.go
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type GPTNeoX struct {
-	base.SingleThread
-
-	gptneox *transformers.GPTNeoX
-}
-
-func (llm *GPTNeoX) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewGPTNeoX(opts.ModelFile)
-	llm.gptneox = model
-	return err
-}
-
-func (llm *GPTNeoX) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.gptneox.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *GPTNeoX) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.gptneox.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
--- a/backend/go/llm/transformers/mpt.go
+++ b/backend/go/llm/transformers/mpt.go
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type MPT struct {
-	base.SingleThread
-
-	mpt *transformers.MPT
-}
-
-func (llm *MPT) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewMPT(opts.ModelFile)
-	llm.mpt = model
-	return err
-}
-
-func (llm *MPT) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.mpt.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *MPT) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.mpt.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
--- a/backend/go/llm/transformers/predict.go
+++ b/backend/go/llm/transformers/predict.go
@@ -1,26 +0,0 @@
-package transformers
-
-import (
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-func buildPredictOptions(opts *pb.PredictOptions) []transformers.PredictOption {
-	predictOptions := []transformers.PredictOption{
-		transformers.SetTemperature(float64(opts.Temperature)),
-		transformers.SetTopP(float64(opts.TopP)),
-		transformers.SetTopK(int(opts.TopK)),
-		transformers.SetTokens(int(opts.Tokens)),
-		transformers.SetThreads(int(opts.Threads)),
-	}
-
-	if opts.Batch != 0 {
-		predictOptions = append(predictOptions, transformers.SetBatch(int(opts.Batch)))
-	}
-
-	if opts.Seed != 0 {
-		predictOptions = append(predictOptions, transformers.SetSeed(int(opts.Seed)))
-	}
-
-	return predictOptions
-}
--- a/backend/go/llm/transformers/replit.go
+++ b/backend/go/llm/transformers/replit.go
@@ -1,42 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type Replit struct {
-	base.SingleThread
-
-	replit *transformers.Replit
-}
-
-func (llm *Replit) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewReplit(opts.ModelFile)
-	llm.replit = model
-	return err
-}
-
-func (llm *Replit) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.replit.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *Replit) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.replit.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-	return nil
-}
--- a/backend/go/llm/transformers/starcoder.go
+++ b/backend/go/llm/transformers/starcoder.go
@@ -1,43 +0,0 @@
-package transformers
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-)
-
-type Starcoder struct {
-	base.SingleThread
-
-	starcoder *transformers.Starcoder
-}
-
-func (llm *Starcoder) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewStarcoder(opts.ModelFile)
-	llm.starcoder = model
-	return err
-}
-
-func (llm *Starcoder) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.starcoder.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-// fallback to Predict
-func (llm *Starcoder) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	go func() {
-		res, err := llm.starcoder.Predict(opts.Prompt, buildPredictOptions(opts)...)
-
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		results <- res
-		close(results)
-	}()
-
-	return nil
-}
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -8,24 +8,24 @@ import (

 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	"github.com/go-audio/wav"
-	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/go-skynet/LocalAI/core/schema"
 )

-func sh(c string) (string, error) {
-	cmd := exec.Command("/bin/sh", "-c", c)
+func runCommand(command []string) (string, error) {
+	cmd := exec.Command(command[0], command[1:]...)
 	cmd.Env = os.Environ()
-	o, err := cmd.CombinedOutput()
-	return string(o), err
+	out, err := cmd.CombinedOutput()
+	return string(out), err
 }

-// AudioToWav converts audio to wav for transcribe. It bashes out to ffmpeg
+// audioToWav converts src to a 16 kHz mono pcm_s16le wav at dst by running ffmpeg directly (no shell).
 // TODO: use https://github.com/mccoyst/ogg?
 func audioToWav(src, dst string) error {
-	out, err := sh(fmt.Sprintf("ffmpeg -i %s -format s16le -ar 16000 -ac 1 -acodec pcm_s16le %s", src, dst))
+	command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
+	out, err := runCommand(command)
 	if err != nil {
 		return fmt.Errorf("error: %w out: %s", err, out)
 	}
-
 	return nil
 }

--- a/backend/go/transcribe/whisper.go
+++ b/backend/go/transcribe/whisper.go
@@ -4,7 +4,7 @@ package main
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
-	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/go-skynet/LocalAI/core/schema"
 	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
 )
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -1,5 +1,4 @@
 .PHONY: autogptq
 autogptq:
-	@echo "Creating virtual environment..."
-	@conda env create --name autogptq --file autogptq.yml
-	@echo "Virtual environment created."
+	$(MAKE) -C ../common-env/transformers
+
--- a/backend/python/autogptq/autogptq.py
+++ b/backend/python/autogptq/autogptq.py
@@ -33,7 +33,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            model = AutoGPTQForCausalLM.from_quantized(request.Model,
                    model_basename=request.ModelBaseName,
                    use_safetensors=True,
-                    trust_remote_code=True,
+                    trust_remote_code=request.TrustRemoteCode,
                    device=device,
                    use_triton=request.UseTriton,
                    quantize_config=None)
--- a/backend/python/autogptq/autogptq.yml
+++ b/backend/python/autogptq/autogptq.yml
@@ -71,7 +71,7 @@ dependencies:
      - regex==2023.10.3
      - requests==2.31.0
      - rouge==1.0.1
-      - safetensors==0.3.3
+      - safetensors>=0.3.3
      - six==1.16.0
      - sympy==1.12
      - tokenizers==0.14.0
--- a/backend/python/autogptq/backend_pb2.py
+++ b/backend/python/autogptq/backend_pb2.py
--- a/backend/python/autogptq/run.sh
+++ b/backend/python/autogptq/run.sh
@@ -6,7 +6,7 @@
 export PATH=$PATH:/opt/conda/bin

 # Activate conda environment
-source activate autogptq
+source activate transformers

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/bark/backend_pb2.py
+++ b/backend/python/bark/backend_pb2.py
--- a/backend/python/common-env/transformers/Makefile
+++ b/backend/python/common-env/transformers/Makefile
@@ -4,6 +4,17 @@ ifeq ($(BUILD_TYPE), cublas)
 	CONDA_ENV_PATH = "transformers-nvidia.yml"
 endif

+ifeq ($(BUILD_TYPE), hipblas)
+	CONDA_ENV_PATH = "transformers-rocm.yml"
+endif
+
+# Intel GPUs are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: transformers
 transformers:
 	@echo "Installing $(CONDA_ENV_PATH)..."
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@@ -1,15 +1,38 @@
 #!/bin/bash
 set -ex

+SKIP_CONDA=${SKIP_CONDA:-0}
+
 # Check if environment exist
 conda_env_exists(){
    ! conda list --name "${@}" >/dev/null 2>/dev/null
 }

-if conda_env_exists "transformers" ; then
-    echo "Creating virtual environment..."
-    conda env create --name transformers --file $1
-    echo "Virtual environment created."
-else 
-    echo "Virtual environment already exists."
+if [ $SKIP_CONDA -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "transformers" ; then
+        echo "Creating virtual environment..."
+        conda env create --name transformers --file $1
+        echo "Virtual environment created."
+    else 
+        echo "Virtual environment already exists."
+    fi
 fi
+
+if [ -d "/opt/intel" ]; then
+    # Intel GPU: If the directory exists, we assume we are using the intel image
+    # (no conda env)
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed
+fi
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    if [ $SKIP_CONDA -eq 0 ]; then
+        # Activate conda environment
+        source activate transformers
+    fi
+
+    pip cache purge
+fi
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@@ -30,12 +30,14 @@ dependencies:
      - async-timeout==4.0.3
      - attrs==23.1.0
      - bark==0.1.5
+      - bitsandbytes==0.43.0
      - boto3==1.28.61
      - botocore==1.31.61
      - certifi==2023.7.22
+      - TTS==0.22.0
      - charset-normalizer==3.3.0
      - datasets==2.14.5
-      - sentence-transformers==2.2.2
+      - sentence-transformers==2.5.1 # Updated Version
      - sentencepiece==0.1.99
      - dill==0.3.7
      - einops==0.7.0
@@ -45,7 +47,7 @@ dependencies:
      - fsspec==2023.6.0
      - funcy==2.0
      - grpcio==1.59.0
-      - huggingface-hub==0.16.4
+      - huggingface-hub
      - idna==3.4
      - jinja2==3.1.2
      - jmespath==1.0.1
@@ -70,7 +72,6 @@ dependencies:
      - packaging==23.2
      - pandas
      - peft==0.5.0
-      - git+https://github.com/bigscience-workshop/petals
      - protobuf==4.24.4
      - psutil==5.9.5
      - pyarrow==13.0.0
@@ -81,21 +82,20 @@ dependencies:
      - requests==2.31.0
      - rouge==1.0.1
      - s3transfer==0.7.0
-      - safetensors==0.3.3
-      - scipy==1.11.3
+      - safetensors>=0.4.1
+      - scipy==1.12.0 # Updated Version
      - six==1.16.0
      - sympy==1.12
-      - tokenizers==0.14.0
-      - torch==2.1.0
-      - torchaudio==2.1.0
+      - tokenizers
+      - torch==2.1.2
+      - torchaudio==2.1.2
      - tqdm==4.66.1
-      - transformers==4.34.0
-      - TTS==0.22.0
      - triton==2.1.0
      - typing-extensions==4.8.0
      - tzdata==2023.3
      - urllib3==1.26.17
      - xxhash==3.4.1
+      - auto-gptq==0.6.0
      - yarl==1.9.2
      - soundfile
      - langid
@@ -114,4 +114,7 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
+      - vllm==0.3.2
+      - transformers>=4.38.2  # Updated Version
+      - xformers==0.0.23.post1  
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers-rocm.yml
+++ b/backend/python/common-env/transformers/transformers-rocm.yml
@@ -1,4 +1,4 @@
-name: vllm
+name: transformers
 channels:
  - defaults
 dependencies:
@@ -24,76 +24,86 @@ dependencies:
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  - pip:
+      - --pre
+      - --extra-index-url https://download.pytorch.org/whl/nightly/
+      - accelerate==0.23.0
+      - aiohttp==3.8.5
      - aiosignal==1.3.1
-      - anyio==3.7.1
+      - async-timeout==4.0.3
      - attrs==23.1.0
+      - bark==0.1.5
+      - boto3==1.28.61
+      - botocore==1.31.61
      - certifi==2023.7.22
+      - TTS==0.22.0
      - charset-normalizer==3.3.0
-      - click==8.1.7
-      - cmake==3.27.6
-      - fastapi==0.103.2
+      - datasets==2.14.5
+      - sentence-transformers==2.5.1 # Updated Version
+      - sentencepiece==0.1.99
+      - dill==0.3.7
+      - einops==0.7.0
+      - encodec==0.1.1
      - filelock==3.12.4
      - frozenlist==1.4.0
-      - fsspec==2023.9.2
+      - fsspec==2023.6.0
+      - funcy==2.0
      - grpcio==1.59.0
-      - h11==0.14.0
-      - httptools==0.6.0
-      - huggingface-hub==0.17.3
+      - huggingface-hub
      - idna==3.4
      - jinja2==3.1.2
-      - jsonschema==4.19.1
-      - jsonschema-specifications==2023.7.1
-      - lit==17.0.2
+      - jmespath==1.0.1
      - markupsafe==2.1.3
      - mpmath==1.3.0
-      - msgpack==1.0.7
-      - networkx==3.1
-      - ninja==1.11.1
+      - multidict==6.0.4
+      - multiprocess==0.70.15
+      - networkx
      - numpy==1.26.0
-      - nvidia-cublas-cu11==11.10.3.66
-      - nvidia-cuda-cupti-cu11==11.7.101
-      - nvidia-cuda-nvrtc-cu11==11.7.99
-      - nvidia-cuda-runtime-cu11==11.7.99
-      - nvidia-cudnn-cu11==8.5.0.96
-      - nvidia-cufft-cu11==10.9.0.58
-      - nvidia-curand-cu11==10.2.10.91
-      - nvidia-cusolver-cu11==11.4.0.1
-      - nvidia-cusparse-cu11==11.7.4.91
-      - nvidia-nccl-cu11==2.14.3
-      - nvidia-nvtx-cu11==11.7.91
      - packaging==23.2
-      - pandas==2.1.1
+      - pandas
+      - peft==0.5.0
      - protobuf==4.24.4
      - psutil==5.9.5
      - pyarrow==13.0.0
-      - pydantic==1.10.13
      - python-dateutil==2.8.2
-      - python-dotenv==1.0.0
      - pytz==2023.3.post1
      - pyyaml==6.0.1
-      - ray==2.7.0
-      - referencing==0.30.2
      - regex==2023.10.3
      - requests==2.31.0
-      - rpds-py==0.10.4
-      - safetensors==0.4.0
-      - sentencepiece==0.1.99
+      - rouge==1.0.1
+      - s3transfer==0.7.0
+      - safetensors>=0.4.1
+      - scipy==1.12.0 # Updated Version
      - six==1.16.0
-      - sniffio==1.3.0
-      - starlette==0.27.0
      - sympy==1.12
-      - tokenizers==0.14.1
-      - torch==2.0.1
+      - tokenizers
+      - torch
+      - torchaudio
      - tqdm==4.66.1
-      - transformers==4.34.0
-      - triton==2.0.0
+      - triton==2.1.0
      - typing-extensions==4.8.0
      - tzdata==2023.3
-      - urllib3==2.0.6
-      - uvicorn==0.23.2
-      - uvloop==0.17.0
-      - vllm==0.2.0
-      - watchfiles==0.20.0
-      - websockets==11.0.3
-      - xformers==0.0.22
-prefix: /opt/conda/envs/vllm
+      - auto-gptq==0.6.0
+      - urllib3==1.26.17
+      - xxhash==3.4.1
+      - yarl==1.9.2
+      - soundfile
+      - langid
+      - wget
+      - unidecode
+      - pyopenjtalk-prebuilt
+      - pypinyin
+      - inflect
+      - cn2an
+      - jieba
+      - eng_to_ipa
+      - openai-whisper
+      - matplotlib
+      - gradio==3.41.2
+      - nltk
+      - sudachipy
+      - sudachidict_core
+      - vocos
+      - vllm==0.3.2
+      - transformers>=4.38.2  # Updated Version
+      - xformers==0.0.23.post1
+prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers.yml
+++ b/backend/python/common-env/transformers/transformers.yml
@@ -36,7 +36,7 @@ dependencies:
      - TTS==0.22.0
      - charset-normalizer==3.3.0
      - datasets==2.14.5
-      - sentence-transformers==2.2.2
+      - sentence-transformers==2.5.1 # Updated Version
      - sentencepiece==0.1.99
      - dill==0.3.7
      - einops==0.7.0
@@ -46,7 +46,7 @@ dependencies:
      - fsspec==2023.6.0
      - funcy==2.0
      - grpcio==1.59.0
-      - huggingface-hub==0.16.4
+      - huggingface-hub
      - idna==3.4
      - jinja2==3.1.2
      - jmespath==1.0.1
@@ -59,7 +59,6 @@ dependencies:
      - packaging==23.2
      - pandas
      - peft==0.5.0
-      - git+https://github.com/bigscience-workshop/petals
      - protobuf==4.24.4
      - psutil==5.9.5
      - pyarrow==13.0.0
@@ -70,18 +69,18 @@ dependencies:
      - requests==2.31.0
      - rouge==1.0.1
      - s3transfer==0.7.0
-      - safetensors==0.3.3
-      - scipy==1.11.3
+      - safetensors>=0.4.1
+      - scipy==1.12.0 # Updated Version
      - six==1.16.0
      - sympy==1.12
-      - tokenizers==0.14.0
-      - torch==2.1.0
-      - torchaudio==2.1.0
+      - tokenizers
+      - torch==2.1.2
+      - torchaudio==2.1.2
      - tqdm==4.66.1
-      - transformers==4.34.0
      - triton==2.1.0
      - typing-extensions==4.8.0
      - tzdata==2023.3
+      - auto-gptq==0.6.0
      - urllib3==1.26.17
      - xxhash==3.4.1
      - yarl==1.9.2
@@ -102,4 +101,7 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-prefix: /opt/conda/envs/transformers
+      - vllm==0.3.2
+      - transformers>=4.38.2  # Updated Version
+      - xformers==0.0.23.post1  
+prefix: /opt/conda/envs/transformers
--- a/backend/python/coqui/backend_pb2.py
+++ b/backend/python/coqui/backend_pb2.py
--- a/backend/python/coqui/coqui_server.py
+++ b/backend/python/coqui/coqui_server.py
@@ -21,7 +21,7 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
-COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', 'en')
+COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None)

 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
@@ -33,11 +33,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    def LoadModel(self, request, context):

        # Get device
-        device = "cuda" if request.CUDA else "cpu"
+        # device = "cuda" if request.CUDA else "cpu"
+        if torch.cuda.is_available():
+            print("CUDA is available", file=sys.stderr)
+            device = "cuda"
+        else:
+            print("CUDA is not available", file=sys.stderr)
+            device = "cpu"

        if not torch.cuda.is_available() and request.CUDA:
            return backend_pb2.Result(success=False, message="CUDA is not available")

+        self.AudioPath = None
        # List available 🐸TTS models
        print(TTS().list_models())
        if os.path.isabs(request.AudioPath):
--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@@ -1,8 +1,20 @@
+export CONDA_ENV_PATH = "diffusers.yml"
+
+ifeq ($(BUILD_TYPE), hipblas)
+export CONDA_ENV_PATH = "diffusers-rocm.yml"
+endif
+
+# Intel GPUs are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: diffusers
 diffusers:
-	@echo "Creating virtual environment..."
-	@conda env create --name diffusers --file diffusers.yml
-	@echo "Virtual environment created."
+	@echo "Installing $(CONDA_ENV_PATH)..."
+	bash install.sh $(CONDA_ENV_PATH)

 .PHONY: run
 run:
@@ -11,4 +23,4 @@ run:
 	@echo "Diffusers run."

 test:
-	bash test.sh
+	bash test.sh
--- a/backend/python/diffusers/backend_diffusers.py
+++ b/backend/python/diffusers/backend_diffusers.py
@@ -21,14 +21,15 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipelin
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image,export_to_video
-from compel import Compel
+from compel import Compel, ReturnedEmbeddingsType

 from transformers import CLIPTextModel
 from safetensors.torch import load_file


 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
-COMPEL=os.environ.get("COMPEL", "1") == "1"
+COMPEL=os.environ.get("COMPEL", "0") == "1"
+XPU=os.environ.get("XPU", "0") == "1"
 CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1"
 SAFETENSORS=os.environ.get("SAFETENSORS", "1") == "1"
 CHUNK_SIZE=os.environ.get("CHUNK_SIZE", "8")
@@ -36,6 +37,10 @@ FPS=os.environ.get("FPS", "7")
 DISABLE_CPU_OFFLOAD=os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
 FRAMES=os.environ.get("FRAMES", "64")

+if XPU:
+    import intel_extension_for_pytorch as ipex
+    print(ipex.xpu.get_device_name(0))
+
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

@@ -231,8 +236,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            if request.SchedulerType != "":
                self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config)
                
-            if not self.img2vid:
-                self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
+            if COMPEL:
+                self.compel = Compel(
+                    tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2 ], 
+                    text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2],
+                    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+                    requires_pooled=[False, True]
+                    )


            if request.ControlNet:
@@ -247,6 +257,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                self.pipe.to('cuda')
                if self.controlnet:
                    self.controlnet.to('cuda')
+            if XPU:
+                self.pipe = self.pipe.to("xpu")
            # Assume directory from request.ModelFile.
            # Only if request.LoraAdapter it's not an absolute path
            if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter:
@@ -386,8 +398,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        image = {}
        if COMPEL:
-            conditioning = self.compel.build_conditioning_tensor(prompt)
-            kwargs["prompt_embeds"]= conditioning
+            conditioning, pooled = self.compel.build_conditioning_tensor(prompt)
+            kwargs["prompt_embeds"] = conditioning
+            kwargs["pooled_prompt_embeds"] = pooled
            # pass the kwargs dictionary to the self.pipe method
            image = self.pipe(
                guidance_scale=self.cfg_scale,
--- a/backend/python/diffusers/backend_pb2.py
+++ b/backend/python/diffusers/backend_pb2.py
--- a/backend/python/diffusers/diffusers-rocm.yml
+++ b/backend/python/diffusers/diffusers-rocm.yml
@@ -0,0 +1,64 @@
+name: diffusers
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.08.22=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.11=h7f8727e_2
+  - pip=23.2.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.0.0=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - tzdata=2023c=h04d1e81_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.2=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+      - --pre
+      - --extra-index-url https://download.pytorch.org/whl/nightly/
+      - accelerate>=0.11.0
+      - certifi==2023.7.22
+      - charset-normalizer==3.3.0
+      - compel==2.0.2
+      - diffusers==0.24.0
+      - filelock==3.12.4
+      - fsspec==2023.9.2
+      - grpcio==1.59.0
+      - huggingface-hub>=0.19.4
+      - idna==3.4
+      - importlib-metadata==6.8.0
+      - jinja2==3.1.2
+      - markupsafe==2.1.3
+      - mpmath==1.3.0
+      - networkx==3.1
+      - numpy==1.26.0
+      - omegaconf
+      - packaging==23.2
+      - pillow==10.0.1
+      - protobuf==4.24.4
+      - psutil==5.9.5
+      - pyparsing==3.1.1
+      - pyyaml==6.0.1
+      - regex==2023.10.3
+      - requests==2.31.0
+      - safetensors==0.4.0
+      - sympy==1.12
+      - tqdm==4.66.1
+      - transformers>=4.25.1
+      - triton==2.1.0
+      - typing-extensions==4.8.0
+      - urllib3==2.0.6
+      - zipp==3.17.0
+      - torch
+prefix: /opt/conda/envs/diffusers
--- a/backend/python/diffusers/diffusers.yml
+++ b/backend/python/diffusers/diffusers.yml
@@ -71,4 +71,4 @@ dependencies:
      - typing-extensions==4.8.0
      - urllib3==2.0.6
      - zipp==3.17.0
-prefix: /opt/conda/envs/diffusers
+prefix: /opt/conda/envs/diffusers
--- a/backend/python/diffusers/install.sh
+++ b/backend/python/diffusers/install.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+set -ex
+
+SKIP_CONDA=${SKIP_CONDA:-0}
+
+# Returns success (0) when the named conda environment does NOT exist yet
+conda_env_exists(){
+    ! conda list --name "${@}" >/dev/null 2>/dev/null
+}
+
+if [ "$SKIP_CONDA" -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "diffusers" ; then
+        echo "Creating virtual environment..."
+        conda env create --name diffusers --file "$1"
+        echo "Virtual environment created."
+    else
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ -d "/opt/intel" ]; then
+    # Intel GPU: If the directory exists, we assume we are using the Intel image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    pip install torch==2.1.0a0 \
+                torchvision==0.16.0a0 \
+                torchaudio==2.1.0a0 \
+                intel-extension-for-pytorch==2.1.10+xpu \
+                --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+    # Specifiers containing '>' must be quoted, otherwise the shell treats them as an output redirection
+    pip install google-api-python-client \
+                grpcio \
+                grpcio-tools \
+                diffusers==0.24.0 \
+                "transformers>=4.25.1" \
+                accelerate \
+                compel==2.0.2 \
+                Pillow
+fi
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    if [ "$SKIP_CONDA" -ne 1 ]; then
+        # Activate conda environment
+        source activate diffusers
+    fi
+
+    pip cache purge
+fi
--- a/backend/python/diffusers/run.sh
+++ b/backend/python/diffusers/run.sh
@@ -3,10 +3,15 @@
 ##
 ## A bash script wrapper that runs the diffusers server with conda

-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate diffusers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate diffusers
+fi

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -1,9 +1,8 @@
+export CONDA_ENV_PATH = "exllama.yml"
+
 .PHONY: exllama
 exllama:
-	@echo "Creating virtual environment..."
-	@conda env create --name exllama --file exllama.yml
-	@echo "Virtual environment created."
-	bash install.sh
+	bash install.sh ${CONDA_ENV_PATH}

 .PHONY: run
 run:
--- a/backend/python/exllama/backend_pb2.py
+++ b/backend/python/exllama/backend_pb2.py
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -1,15 +1,32 @@
 #!/bin/bash
+set -ex

-##
-## A bash script installs the required dependencies of VALL-E-X and prepares the environment
 export PATH=$PATH:/opt/conda/bin

-# Activate conda environment
+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[exllama] Attention!!! Nvidia GPU is required - skipping installation"
+    exit 0
+fi
+
+# Check if environment exist
+conda_env_exists(){
+    ! conda list --name "${@}" >/dev/null 2>/dev/null
+}
+
+if conda_env_exists "exllama" ; then
+    echo "Creating virtual environment..."
+    conda env create --name exllama --file $1
+    echo "Virtual environment created."
+else
+    echo "Virtual environment already exists."
+fi
+
 source activate exllama

-echo $CONDA_PREFIX
-
-
 git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd

-cp -rfv $CONDA_PREFIX/exllama/* ./
+cp -rfv $CONDA_PREFIX/exllama/* ./
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    pip cache purge
+fi
--- a/backend/python/exllama/run.sh
+++ b/backend/python/exllama/run.sh
@@ -2,7 +2,6 @@

 ##
 ## A bash script wrapper that runs the exllama server with conda
-
 export PATH=$PATH:/opt/conda/bin

 # Activate conda environment
--- a/backend/python/exllama2/Makefile
+++ b/backend/python/exllama2/Makefile
@@ -1,8 +1,6 @@
 .PHONY: exllama2
 exllama2:
-	@echo "Creating virtual environment..."
-	@conda env create --name exllama2 --file exllama2.yml
-	@echo "Virtual environment created."
+	$(MAKE) -C ../common-env/transformers
 	bash install.sh

 .PHONY: run
--- a/backend/python/exllama2/backend_pb2.py
+++ b/backend/python/exllama2/backend_pb2.py
--- a/backend/python/exllama2/install.sh
+++ b/backend/python/exllama2/install.sh
@@ -1,14 +1,32 @@
 #!/bin/bash
-
+set -e
 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin
+export SHA=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f

-# Activate conda environment
-source activate exllama2
+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[exllamav2] Attention!!! Nvidia GPU is required - skipping installation"
+    exit 0
+fi
+
+export PATH=$PATH:/opt/conda/bin
+source activate transformers

 echo $CONDA_PREFIX

-git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2 && pushd $CONDA_PREFIX/exllamav2 && pip install -r requirements.txt && popd
+git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2

-cp -rfv $CONDA_PREFIX/exllamav2/* ./  
+pushd $CONDA_PREFIX/exllamav2
+
+git checkout -b build $SHA
+
+# TODO: this needs to be pinned within the conda environments
+pip install -r requirements.txt
+
+popd
+
+cp -rfv $CONDA_PREFIX/exllamav2/* ./  
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    pip cache purge
+fi
--- a/backend/python/exllama2/run.sh
+++ b/backend/python/exllama2/run.sh
@@ -6,7 +6,7 @@
 export PATH=$PATH:/opt/conda/bin

 # Activate conda environment
-source activate exllama2
+source activate transformers

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/mamba/Makefile
+++ b/backend/python/mamba/Makefile
@@ -0,0 +1,16 @@
+.PHONY: mamba
+mamba:
+	$(MAKE) -C ../common-env/transformers
+	bash install.sh
+
+.PHONY: run
+run:
+	@echo "Running mamba..."
+	bash run.sh
+	@echo "mamba run."
+
+.PHONY: test
+test:
+	@echo "Testing mamba..."
+	bash test.sh
+	@echo "mamba tested."
--- a/backend/python/mamba/README.md
+++ b/backend/python/mamba/README.md
@@ -0,0 +1,5 @@
+# Creating a separate environment for the mamba project
+
+```
+make mamba
+```
--- a/backend/python/mamba/backend_mamba.py
+++ b/backend/python/mamba/backend_mamba.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+from concurrent import futures
+import time
+import argparse
+import signal
+import sys
+import os
+
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If PYTHON_GRPC_MAX_WORKERS is set in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+MAMBA_CHAT= os.environ.get('MAMBA_CHAT', '1') == '1'
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer that implements the Backend service defined in backend.proto.
+    """
+    def generate(self,prompt, max_new_tokens):
+        """
+        Generates text token by token through ``self.generator``, stopping at EOS or after ``max_new_tokens`` tokens.
+
+        Args:
+            prompt (str): The prompt to generate text from.
+            max_new_tokens (int): The maximum number of new tokens to generate.
+
+        Returns:
+            str: The generated text.
+        """
+        self.generator.end_beam_search()
+
+        # Tokenize the prompt. NOTE(review): nothing in this class assigns self.generator (LoadModel only sets self.tokenizer and self.model) - confirm this method is still used
+        ids = self.generator.tokenizer.encode(prompt)
+
+        self.generator.gen_begin_reuse(ids)
+        initial_len = self.generator.sequence[0].shape[0]
+        has_leading_space = False
+        decoded_text = ''
+        for i in range(max_new_tokens):
+            token = self.generator.gen_single_token()
+            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
+                has_leading_space = True
+
+            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
+            if has_leading_space:
+                decoded_text = ' ' + decoded_text
+
+            if token.item() == self.generator.tokenizer.eos_token_id:
+                break
+        return decoded_text
+
+    def Health(self, request, context):
+        """
+        Returns a health check message.
+
+        Args:
+            request: The health check request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Reply: The health check reply.
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """
+        Loads the tokenizer and the Mamba model (CUDA device, float16) named by the request.
+
+        Args:
+            request: The load model request; request.Tokenizer falls back to request.Model when empty.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Result: success=True once loaded, or success=False with the error text.
+        """
+        try:
+            tokenizerModel = request.Tokenizer
+            if tokenizerModel == "":
+                tokenizerModel = request.Model
+
+            tokenizer = AutoTokenizer.from_pretrained(tokenizerModel)
+            if MAMBA_CHAT:
+                tokenizer.eos_token = "<|endoftext|>"
+                tokenizer.pad_token = tokenizer.eos_token
+            self.tokenizer = tokenizer
+            self.model = MambaLMHeadModel.from_pretrained(request.Model, device="cuda", dtype=torch.float16)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def Predict(self, request, context):
+        """
+        Generates text based on the given prompt and sampling parameters.
+
+        Args:
+            request: The predict request (TopP defaults to 0.9 and Tokens to 2000 when left at 0).
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Reply: The generated text as UTF-8 bytes, with the echoed prompt removed.
+        """
+        if request.TopP == 0:
+            request.TopP = 0.9
+
+        max_tokens = request.Tokens
+
+        if request.Tokens == 0:
+            max_tokens = 2000
+
+        # Tokenize the prompt and move the input ids to the CUDA device
+        tokens = self.tokenizer(request.Prompt, return_tensors="pt")
+        input_ids = tokens.input_ids.to(device="cuda")
+        out = self.model.generate(input_ids=input_ids, max_length=max_tokens, temperature=request.Temperature,
+                                     top_p=request.TopP, eos_token_id=self.tokenizer.eos_token_id)
+
+        decoded = self.tokenizer.batch_decode(out)
+
+        generated_text = decoded[0]
+
+        # Strip only the first occurrence (the echoed prompt); later repeats are model output
+        if request.Prompt in generated_text:
+            generated_text = generated_text.replace(request.Prompt, "", 1)
+
+        return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
+
+    def PredictStream(self, request, context):
+        """
+        Streaming variant of Predict: runs Predict once and yields its complete reply as a single message.
+
+        Args:
+            request: The predict stream request.
+            context: The gRPC context.
+
+        Yields:
+            The single reply object returned by self.Predict for this request.
+        """
+        yield self.Predict(request, context)
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
--- a/backend/python/mamba/backend_pb2.py
+++ b/backend/python/mamba/backend_pb2.py
--- a/backend/python/mamba/backend_pb2_grpc.py
+++ b/backend/python/mamba/backend_pb2_grpc.py
@@ -0,0 +1,363 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+
+import backend_pb2 as backend__pb2
+
+
+class BackendStub(object):
+    """Client-side stub for the backend.Backend service: one callable attribute per RPC."""
+
+    def __init__(self, channel):
+        """Bind every backend.Backend RPC to the channel (PredictStream is unary-stream, the rest unary-unary).
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.Health = channel.unary_unary(
+                '/backend.Backend/Health',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Predict = channel.unary_unary(
+                '/backend.Backend/Predict',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.LoadModel = channel.unary_unary(
+                '/backend.Backend/LoadModel',
+                request_serializer=backend__pb2.ModelOptions.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.PredictStream = channel.unary_stream(
+                '/backend.Backend/PredictStream',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Embedding = channel.unary_unary(
+                '/backend.Backend/Embedding',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.EmbeddingResult.FromString,
+                )
+        self.GenerateImage = channel.unary_unary(
+                '/backend.Backend/GenerateImage',
+                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.AudioTranscription = channel.unary_unary(
+                '/backend.Backend/AudioTranscription',
+                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
+                response_deserializer=backend__pb2.TranscriptResult.FromString,
+                )
+        self.TTS = channel.unary_unary(
+                '/backend.Backend/TTS',
+                request_serializer=backend__pb2.TTSRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.TokenizeString = channel.unary_unary(
+                '/backend.Backend/TokenizeString',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.TokenizationResponse.FromString,
+                )
+        self.Status = channel.unary_unary(
+                '/backend.Backend/Status',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.StatusResponse.FromString,
+                )
+
+
+class BackendServicer(object):
+    """Base servicer for backend.Backend; every handler below is an unimplemented placeholder."""
+
+    def Health(self, request, context):
+        """Health (HealthMessage -> Reply): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Predict(self, request, context):
+        """Predict (PredictOptions -> Reply): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def LoadModel(self, request, context):
+        """LoadModel (ModelOptions -> Result): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def PredictStream(self, request, context):
+        """PredictStream (PredictOptions -> stream of Reply): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Embedding(self, request, context):
+        """Embedding (PredictOptions -> EmbeddingResult): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GenerateImage(self, request, context):
+        """GenerateImage (GenerateImageRequest -> Result): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def AudioTranscription(self, request, context):
+        """AudioTranscription (TranscriptRequest -> TranscriptResult): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def TTS(self, request, context):
+        """TTS (TTSRequest -> Result): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def TokenizeString(self, request, context):
+        """TokenizeString (PredictOptions -> TokenizationResponse): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Status(self, request, context):
+        """Status (HealthMessage -> StatusResponse): placeholder that reports UNIMPLEMENTED and raises NotImplementedError."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_BackendServicer_to_server(servicer, server):  # registers servicer's methods on server as service 'backend.Backend'
+    rpc_method_handlers = {  # RPC name -> handler pairing a servicer method with its request/response (de)serializers
+            'Health': grpc.unary_unary_rpc_method_handler(
+                    servicer.Health,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Predict': grpc.unary_unary_rpc_method_handler(
+                    servicer.Predict,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'LoadModel': grpc.unary_unary_rpc_method_handler(
+                    servicer.LoadModel,
+                    request_deserializer=backend__pb2.ModelOptions.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'PredictStream': grpc.unary_stream_rpc_method_handler(  # the only unary-stream handler; all others are unary-unary
+                    servicer.PredictStream,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Embedding': grpc.unary_unary_rpc_method_handler(
+                    servicer.Embedding,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
+            ),
+            'GenerateImage': grpc.unary_unary_rpc_method_handler(
+                    servicer.GenerateImage,
+                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
+                    servicer.AudioTranscription,
+                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
+                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
+            ),
+            'TTS': grpc.unary_unary_rpc_method_handler(
+                    servicer.TTS,
+                    request_deserializer=backend__pb2.TTSRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'TokenizeString': grpc.unary_unary_rpc_method_handler(
+                    servicer.TokenizeString,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
+            ),
+            'Status': grpc.unary_unary_rpc_method_handler(
+                    servicer.Status,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.StatusResponse.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(  # bundles the handlers under the service name
+            'backend.Backend', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))  # attaches the bundle to the server
+
+
+ # This class is part of an EXPERIMENTAL API.
+class Backend(object):
+    """Static helpers, one per backend.Backend RPC, that forward to grpc.experimental.unary_unary (unary_stream for PredictStream) against `target`."""
+
+    @staticmethod
+    def Health(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Predict(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def LoadModel(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
+            backend__pb2.ModelOptions.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def PredictStream(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Embedding(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.EmbeddingResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def GenerateImage(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
+            backend__pb2.GenerateImageRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def AudioTranscription(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
+            backend__pb2.TranscriptRequest.SerializeToString,
+            backend__pb2.TranscriptResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TTS(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
+            backend__pb2.TTSRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TokenizeString(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.TokenizationResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Status(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.StatusResponse.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/backend/python/mamba/install.sh
+++ b/backend/python/mamba/install.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -e
+## Installs the mamba backend's Python dependencies (causal-conv1d, mamba-ssm)
+## into the "transformers" conda environment; skipped unless BUILD_TYPE is cublas.
+
+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[mamba] Attention!!! nvcc is required - skipping installation"
+    exit 0
+fi
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate transformers
+
+echo "$CONDA_PREFIX"
+
+pip install causal-conv1d==1.0.0 mamba-ssm==1.0.1
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    pip cache purge
+fi
--- a/backend/python/mamba/run.sh
+++ b/backend/python/mamba/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+##
+## A bash script wrapper that runs the mamba backend server with conda
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python "$DIR/backend_mamba.py" "$@"
--- a/backend/python/mamba/test.sh
+++ b/backend/python/mamba/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+##
+## A bash script wrapper that runs the mamba backend unit tests with conda
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python -m unittest "$DIR/test_backend_mamba.py"
--- a/backend/python/mamba/test_backend_mamba.py
+++ b/backend/python/mamba/test_backend_mamba.py
@@ -0,0 +1,76 @@
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+import unittest
+import subprocess
+import time
+import grpc
+import backend_pb2_grpc
+import backend_pb2
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    Integration tests for the mamba gRPC backend.
+
+    unittest calls setUp() before and tearDown() after every test method, so
+    each test talks to its own server process on localhost:50051. The tests
+    must not call setUp()/tearDown() themselves: that would start a second
+    server on the same port and leave the first process running.
+    """
+    def setUp(self):
+        # Start the mamba backend server (the script path is resolved relative to the working directory).
+        self.service = subprocess.Popen(["python", "backend_mamba.py", "--addr", "localhost:50051"])
+        time.sleep(10)
+
+    def tearDown(self) -> None:
+        self.service.terminate()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts and answers the Health RPC
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                # NOTE(review): facebook/opt-125m is an OPT checkpoint - confirm the mamba backend can load it
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+
+    def test_text(self):
+        """
+        This method tests if text is generated successfully for a prompt
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+                req = backend_pb2.PredictOptions(Prompt="The capital of France is")
+                resp = stub.Predict(req)
+                self.assertIsNotNone(resp.message)
+        except Exception as err:
+            print(err)
+            self.fail("text service failed")
--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@@ -1,6 +1,8 @@
 .PHONY: petals
 petals:
-	$(MAKE) -C ../common-env/transformers
+	@echo "Creating virtual environment..."
+	bash install.sh "petals.yml"
+	@echo "Virtual environment created."

 .PHONY: run
 run:
--- a/backend/python/petals/backend_pb2.py
+++ b/backend/python/petals/backend_pb2.py
--- a/backend/python/petals/install.sh
+++ b/backend/python/petals/install.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Creates the "petals" conda environment from the environment file passed as $1.
+export PATH=$PATH:/opt/conda/bin
+
+conda env create --name petals --file "$1"
--- a/backend/python/petals/run.sh
+++ b/backend/python/petals/run.sh
@@ -5,14 +5,16 @@

 export PATH=$PATH:/opt/conda/bin

+CONDA_ENV=petals
+
 # Activate conda environment
 # if source is available use it, or use conda
 #
 if [ -f /opt/conda/bin/activate ]; then
-    source activate transformers
+    source activate $CONDA_ENV
 else
    eval "$(conda shell.bash hook)"
-    conda activate transformers
+    conda activate $CONDA_ENV
 fi

 # get the directory where the bash script is located
--- a/backend/python/petals/test.sh
+++ b/backend/python/petals/test.sh
@@ -3,7 +3,16 @@
 ## A bash script wrapper that runs the transformers server with conda

 # Activate conda environment
-source activate transformers
+CONDA_ENV=petals
+# Activate conda environment
+# if source is available use it, or use conda
+#
+if [ -f /opt/conda/bin/activate ]; then
+    source activate $CONDA_ENV
+else
+    eval "$(conda shell.bash hook)"
+    conda activate $CONDA_ENV
+fi

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/sentencetransformers/backend_pb2.py
+++ b/backend/python/sentencetransformers/backend_pb2.py
--- a/backend/python/transformers-musicgen/backend_pb2.py
+++ b/backend/python/transformers-musicgen/backend_pb2.py
--- a/backend/python/transformers/backend_pb2.py
+++ b/backend/python/transformers/backend_pb2.py
--- a/backend/python/transformers/run.sh
+++ b/backend/python/transformers/run.sh
@@ -3,10 +3,16 @@
 ##
 ## A bash script wrapper that runs the transformers server with conda

-export PATH=$PATH:/opt/conda/bin

-# Activate conda environment
-source activate transformers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate transformers
+fi

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/Show More
+++ b/Show More