Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 03:02:38 -05:00)

Compare commits: enable_gpu...v2.1.0 (97 commits)
| SHA1 |
|---|
| 3d83128f16 |
| 1c286c3c2f |
| 2f7beb6744 |
| ab0370a0b9 |
| 3f9a41684a |
| dd982acf2c |
| fb6a5bc620 |
| 7641f92cde |
| 72325fd0a3 |
| 1b7ed5e2e6 |
| 86fac272d8 |
| 865e523ff1 |
| 9aa2a7ca13 |
| e80cbca6b0 |
| 718a5d4a9e |
| 9222bec8b1 |
| 4a965e1b0e |
| 48e5380e45 |
| 831418612b |
| 89ff12309d |
| 3a4fb6fa4b |
| b181503c30 |
| 887b3dff04 |
| 3822bd2369 |
| 4de2c6a421 |
| 6c4231fd35 |
| adfa7aa1fa |
| 8b6e601405 |
| 6011911746 |
| 997119c27a |
| 2eb6865a27 |
| 2b2d6673ff |
| 563c5b7ea0 |
| 67966b623c |
| 9fc3fd04be |
| 238fec244a |
| 3d71bc9b64 |
| 3923024d84 |
| 710b195be1 |
| 6e408137ee |
| 9b205cfcfc |
| 42a80d1b8b |
| d6073ac18e |
| 1c450d46cf |
| 6b312a8522 |
| 2b2007ae9e |
| e94a34be8c |
| c3fb4b1d8e |
| e3ca1a7dbe |
| 2d64d8b444 |
| 9b98be160a |
| 9f708ff318 |
| 4e0ad33d92 |
| 519285bf38 |
| fd1b7b3f22 |
| 687730a7f5 |
| b7821361c3 |
| 63e1f8fffd |
| 824612f1b4 |
| 9482acfdfc |
| c75bdd99e4 |
| 6f34e8f044 |
| 6d187af643 |
| 97e9598c79 |
| 5a6a6de3d7 |
| b1a20effde |
| ba5ab26f2e |
| 69f53211a1 |
| 9dddd1134d |
| c5c77d2b0d |
| 763f94ca80 |
| 20d637e7b7 |
| 480b14c8dc |
| 999db4301a |
| 92cbc4d516 |
| ff9afdb0fe |
| 3e35b20a02 |
| 9ea371d6cd |
| 7a0f9767da |
| 9d7363f2a7 |
| 8ee5cf38fd |
| a6b788d220 |
| ccd87cd9f0 |
| b5af87fc6c |
| 3c9544b023 |
| 2f65671070 |
| 8c5436cbed |
| 548959b50f |
| 2addb9f99a |
| fdd95d1d86 |
| 66a558ff41 |
| 733b612eb2 |
| 991ecce004 |
| ad0e30bca5 |
| 55461188a4 |
| 5d2405fdef |
| e9f1268225 |
.env (19 lines changed)

@@ -69,4 +69,21 @@ MODELS_PATH=/models
# PYTHON_GRPC_MAX_WORKERS=1

### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
# LLAMACPP_PARALLEL=1

### Enable to run parallel requests
# PARALLEL_REQUESTS=true

### Watchdog settings
###
# Enables watchdog to kill backends that are inactive for too much time
# WATCHDOG_IDLE=true
#
# Enables watchdog to kill backends that are busy for too much time
# WATCHDOG_BUSY=true
#
# Time in duration format (e.g. 1h30m) after which a backend is considered idle
# WATCHDOG_IDLE_TIMEOUT=5m
#
# Time in duration format (e.g. 1h30m) after which a backend is considered busy
# WATCHDOG_BUSY_TIMEOUT=5m
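The watchdog and parallelism knobs added above are ordinary environment variables, so they can also be passed straight to the container instead of editing .env. A minimal sketch, with illustrative values and an assumed image tag rather than project defaults:

# Illustrative only: start LocalAI with parallel llama.cpp workers and the idle/busy watchdog enabled.
# The image tag below is an assumption; substitute whichever LocalAI image you actually run.
docker run -p 8080:8080 -v $PWD/models:/models \
  -e LLAMACPP_PARALLEL=2 \
  -e PARALLEL_REQUESTS=true \
  -e WATCHDOG_IDLE=true -e WATCHDOG_IDLE_TIMEOUT=10m \
  -e WATCHDOG_BUSY=true -e WATCHDOG_BUSY_TIMEOUT=1h \
  quay.io/go-skynet/local-ai:v2.1.0-ffmpeg-core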
.github/workflows/image.yml (210 lines changed)

@@ -14,8 +14,25 @@ concurrency:
cancel-in-progress: true

jobs:
image-build:
extras-image-build:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
secrets:
dockerUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
dockerPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
matrix:
include:
- build-type: ''
@@ -24,130 +41,117 @@ jobs:
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: 'cublas'
cuda-major-version: 11
cuda-minor-version: 7
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: 'cublas'
cuda-major-version: 12
cuda-minor-version: 1
cuda-major-version: "12"
cuda-minor-version: "1"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: 'cublas'
cuda-major-version: 11
cuda-minor-version: 7
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: 'cublas'
cuda-major-version: 12
cuda-minor-version: 1
cuda-major-version: "12"
cuda-minor-version: "1"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'

runs-on: arc-runner-set
steps:
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
# - name: Release space from worker
# run: |
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# df -h
# echo
# sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
# sudo apt-get remove --auto-remove android-sdk-platform-tools || true
# sudo apt-get purge --auto-remove android-sdk-platform-tools || true
# sudo rm -rf /usr/local/lib/android
# sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
# sudo rm -rf /usr/share/dotnet
# sudo apt-get remove -y '^mono-.*' || true
# sudo apt-get remove -y '^ghc-.*' || true
# sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
# sudo apt-get remove -y 'php.*' || true
# sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
# sudo apt-get remove -y '^google-.*' || true
# sudo apt-get remove -y azure-cli || true
# sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
# sudo apt-get remove -y '^gfortran-.*' || true
# sudo apt-get remove -y microsoft-edge-stable || true
# sudo apt-get remove -y firefox || true
# sudo apt-get remove -y powershell || true
# sudo apt-get remove -y r-base-core || true
# sudo apt-get autoremove -y
# sudo apt-get clean
# echo
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# sudo rm -rfv build || true
# df -h
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: quay.io/go-skynet/local-ai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=${{ matrix.tag-latest }}
suffix=${{ matrix.tag-suffix }}

- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all

- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master

- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}

- name: Build and push
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ matrix.build-type }}
CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
FFMPEG=${{ matrix.ffmpeg }}
context: .
file: ./Dockerfile
platforms: ${{ matrix.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
core-image-build:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
secrets:
dockerUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
dockerPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-core'
ffmpeg: ''
image-type: 'core'
runs-on: 'ubuntu-latest'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-core'
ffmpeg: ''
image-type: 'core'
runs-on: 'ubuntu-latest'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
.github/workflows/image_build.yml (new file, 147 lines)

@@ -0,0 +1,147 @@
---
name: 'build container images (reusable)'

on:
workflow_call:
inputs:
build-type:
description: 'Build type'
default: ''
type: string
cuda-major-version:
description: 'CUDA major version'
default: "11"
type: string
cuda-minor-version:
description: 'CUDA minor version'
default: "7"
type: string
platforms:
description: 'Platforms'
default: ''
type: string
tag-latest:
description: 'Tag latest'
default: ''
type: string
tag-suffix:
description: 'Tag suffix'
default: ''
type: string
ffmpeg:
description: 'FFMPEG'
default: ''
type: string
image-type:
description: 'Image type'
default: ''
type: string
runs-on:
description: 'Runs on'
required: true
default: ''
type: string
secrets:
dockerUsername:
required: true
dockerPassword:
required: true
jobs:
reusable_image-build:
runs-on: ${{ inputs.runs-on }}
steps:
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
# - name: Release space from worker
# run: |
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# df -h
# echo
# sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
# sudo apt-get remove --auto-remove android-sdk-platform-tools || true
# sudo apt-get purge --auto-remove android-sdk-platform-tools || true
# sudo rm -rf /usr/local/lib/android
# sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
# sudo rm -rf /usr/share/dotnet
# sudo apt-get remove -y '^mono-.*' || true
# sudo apt-get remove -y '^ghc-.*' || true
# sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
# sudo apt-get remove -y 'php.*' || true
# sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
# sudo apt-get remove -y '^google-.*' || true
# sudo apt-get remove -y azure-cli || true
# sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
# sudo apt-get remove -y '^gfortran-.*' || true
# sudo apt-get remove -y microsoft-edge-stable || true
# sudo apt-get remove -y firefox || true
# sudo apt-get remove -y powershell || true
# sudo apt-get remove -y r-base-core || true
# sudo apt-get autoremove -y
# sudo apt-get clean
# echo
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# sudo rm -rfv build || true
# df -h
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: quay.io/go-skynet/local-ai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }}

- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all

- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master

- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.dockerUsername }}
password: ${{ secrets.dockerPassword }}

- name: Build and push
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
context: .
file: ./Dockerfile
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: job summary
run: |
echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
.github/workflows/test-extra.yml (new file, 219 lines)

@@ -0,0 +1,219 @@
---
name: 'Tests extras backends'

on:
pull_request:
push:
branches:
- master
tags:
- '*'

concurrency:
group: ci-tests-extra-${{ github.head_ref || github.ref }}-${{ github.repository }}
cancel-in-progress: true

jobs:
tests-transformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

sudo rm -rfv /usr/bin/conda || true

- name: Test transformers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/transformers
make -C backend/python/transformers test

tests-sentencetransformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

sudo rm -rfv /usr/bin/conda || true

- name: Test sentencetransformers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/sentencetransformers
make -C backend/python/sentencetransformers test

tests-diffusers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

sudo rm -rfv /usr/bin/conda || true

- name: Test diffusers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/diffusers
make -C backend/python/diffusers test

tests-transformers-musicgen:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

sudo rm -rfv /usr/bin/conda || true

- name: Test transformers-musicgen
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/transformers-musicgen
make -C backend/python/transformers-musicgen test

tests-bark:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

sudo rm -rfv /usr/bin/conda || true

- name: Test bark
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/bark
make -C backend/python/bark test

# Below tests needs GPU. Commented out for now
# TODO: Re-enable as soon as we have GPU nodes
# tests-vllm:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
# sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
# gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# sudo rm -rfv /usr/bin/conda || true
# - name: Test vllm
# run: |
# export PATH=$PATH:/opt/conda/bin
# make -C backend/python/vllm
# make -C backend/python/vllm test
# tests-vallex:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
# sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
# gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# sudo rm -rfv /usr/bin/conda || true
# - name: Test vall-e-x
# run: |
# export PATH=$PATH:/opt/conda/bin
# make -C backend/python/vall-e-x
# make -C backend/python/vall-e-x test
.github/workflows/test.yml (9 lines changed)

@@ -78,13 +78,12 @@ jobs:
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

sudo rm -rfv /usr/bin/conda || true
PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers

# Pre-build piper before we start tests in order to have shared libraries in place
make go-piper && \
GO_TAGS="tts" make -C go-piper piper.o && \
sudo cp -rfv go-piper/piper/build/pi/lib/. /usr/lib/ && \

make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
.gitignore (10 lines changed)

@@ -1,15 +1,9 @@
# go-llama build artifacts
go-llama
go-llama-stable
/gpt4all
go-stable-diffusion
go-piper
/go-bert
go-ggllm
/piper
/sources/
__pycache__/
*.a
get-sources
prepare-sources
/backend/cpp/llama/grpc-server
/backend/cpp/llama/llama.cpp
.gitmodules (new file, 3 lines)

@@ -0,0 +1,3 @@
[submodule "docs/themes/hugo-theme-relearn"]
path = docs/themes/hugo-theme-relearn
url = https://github.com/McShelby/hugo-theme-relearn.git
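Because the docs theme now comes in as a git submodule, a fresh checkout needs it initialized before the documentation site can be built; a minimal sketch:

# Illustrative: fetch the newly added docs theme submodule after cloning.
git submodule update --init docs/themes/hugo-theme-relearn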
Dockerfile (71 lines changed)

@@ -12,7 +12,9 @@ ARG TARGETARCH
ARG TARGETVARIANT

ENV BUILD_TYPE=${BUILD_TYPE}
ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/run.sh,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/run.sh,vall-e-x:/build/extra/grpc/vall-e-x/run.sh,vllm:/build/extra/grpc/vllm/run.sh"

ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"

ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
ARG GO_TAGS="stablediffusion tts"

@@ -64,20 +66,10 @@ RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmo
apt-get update && \
apt-get install -y conda

COPY extra/requirements.txt /build/extra/requirements.txt
ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install --upgrade pip
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
#RUN if [ "${TARGETARCH}" = "amd64" ]; then \
# pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
# fi
#RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
# pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
# fi
#RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt

# Vall-e-X
RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt

# \
# ; fi

@@ -98,12 +90,9 @@ ENV NVIDIA_VISIBLE_DEVICES=all

WORKDIR /build

COPY Makefile .
RUN make get-sources
COPY go.mod .
RUN make prepare
COPY . .
COPY .git .
RUN make prepare

# stablediffusion does not tolerate a newer version of abseil, build it first
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

@@ -112,15 +101,15 @@ RUN if [ "${BUILD_GRPC}" = "true" ]; then \
git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && make -j12 install && rm -rf grpc \
../.. && make -j12 install \
; fi

# Rebuild with defaults backends
RUN make build

RUN if [ ! -d "/build/go-piper/piper/build/pi/lib/" ]; then \
mkdir -p /build/go-piper/piper/build/pi/lib/ \
touch /build/go-piper/piper/build/pi/lib/keep \
RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
; fi

###################################

@@ -154,49 +143,55 @@ WORKDIR /build
# see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
# https://github.com/go-skynet/LocalAI/pull/434
COPY . .
RUN make prepare-sources

COPY --from=builder /build/sources ./sources/
COPY --from=builder /build/grpc ./grpc/

RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc

# Copy the binary
COPY --from=builder /build/local-ai ./

# Copy shared libraries for piper
COPY --from=builder /build/go-piper/piper/build/pi/lib/* /usr/lib/
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/

# do not let stablediffusion rebuild (requires an older version of absl)
COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion

## Duplicated from Makefile to avoid having a big layer that's hard to push
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/autogptq \
PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/bark \
PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/diffusers \
PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/vllm \
PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface \
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/vall-e-x \
PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/exllama \
PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
; fi

# Copy VALLE-X as it's not a real "lib"
RUN if [ -d /usr/lib/vall-e-x ]; then \
cp -rfv /usr/lib/vall-e-x/* ./ ; \
fi

# we also copy exllama libs over to resolve exllama import error
RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
fi

# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
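The Dockerfile consumes the same build arguments that the reusable workflow above passes in (BUILD_TYPE, CUDA_MAJOR_VERSION, CUDA_MINOR_VERSION, FFMPEG, IMAGE_TYPE); a minimal sketch of an equivalent local build, with an illustrative image tag:

# Illustrative: build a CUDA 12 'extras' image locally, mirroring the CI build-args.
docker build \
  --build-arg BUILD_TYPE=cublas \
  --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=1 \
  --build-arg IMAGE_TYPE=extras \
  -t local-ai:dev .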
Entitlements.plist (new file, 10 lines)

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.security.network.client</key>
<true/>
<key>com.apple.security.network.server</key>
<true/>
</dict>
</plist>
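These entitlements are what the new osx-signed Makefile target (further down in this diff) feeds to codesign; after signing, the result can be inspected with codesign itself. A minimal sketch, assuming the default binary name local-ai:

# Illustrative: sign the macOS binary with the entitlements above, then print what was embedded.
make osx-signed
codesign -d --entitlements - ./local-ai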
Makefile (354 lines changed)

@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0

GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

CPPLLAMA_VERSION?=a75fa576abba9d37f463580c379e4bbf1e1ad03c
CPPLLAMA_VERSION?=88ae8952b65cbf32eb1f5703681ea592e510e570

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -22,20 +22,21 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=c898cd0f62df8f2a7830e53d1d513bef4f6f792b

# whisper.cpp version
WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273
WHISPER_CPP_VERSION?=940de9dbe9c90624dc99521cb34c8a97b86d543c

# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

# go-piper version
PIPER_VERSION?=736f6fb639ab8e3397356e48eeb6bdcb9da88a78
PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07

# stablediffusion version
STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
STABLEDIFFUSION_VERSION?=902db5f066fd137697e3b69d0fa10d4782bd2c2f

export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=

CGO_LDFLAGS?=
CUDA_LIBPATH?=/usr/local/cuda/lib64/
GO_TAGS?=
@@ -68,29 +69,39 @@ ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif

ifeq ($(UNAME_S),Darwin)
ifeq ($(OS),Darwin)
CGO_LDFLAGS += -lcblas -framework Accelerate
ifneq ($(BUILD_TYPE),metal)
# explicit disable metal if on Darwin and metal is disabled
CMAKE_ARGS+=-DLLAMA_METAL=OFF
endif
ifeq ($(OSX_SIGNING_IDENTITY),)
OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
endif

# on OSX, if BUILD_TYPE is blank, we should default to use Metal
ifeq ($(BUILD_TYPE),)
BUILD_TYPE=metal
# disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
endif
endif

ifeq ($(BUILD_TYPE),openblas)
CGO_LDFLAGS+=-lopenblas
export WHISPER_OPENBLAS=1
endif

ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1
export WHISPER_CUBLAS=1
endif

ifeq ($(BUILD_TYPE),hipblas)
ROCM_HOME ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# Llama-stable has no hipblas support, so override it here.
# llama-ggml has no hipblas support, so override it here.
export STABLE_BUILD_TYPE=
export WHISPER_HIPBLAS=1
GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
@@ -100,10 +111,12 @@ endif
ifeq ($(BUILD_TYPE),metal)
CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
export LLAMA_METAL=1
export WHISPER_METAL=1
endif

ifeq ($(BUILD_TYPE),clblas)
CGO_LDFLAGS+=-lOpenCL -lclblast
export WHISPER_CLBLAST=1
endif

# glibc-static or glibc-devel-static required
@@ -119,12 +132,12 @@ endif
ifeq ($(findstring tts,$(GO_TAGS)),tts)
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/go-piper/piper/src/cpp -I$(shell pwd)/go-piper/piper/build/fi/include -I$(shell pwd)/go-piper/piper/build/pi/include -I$(shell pwd)/go-piper/piper/build/si/include
PIPER_CGO_LDFLAGS+=-L$(shell pwd)/go-piper/piper/build/fi/lib -L$(shell pwd)/go-piper/piper/build/pi/lib -L$(shell pwd)/go-piper/piper/build/si/lib -lfmt -lspdlog
PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/sources/go-piper/piper/src/cpp -I$(shell pwd)/sources/go-piper/piper/build/fi/include -I$(shell pwd)/sources/go-piper/piper/build/pi/include -I$(shell pwd)/sources/go-piper/piper/build/si/include
PIPER_CGO_LDFLAGS+=-L$(shell pwd)/sources/go-piper/piper/build/fi/lib -L$(shell pwd)/sources/go-piper/piper/build/pi/lib -L$(shell pwd)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
OPTIONAL_GRPC+=backend-assets/grpc/piper
endif

ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-ggml backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)

# If empty, then we build all
@@ -137,112 +150,117 @@ endif
all: help

## GPT4ALL
gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
sources/gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1

## go-piper
go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
cd go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
sources/go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1

## BERT embeddings
go-bert:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp go-bert
cd go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
sources/go-bert:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1

## stable diffusion
go-stable-diffusion:
git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion go-stable-diffusion
cd go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
sources/go-stable-diffusion:
git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1

go-stable-diffusion/libstablediffusion.a:
$(MAKE) -C go-stable-diffusion libstablediffusion.a
sources/go-stable-diffusion/libstablediffusion.a:
$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a

## RWKV
go-rwkv:
git clone --recurse-submodules $(RWKV_REPO) go-rwkv
cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
sources/go-rwkv:
git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1

go-rwkv/librwkv.a: go-rwkv
cd go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
sources/go-rwkv/librwkv.a: sources/go-rwkv
cd sources/go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..

go-bert/libgobert.a: go-bert
$(MAKE) -C go-bert libgobert.a
sources/go-bert/libgobert.a: sources/go-bert
$(MAKE) -C sources/go-bert libgobert.a

backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
backend-assets/gpt4all: sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
mkdir -p backend-assets/gpt4all
@cp gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

backend-assets/espeak-ng-data: go-piper
backend-assets/espeak-ng-data: sources/go-piper
mkdir -p backend-assets/espeak-ng-data
$(MAKE) -C go-piper piper.o
@cp -rf go-piper/piper/build/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
$(MAKE) -C sources/go-piper piper.o
@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data

gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a
sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

## CEREBRAS GPT
go-ggml-transformers:
git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp go-ggml-transformers
cd go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
sources/go-ggml-transformers:
git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp sources/go-ggml-transformers
cd sources/go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1

go-ggml-transformers/libtransformers.a: go-ggml-transformers
$(MAKE) -C go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a
sources/go-ggml-transformers/libtransformers.a: sources/go-ggml-transformers
$(MAKE) -C sources/go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a

whisper.cpp:
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
sources/whisper.cpp:
git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1

whisper.cpp/libwhisper.a: whisper.cpp
cd whisper.cpp && make libwhisper.a
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && make libwhisper.a

go-llama:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama
cd sources/go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

go-llama-stable:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama-ggml:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

go-llama/libbinding.a: go-llama
$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
sources/go-llama/libbinding.a: sources/go-llama
$(MAKE) -C sources/go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

go-llama-stable/libbinding.a: go-llama-stable
$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

go-piper/libpiper_binding.a: go-piper
$(MAKE) -C go-piper libpiper_binding.a example/main
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main

get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
backend/cpp/llama/llama.cpp:
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp

get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/go-ggml-transformers sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion
touch $@

replace:
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/sources/go-ggml-transformers
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/sources/go-rwkv
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(shell pwd)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/sources/go-bert
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/sources/go-piper

prepare-sources: get-sources replace
$(GOCMD) mod download
touch $@

## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C go-llama clean
$(MAKE) -C go-llama-stable clean
$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C go-ggml-transformers clean
$(MAKE) -C go-rwkv clean
$(MAKE) -C whisper.cpp clean
$(MAKE) -C go-stable-diffusion clean
$(MAKE) -C go-bert clean
$(MAKE) -C go-piper clean
$(MAKE) -C sources/go-llama clean
$(MAKE) -C sources/go-llama-ggml clean
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C sources/go-ggml-transformers clean
$(MAKE) -C sources/go-rwkv clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-stable-diffusion clean
$(MAKE) -C sources/go-bert clean
$(MAKE) -C sources/go-piper clean
$(MAKE) build

prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -251,17 +269,7 @@ prepare: prepare-sources $(OPTIONAL_TARGETS)
clean: ## Remove build related file
$(GOCMD) clean -cache
rm -f prepare
rm -rf ./go-llama
rm -rf ./gpt4all
rm -rf ./go-llama-stable
rm -rf ./go-gpt2
rm -rf ./go-stable-diffusion
rm -rf ./go-ggml-transformers
rm -rf ./backend-assets
rm -rf ./go-rwkv
rm -rf ./go-bert
rm -rf ./whisper.cpp
rm -rf ./go-piper
rm -rf ./sources
rm -rf $(BINARY_NAME)
rm -rf release/
rm -rf ./backend/cpp/grpc/grpc_repo
@@ -283,6 +291,9 @@ dist: build
mkdir -p release
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)

osx-signed: build
codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"

## Run
run: prepare ## run local-ai
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
@@ -306,7 +317,7 @@ test: prepare test-models/testmodel grpcs
@echo 'Running tests'
export GO_TAGS="tts stablediffusion"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 --fail-fast -v -r ./api ./pkg
$(MAKE) test-gpt4all
$(MAKE) test-llama
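The relocation of the vendored bindings under sources/ changes where the checkouts land but not how a build is driven; a minimal sketch of the usual flow with the new layout (the GO_TAGS value is just an example):

# Illustrative: fetch the pinned sources into ./sources and build with TTS and stable diffusion enabled.
make GO_TAGS="tts stablediffusion" build
# The bindings are now checked out under ./sources/, e.g. ./sources/go-piper and ./sources/whisper.cpp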
@@ -373,40 +384,55 @@ help: ## Show this help.
protogen: protogen-go protogen-python

protogen-go:
protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative \
pkg/grpc/proto/backend.proto
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto

protogen-python:
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/autogptq/ --grpc_python_out=extra/grpc/autogptq/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto

## GRPC
# Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments:
$(MAKE) -C extra/grpc/autogptq
$(MAKE) -C extra/grpc/bark
$(MAKE) -C extra/grpc/diffusers
$(MAKE) -C extra/grpc/vllm
$(MAKE) -C extra/grpc/huggingface
$(MAKE) -C extra/grpc/vall-e-x
$(MAKE) -C extra/grpc/exllama
$(MAKE) -C backend/python/autogptq
$(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/vllm
$(MAKE) -C backend/python/sentencetransformers
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/transformers-musicgen
$(MAKE) -C backend/python/vall-e-x
$(MAKE) -C backend/python/exllama
$(MAKE) -C backend/python/petals
$(MAKE) -C backend/python/exllama2

prepare-test-extra:
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/diffusers
|
||||
|
||||
test-extra: prepare-test-extra
|
||||
$(MAKE) -C backend/python/transformers test
|
||||
$(MAKE) -C backend/python/diffusers test
|
||||
|
||||
backend-assets/grpc:
|
||||
mkdir -p backend-assets/grpc
|
||||
|
||||
backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
|
||||
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
|
||||
# TODO: every binary should have its own folder instead, so can have different metal implementations
|
||||
backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
|
||||
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/sources/go-llama
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-llama LIBRARY_PATH=$(shell pwd)/sources/go-llama \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
|
||||
# TODO: every binary should have its own folder instead, so can have different implementations
|
||||
ifeq ($(BUILD_TYPE),metal)
|
||||
cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
|
||||
cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
|
||||
endif
|
||||
|
||||
## BACKEND CPP LLAMA START
|
||||
@@ -425,7 +451,7 @@ ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
|
||||
export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
|
||||
export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
|
||||
export PATH=${PATH}:${INSTALLED_PACKAGES}/bin && \
|
||||
CMAKE_ARGS="${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
|
||||
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
|
||||
else
|
||||
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
|
||||
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
|
||||
@@ -440,71 +466,71 @@ ifeq ($(BUILD_TYPE),metal)
|
||||
cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
|
||||
endif
|
||||
|
||||
backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
|
||||
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/
|
||||
backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
|
||||
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/sources/go-llama-ggml
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-llama-ggml LIBRARY_PATH=$(shell pwd)/sources/go-llama-ggml \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
|
||||
|
||||
backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
|
||||
backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
|
||||
|
||||
backend-assets/grpc/dolly: backend-assets/grpc go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./cmd/grpc/dolly/
|
||||
backend-assets/grpc/dolly: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./backend/go/llm/dolly/
|
||||
|
||||
backend-assets/grpc/gpt2: backend-assets/grpc go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./cmd/grpc/gpt2/
|
||||
backend-assets/grpc/gpt2: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./backend/go/llm/gpt2/
|
||||
|
||||
backend-assets/grpc/gptj: backend-assets/grpc go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./cmd/grpc/gptj/
|
||||
backend-assets/grpc/gptj: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./backend/go/llm/gptj/
|
||||
|
||||
backend-assets/grpc/gptneox: backend-assets/grpc go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./cmd/grpc/gptneox/
|
||||
backend-assets/grpc/gptneox: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./backend/go/llm/gptneox/
|
||||
|
||||
backend-assets/grpc/mpt: backend-assets/grpc go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./cmd/grpc/mpt/
|
||||
backend-assets/grpc/mpt: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./backend/go/llm/mpt/
|
||||
|
||||
backend-assets/grpc/replit: backend-assets/grpc go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./cmd/grpc/replit/
|
||||
backend-assets/grpc/replit: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./backend/go/llm/replit/
|
||||
|
||||
backend-assets/grpc/falcon-ggml: backend-assets/grpc go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./cmd/grpc/falcon-ggml/
|
||||
backend-assets/grpc/falcon-ggml: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./backend/go/llm/falcon-ggml/
|
||||
|
||||
backend-assets/grpc/starcoder: backend-assets/grpc go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./cmd/grpc/starcoder/
|
||||
backend-assets/grpc/starcoder: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./backend/go/llm/starcoder/
|
||||
|
||||
backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/
|
||||
backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-rwkv LIBRARY_PATH=$(shell pwd)/sources/go-rwkv \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
|
||||
|
||||
backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/
|
||||
backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-bert LIBRARY_PATH=$(shell pwd)/sources/go-bert \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
|
||||
|
||||
backend-assets/grpc/langchain-huggingface: backend-assets/grpc
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
|
||||
|
||||
backend-assets/grpc/stablediffusion: backend-assets/grpc
|
||||
if [ ! -f backend-assets/grpc/stablediffusion ]; then \
|
||||
$(MAKE) go-stable-diffusion/libstablediffusion.a; \
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
|
||||
$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/sources/go-stable-diffusion/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/; \
|
||||
fi
|
||||
|
||||
backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
|
||||
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/
|
||||
backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
|
||||
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/sources/go-piper \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
|
||||
|
||||
backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/
|
||||
backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/whisper.cpp LIBRARY_PATH=$(shell pwd)/sources/whisper.cpp \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
|
||||
|
||||
grpcs: prepare $(GRPC_BACKENDS)
|
||||
|
||||
85
README.md
85
README.md
@@ -21,15 +21,14 @@
|
||||
</p>
|
||||
|
||||
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
|
||||
>
|
||||
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/)
|
||||
|
||||
>
|
||||
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
|
||||
|
||||
[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)
|
||||
|
||||
**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format, pytorch and more. Does not require GPU.
|
||||
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
|
||||
|
||||
<p align="center"><b>Follow LocalAI </b></p>
|
||||
<p align="center"><b>Follow LocalAI </b></p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://twitter.com/LocalAI_API" target="blank">
|
||||
@@ -39,7 +38,7 @@
|
||||
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
|
||||
</a>
|
||||
|
||||
<p align="center"><b>Connect with the Creator </b></p>
|
||||
<p align="center"><b>Connect with the Creator </b></p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://twitter.com/mudler_it" target="blank">
|
||||
@@ -50,12 +49,12 @@
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center"><b>Share LocalAI Repository</b></p>
|
||||
<p align="center"><b>Share LocalAI Repository</b></p>
|
||||
|
||||
<p align="center">
|
||||
|
||||
<a href="https://twitter.com/intent/tweet?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI&hashtags=LocalAI,AI" target="blank">
|
||||
<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a>
|
||||
<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a>
|
||||
<a href="https://t.me/share/url?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Telegram&logo=Telegram&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Telegram"/></a>
|
||||
<a href="https://api.whatsapp.com/send?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%20https://github.com/go-skynet/LocalAI"><img src="https://img.shields.io/twitter/url?label=whatsapp&logo=whatsapp&style=social&url=https://github.com/go-skynet/LocalAI" /></a> <a href="https://www.reddit.com/submit?url=https://github.com/go-skynet/LocalAI&title=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.
|
||||
" target="blank">
|
||||
@@ -64,23 +63,37 @@
|
||||
|
||||
</p>
|
||||
|
||||
## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)
|
||||
|
||||
## 🔥🔥 Hot topics / Roadmap
|
||||
|
||||
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
|
||||
|
||||
🆕 New! [LLM finetuning guide](https://localai.io/advanced/fine-tuning/)
|
||||
|
||||
Hot topics (looking for contributors):
|
||||
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
|
||||
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
|
||||
|
||||
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
|
||||
|
||||
|
||||
|
||||
<hr>
|
||||
|
||||
In a nutshell:
|
||||
|
||||
- Local, OpenAI drop-in alternative REST API. You own your data.
|
||||
- NO GPU required. NO Internet access is required either
|
||||
- Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
|
||||
- Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
|
||||
- Supports multiple models
|
||||
- 🏃 Once loaded the first time, it keep models loaded in memory for faster inference
|
||||
- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance.
|
||||
|
||||
LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!
|
||||
LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!
|
||||
|
||||
Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
|
||||
|
||||
## 🔥🔥 [Hot topics / Roadmap](https://localai.io/#-hot-topics--roadmap)
|
||||
|
||||
## 🚀 [Features](https://localai.io/features/)
|
||||
|
||||
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
|
||||
@@ -91,7 +104,34 @@ Note that this started just as a [fun weekend project](https://localai.io/#backs
|
||||
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
|
||||
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
|
||||
- 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
|
||||
- 🆕 [Vision API](https://localai.io/features/gpt-vision/)
|
||||
|
||||
## 💻 Usage
|
||||
|
||||
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
|
||||
|
||||
### 🔗 Community and integrations
|
||||
|
||||
Build and deploy custom containers:
|
||||
- https://github.com/sozercan/aikit
|
||||
|
||||
WebUIs:
|
||||
- https://github.com/Jirubizu/localai-admin
|
||||
- https://github.com/go-skynet/LocalAI-frontend
|
||||
|
||||
Model galleries
|
||||
- https://github.com/go-skynet/model-gallery
|
||||
|
||||
Other:
|
||||
- Helm chart https://github.com/go-skynet/helm-charts
|
||||
|
||||
### 🔗 Resources
|
||||
|
||||
- 🆕 New! [LLM finetuning guide](https://localai.io/advanced/fine-tuning/)
|
||||
- [How to build locally](https://localai.io/basics/build/index.html)
|
||||
- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
|
||||
- [Projects integrating LocalAI](https://localai.io/integrations/)
|
||||
- [How tos section](https://localai.io/howtos/) (curated by our community)
|
||||
|
||||
## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)
|
||||
|
||||
@@ -100,21 +140,6 @@ Note that this started just as a [fun weekend project](https://localai.io/#backs
|
||||
- [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
|
||||
- [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)
|
||||
|
||||
## 💻 Usage
|
||||
|
||||
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
|
||||
|
||||
### 💡 Example: Use Luna-AI Llama model
|
||||
|
||||
See the [documentation](https://localai.io/basics/getting_started)
|
||||
|
||||
### 🔗 Resources
|
||||
|
||||
- [How to build locally](https://localai.io/basics/build/index.html)
|
||||
- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
|
||||
- [Projects integrating LocalAI](https://localai.io/integrations/)
|
||||
- [How tos section](https://localai.io/howtos/) (curated by our community)
|
||||
|
||||
## Citation
|
||||
|
||||
If you utilize this repository, data in a downstream project, please consider citing it with:
|
||||
@@ -137,12 +162,12 @@ Support the project by becoming [a backer or sponsor](https://github.com/sponsor
|
||||
|
||||
A huge thank you to our generous sponsors who support this project:
|
||||
|
||||
|  |
|
||||
|  |
|
||||
|:-----------------------------------------------:|
|
||||
| [Spectro Cloud](https://www.spectrocloud.com/) |
|
||||
| [Spectro Cloud](https://www.spectrocloud.com/) |
|
||||
| Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs! |
|
||||
|
||||
And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.
|
||||
And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.
|
||||
|
||||
- [Sponsor list](https://github.com/sponsors/mudler)
|
||||
- JDAM00 (donating HW for the CI)
|
||||
|
||||
73
api/api.go
73
api/api.go
@@ -1,8 +1,10 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
@@ -13,6 +15,7 @@ import (
|
||||
"github.com/go-skynet/LocalAI/internal"
|
||||
"github.com/go-skynet/LocalAI/metrics"
|
||||
"github.com/go-skynet/LocalAI/pkg/assets"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/gofiber/fiber/v2/middleware/cors"
|
||||
@@ -79,6 +82,22 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
|
||||
options.Loader.StopAllGRPC()
|
||||
}()
|
||||
|
||||
if options.WatchDog {
|
||||
wd := model.NewWatchDog(
|
||||
options.Loader,
|
||||
options.WatchDogBusyTimeout,
|
||||
options.WatchDogIdleTimeout,
|
||||
options.WatchDogBusy,
|
||||
options.WatchDogIdle)
|
||||
options.Loader.SetWatchDog(wd)
|
||||
go wd.Run()
|
||||
go func() {
|
||||
<-options.Context.Done()
|
||||
log.Debug().Msgf("Context canceled, shutting down")
|
||||
wd.Shutdown()
|
||||
}()
|
||||
}
|
||||
|
||||
return options, cl, nil
|
||||
}
|
||||
|
||||
@@ -127,28 +146,46 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
|
||||
|
||||
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
|
||||
auth := func(c *fiber.Ctx) error {
|
||||
if len(options.ApiKeys) > 0 {
|
||||
authHeader := c.Get("Authorization")
|
||||
if authHeader == "" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
|
||||
}
|
||||
authHeaderParts := strings.Split(authHeader, " ")
|
||||
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
|
||||
if len(options.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
// Check for api_keys.json file
|
||||
fileContent, err := os.ReadFile("api_keys.json")
|
||||
if err == nil {
|
||||
// Parse JSON content from the file
|
||||
var fileKeys []string
|
||||
err := json.Unmarshal(fileContent, &fileKeys)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusInternalServerError).JSON(fiber.Map{"message": "Error parsing api_keys.json"})
|
||||
}
|
||||
|
||||
apiKey := authHeaderParts[1]
|
||||
validApiKey := false
|
||||
for _, key := range options.ApiKeys {
|
||||
if apiKey == key {
|
||||
validApiKey = true
|
||||
}
|
||||
}
|
||||
if !validApiKey {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
|
||||
// Add file keys to options.ApiKeys
|
||||
options.ApiKeys = append(options.ApiKeys, fileKeys...)
|
||||
}
|
||||
|
||||
if len(options.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
authHeader := c.Get("Authorization")
|
||||
if authHeader == "" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
|
||||
}
|
||||
authHeaderParts := strings.Split(authHeader, " ")
|
||||
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
|
||||
}
|
||||
|
||||
apiKey := authHeaderParts[1]
|
||||
for _, key := range options.ApiKeys {
|
||||
if apiKey == key {
|
||||
return c.Next()
|
||||
}
|
||||
}
|
||||
return c.Next()
|
||||
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
|
||||
|
||||
}
|
||||
|
||||
if options.CORS {
|
||||
|
||||
@@ -301,7 +301,7 @@ var _ = Describe("API test", func() {
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
|
||||
Name: "openllama_3b",
|
||||
Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
|
||||
Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
@@ -704,7 +704,7 @@ var _ = Describe("API test", func() {
|
||||
})
|
||||
|
||||
Context("External gRPC calls", func() {
|
||||
It("calculate embeddings with huggingface", func() {
|
||||
It("calculate embeddings with sentencetransformers", func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
|
||||
model.WithContext(o.Context),
|
||||
model.WithModel(c.Model),
|
||||
model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
|
||||
CUDA: c.Diffusers.CUDA,
|
||||
CUDA: c.CUDA || c.Diffusers.CUDA,
|
||||
SchedulerType: c.Diffusers.SchedulerType,
|
||||
PipelineType: c.Diffusers.PipelineType,
|
||||
CFGScale: c.Diffusers.CFGScale,
|
||||
@@ -27,6 +27,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
|
||||
CLIPModel: c.Diffusers.ClipModel,
|
||||
CLIPSubfolder: c.Diffusers.ClipSubFolder,
|
||||
CLIPSkip: int32(c.Diffusers.ClipSkip),
|
||||
ControlNet: c.Diffusers.ControlNet,
|
||||
}),
|
||||
})
|
||||
|
||||
|
||||
@@ -16,6 +16,10 @@ func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.
|
||||
opts = append(opts, model.WithSingleActiveBackend())
|
||||
}
|
||||
|
||||
if o.ParallelBackendRequests {
|
||||
opts = append(opts, model.EnableParallelRequests)
|
||||
}
|
||||
|
||||
if c.GRPC.Attempts != 0 {
|
||||
opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
|
||||
}
|
||||
@@ -42,6 +46,7 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
|
||||
Seed: int32(c.Seed),
|
||||
NBatch: int32(b),
|
||||
NoMulMatQ: c.NoMulMatQ,
|
||||
CUDA: c.CUDA, // diffusers, transformers
|
||||
DraftModel: c.DraftModel,
|
||||
AudioPath: c.VallE.AudioPath,
|
||||
Quantization: c.Quantization,
|
||||
|
||||
@@ -59,9 +59,13 @@ func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *opt
|
||||
// If the model file is not empty, we pass it joined with the model path
|
||||
modelPath := ""
|
||||
if modelFile != "" {
|
||||
modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
|
||||
if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
|
||||
return "", nil, err
|
||||
if bb != model.TransformersMusicGen {
|
||||
modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
|
||||
if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
} else {
|
||||
modelPath = modelFile
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -38,14 +38,17 @@ type Config struct {
|
||||
|
||||
// Diffusers
|
||||
Diffusers Diffusers `yaml:"diffusers"`
|
||||
|
||||
Step int `yaml:"step"`
|
||||
Step int `yaml:"step"`
|
||||
|
||||
// GRPC Options
|
||||
GRPC GRPC `yaml:"grpc"`
|
||||
|
||||
// Vall-e-x
|
||||
VallE VallE `yaml:"vall-e"`
|
||||
|
||||
// CUDA
|
||||
// Explicitly enable CUDA or not (some backends might need it)
|
||||
CUDA bool `yaml:"cuda"`
|
||||
}
|
||||
|
||||
type VallE struct {
|
||||
@@ -65,15 +68,16 @@ type GRPC struct {
|
||||
}
|
||||
|
||||
type Diffusers struct {
|
||||
CUDA bool `yaml:"cuda"`
|
||||
PipelineType string `yaml:"pipeline_type"`
|
||||
SchedulerType string `yaml:"scheduler_type"`
|
||||
CUDA bool `yaml:"cuda"`
|
||||
EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify
|
||||
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
|
||||
IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser
|
||||
ClipSkip int `yaml:"clip_skip"` // Skip every N frames
|
||||
ClipModel string `yaml:"clip_model"` // Clip model to use
|
||||
ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model
|
||||
ControlNet string `yaml:"control_net"`
|
||||
}
|
||||
|
||||
type LLMConfig struct {
|
||||
@@ -277,7 +281,7 @@ func (cm *ConfigLoader) LoadConfigs(path string) error {
|
||||
}
|
||||
for _, file := range files {
|
||||
// Skip templates, YAML and .keep files
|
||||
if !strings.Contains(file.Name(), ".yaml") {
|
||||
if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
|
||||
continue
|
||||
}
|
||||
c, err := ReadConfig(filepath.Join(path, file.Name()))
|
||||
|
||||
@@ -123,13 +123,12 @@ func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
|
||||
return err
|
||||
}
|
||||
|
||||
client := bm.options.Loader.CheckIsLoaded(backendId)
|
||||
|
||||
if client == nil {
|
||||
model := bm.options.Loader.CheckIsLoaded(backendId)
|
||||
if model == "" {
|
||||
return fmt.Errorf("backend %s is not currently loaded", backendId)
|
||||
}
|
||||
|
||||
status, rpcErr := client.Status(context.TODO())
|
||||
status, rpcErr := model.GRPC(false, nil).Status(context.TODO())
|
||||
if rpcErr != nil {
|
||||
log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
|
||||
val, slbErr := bm.SampleLocalBackendProcess(backendId)
|
||||
|
||||
@@ -81,7 +81,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
|
||||
noActionDescription = config.FunctionsConfig.NoActionDescriptionName
|
||||
}
|
||||
|
||||
if input.ResponseFormat == "json_object" {
|
||||
if input.ResponseFormat.Type == "json_object" {
|
||||
input.Grammar = grammar.JSONBNF
|
||||
}
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
if input.ResponseFormat == "json_object" {
|
||||
if input.ResponseFormat.Type == "json_object" {
|
||||
input.Grammar = grammar.JSONBNF
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,8 @@ import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
@@ -22,6 +24,26 @@ import (
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func downloadFile(url string) (string, error) {
|
||||
// Get the data
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Create the file
|
||||
out, err := os.CreateTemp("", "image")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
// Write the body to file
|
||||
_, err = io.Copy(out, resp.Body)
|
||||
return out.Name(), err
|
||||
}
|
||||
|
||||
// https://platform.openai.com/docs/api-reference/images/create
|
||||
|
||||
/*
|
||||
@@ -56,12 +78,31 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
|
||||
|
||||
src := ""
|
||||
if input.File != "" {
|
||||
//base 64 decode the file and write it somewhere
|
||||
// that we will cleanup
|
||||
decoded, err := base64.StdEncoding.DecodeString(input.File)
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
fileData := []byte{}
|
||||
// check if input.File is an URL, if so download it and save it
|
||||
// to a temporary file
|
||||
if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") {
|
||||
out, err := downloadFile(input.File)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed downloading file:%w", err)
|
||||
}
|
||||
defer os.RemoveAll(out)
|
||||
|
||||
fileData, err = os.ReadFile(out)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading file:%w", err)
|
||||
}
|
||||
|
||||
} else {
|
||||
// base 64 decode the file and write it somewhere
|
||||
// that we will cleanup
|
||||
fileData, err = base64.StdEncoding.DecodeString(input.File)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(o.ImageDir, "b64")
|
||||
if err != nil {
|
||||
@@ -69,7 +110,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
|
||||
}
|
||||
// write the base64 result
|
||||
writer := bufio.NewWriter(outputFile)
|
||||
_, err = writer.Write(decoded)
|
||||
_, err = writer.Write(fileData)
|
||||
if err != nil {
|
||||
outputFile.Close()
|
||||
return err
|
||||
@@ -100,7 +141,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
|
||||
}
|
||||
|
||||
b64JSON := false
|
||||
if input.ResponseFormat == "b64_json" {
|
||||
if input.ResponseFormat.Type == "b64_json" {
|
||||
b64JSON = true
|
||||
}
|
||||
// src and clip_skip
|
||||
|
||||
@@ -4,10 +4,11 @@ import (
|
||||
"context"
|
||||
"embed"
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/metrics"
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/metrics"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
@@ -36,7 +37,13 @@ type Option struct {
|
||||
|
||||
AutoloadGalleries bool
|
||||
|
||||
SingleBackend bool
|
||||
SingleBackend bool
|
||||
ParallelBackendRequests bool
|
||||
|
||||
WatchDogIdle bool
|
||||
WatchDogBusy bool
|
||||
WatchDog bool
|
||||
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
|
||||
}
|
||||
|
||||
type AppOption func(*Option)
|
||||
@@ -62,10 +69,40 @@ func WithCors(b bool) AppOption {
|
||||
}
|
||||
}
|
||||
|
||||
var EnableWatchDog = func(o *Option) {
|
||||
o.WatchDog = true
|
||||
}
|
||||
|
||||
var EnableWatchDogIdleCheck = func(o *Option) {
|
||||
o.WatchDog = true
|
||||
o.WatchDogIdle = true
|
||||
}
|
||||
|
||||
var EnableWatchDogBusyCheck = func(o *Option) {
|
||||
o.WatchDog = true
|
||||
o.WatchDogBusy = true
|
||||
}
|
||||
|
||||
func SetWatchDogBusyTimeout(t time.Duration) AppOption {
|
||||
return func(o *Option) {
|
||||
o.WatchDogBusyTimeout = t
|
||||
}
|
||||
}
|
||||
|
||||
func SetWatchDogIdleTimeout(t time.Duration) AppOption {
|
||||
return func(o *Option) {
|
||||
o.WatchDogIdleTimeout = t
|
||||
}
|
||||
}
|
||||
|
||||
var EnableSingleBackend = func(o *Option) {
|
||||
o.SingleBackend = true
|
||||
}
|
||||
|
||||
var EnableParallelBackendRequests = func(o *Option) {
|
||||
o.ParallelBackendRequests = true
|
||||
}
|
||||
|
||||
var EnableGalleriesAutoload = func(o *Option) {
|
||||
o.AutoloadGalleries = true
|
||||
}
|
||||
|
||||
@@ -83,6 +83,12 @@ type OpenAIModel struct {
|
||||
Object string `json:"object"`
|
||||
}
|
||||
|
||||
type ChatCompletionResponseFormatType string
|
||||
|
||||
type ChatCompletionResponseFormat struct {
|
||||
Type ChatCompletionResponseFormatType `json:"type,omitempty"`
|
||||
}
|
||||
|
||||
type OpenAIRequest struct {
|
||||
config.PredictionOptions
|
||||
|
||||
@@ -92,7 +98,7 @@ type OpenAIRequest struct {
|
||||
// whisper
|
||||
File string `json:"file" validate:"required"`
|
||||
//whisper/image
|
||||
ResponseFormat string `json:"response_format"`
|
||||
ResponseFormat ChatCompletionResponseFormat `json:"response_format"`
|
||||
// image
|
||||
Size string `json:"size"`
|
||||
// Prompt is read only by completion/image API calls
|
||||
|
||||
@@ -110,6 +110,7 @@ message ModelOptions {
|
||||
string CLIPModel = 31;
|
||||
string CLIPSubfolder = 32;
|
||||
int32 CLIPSkip = 33;
|
||||
string ControlNet = 48;
|
||||
|
||||
// RWKV
|
||||
string Tokenizer = 34;
|
||||
@@ -36,7 +36,7 @@ include_directories(${Protobuf_INCLUDE_DIRS})
|
||||
message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
|
||||
|
||||
# Proto file
|
||||
get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
|
||||
get_filename_component(hw_proto "../../../../../../backend/backend.proto" ABSOLUTE)
|
||||
get_filename_component(hw_proto_path "${hw_proto}" PATH)
|
||||
|
||||
# Generated sources
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
LLAMA_VERSION?=d9b33fe95bd257b36c84ee5769cc048230067d6f
|
||||
LLAMA_VERSION?=
|
||||
|
||||
CMAKE_ARGS?=
|
||||
BUILD_TYPE?=
|
||||
@@ -21,6 +21,9 @@ endif
|
||||
|
||||
llama.cpp:
|
||||
git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
|
||||
if [ -z "$(LLAMA_VERSION)" ]; then \
|
||||
exit 1; \
|
||||
fi
|
||||
cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
|
||||
|
||||
llama.cpp/examples/grpc-server:
|
||||
|
||||
@@ -40,9 +40,18 @@ using backend::HealthMessage;
|
||||
|
||||
|
||||
///// LLAMA.CPP server code below
|
||||
|
||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||
using json = nlohmann::json;
|
||||
|
||||
struct server_params
|
||||
{
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "examples/server/public";
|
||||
int32_t port = 8080;
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
};
|
||||
|
||||
static bool server_verbose = false;
|
||||
|
||||
#if SERVER_VERBOSE != 1
|
||||
@@ -62,6 +71,10 @@ static bool server_verbose = false;
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
json oaicompat_completion_params_parse(const json &body);
|
||||
std::string format_chatml(std::vector<json> messages);
|
||||
|
||||
|
||||
//
|
||||
// base64 utils (TODO: move to common in the future)
|
||||
//
|
||||
@@ -152,15 +165,23 @@ struct task_server {
|
||||
json data;
|
||||
bool infill_mode = false;
|
||||
bool embedding_mode = false;
|
||||
int multitask_id = -1;
|
||||
};
|
||||
|
||||
struct task_result {
|
||||
int id;
|
||||
int multitask_id = -1;
|
||||
bool stop;
|
||||
bool error;
|
||||
json result_json;
|
||||
};
|
||||
|
||||
struct task_multi {
|
||||
int id;
|
||||
std::set<int> subtasks_remaining{};
|
||||
std::vector<task_result> results{};
|
||||
};
|
||||
|
||||
// TODO: can become bool if we can't find use of more states
|
||||
enum slot_state
|
||||
{
|
||||
@@ -365,7 +386,6 @@ struct llama_client_slot
|
||||
|
||||
int32_t num_prompt_tokens = 0;
|
||||
int32_t num_prompt_tokens_processed = 0;
|
||||
int32_t multibyte_pending = 0;
|
||||
|
||||
json prompt;
|
||||
std::string generated_text;
|
||||
@@ -381,6 +401,9 @@ struct llama_client_slot
|
||||
bool stopped_word = false;
|
||||
bool stopped_limit = false;
|
||||
|
||||
bool oaicompat = false;
|
||||
std::string oaicompat_model;
|
||||
|
||||
std::string stopping_word;
|
||||
|
||||
// sampling
|
||||
@@ -400,6 +423,9 @@ struct llama_client_slot
|
||||
double t_prompt_processing; // ms
|
||||
double t_token_generation; // ms
|
||||
|
||||
// multitasks
|
||||
int multitask_id = -1;
|
||||
|
||||
void reset() {
|
||||
num_prompt_tokens = 0;
|
||||
generated_text = "";
|
||||
@@ -408,7 +434,6 @@ struct llama_client_slot
|
||||
stopped_word = false;
|
||||
stopped_limit = false;
|
||||
stopping_word = "";
|
||||
multibyte_pending = 0;
|
||||
n_past = 0;
|
||||
sent_count = 0;
|
||||
sent_token_probs_index = 0;
|
||||
@@ -480,7 +505,7 @@ struct llama_client_slot
|
||||
};
|
||||
}
|
||||
|
||||
void print_timings() {
|
||||
void print_timings() const {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
|
||||
@@ -504,6 +529,7 @@ struct llama_server_context
|
||||
bool multimodal = false;
|
||||
bool clean_kv_cache = true;
|
||||
bool all_slots_are_idle = false;
|
||||
bool add_bos_token = true;
|
||||
|
||||
int32_t id_gen;
|
||||
int32_t n_ctx; // total context for all clients / slots
|
||||
@@ -522,7 +548,8 @@ struct llama_server_context
|
||||
|
||||
std::vector<task_server> queue_tasks;
|
||||
std::vector<task_result> queue_results;
|
||||
std::mutex mutex_tasks;
|
||||
std::vector<task_multi> queue_multitasks;
|
||||
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
|
||||
std::mutex mutex_results;
|
||||
|
||||
~llama_server_context()
|
||||
@@ -576,6 +603,8 @@ struct llama_server_context
|
||||
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_should_add_bos_token(model);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -609,6 +638,11 @@ struct llama_server_context
|
||||
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
|
||||
{
|
||||
// TODO: currently, we tokenize using special tokens by default
|
||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
||||
const bool TMP_FORCE_SPECIAL = true;
|
||||
|
||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||
// or the first element of the json_prompt array is a string.
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
@@ -624,12 +658,12 @@ struct llama_server_context
|
||||
std::vector<llama_token> p;
|
||||
if (first)
|
||||
{
|
||||
p = ::llama_tokenize(ctx, s, add_bos);
|
||||
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
p = ::llama_tokenize(ctx, s, false);
|
||||
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||
}
|
||||
@@ -646,7 +680,7 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
@@ -677,11 +711,20 @@ struct llama_server_context
|
||||
slot_params default_params;
|
||||
llama_sampling_params default_sparams;
|
||||
|
||||
if (data.count("__oaicompat") != 0) {
|
||||
slot->oaicompat = true;
|
||||
slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||
} else {
|
||||
slot->oaicompat = false;
|
||||
slot->oaicompat_model = "";
|
||||
}
|
||||
|
||||
slot->params.stream = json_value(data, "stream", false);
|
||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
@@ -866,7 +909,7 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
void update_system_prompt() {
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
@@ -957,35 +1000,36 @@ struct llama_server_context
|
||||
slot.generated_text += token_str;
|
||||
slot.has_next_token = true;
|
||||
|
||||
if (slot.multibyte_pending > 0)
|
||||
// check if there is incomplete UTF-8 character at the end
|
||||
bool incomplete = false;
|
||||
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
|
||||
{
|
||||
slot.multibyte_pending -= token_str.size();
|
||||
}
|
||||
else if (token_str.size() == 1)
|
||||
{
|
||||
const char c = token_str[0];
|
||||
// 2-byte characters: 110xxxxx 10xxxxxx
|
||||
unsigned char c = slot.generated_text[slot.generated_text.size() - i];
|
||||
if ((c & 0xC0) == 0x80)
|
||||
{
|
||||
// continuation byte: 10xxxxxx
|
||||
continue;
|
||||
}
|
||||
if ((c & 0xE0) == 0xC0)
|
||||
{
|
||||
slot.multibyte_pending = 1;
|
||||
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
|
||||
// 2-byte character: 110xxxxx ...
|
||||
incomplete = i < 2;
|
||||
}
|
||||
else if ((c & 0xF0) == 0xE0)
|
||||
{
|
||||
slot.multibyte_pending = 2;
|
||||
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
// 3-byte character: 1110xxxx ...
|
||||
incomplete = i < 3;
|
||||
}
|
||||
else if ((c & 0xF8) == 0xF0)
|
||||
{
|
||||
slot.multibyte_pending = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
slot.multibyte_pending = 0;
|
||||
// 4-byte character: 11110xxx ...
|
||||
incomplete = i < 4;
|
||||
}
|
||||
// else 1-byte character or invalid byte
|
||||
break;
|
||||
}
|
||||
|
||||
if (slot.multibyte_pending == 0)
|
||||
if (!incomplete)
|
||||
{
|
||||
size_t pos = std::min(slot.sent_count, slot.generated_text.size());
|
||||
const std::string str_test = slot.generated_text.substr(pos);
|
||||
@@ -1020,7 +1064,7 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
if (slot.multibyte_pending > 0 && !slot.has_next_token)
|
||||
if (incomplete)
|
||||
{
|
||||
slot.has_next_token = true;
|
||||
}
|
||||
@@ -1089,16 +1133,40 @@ struct llama_server_context
|
||||
return slot.images.size() > 0;
|
||||
}
|
||||
|
||||
void send_error(int id, std::string error)
|
||||
void send_error(task_server& task, std::string error)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = id;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
res.stop = false;
|
||||
res.error = true;
|
||||
res.result_json = { { "content", error } };
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
void add_multi_task(int id, std::vector<int>& sub_ids)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
task_multi multi;
|
||||
multi.id = id;
|
||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||
queue_multitasks.push_back(multi);
|
||||
}
|
||||
|
||||
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
for (auto& multitask : queue_multitasks)
|
||||
{
|
||||
if (multitask.id == multitask_id)
|
||||
{
|
||||
multitask.subtasks_remaining.erase(subtask_id);
|
||||
multitask.results.push_back(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json get_model_props()
|
||||
{
|
||||
return get_formated_generation(slots[0]);
|
||||
@@ -1116,6 +1184,7 @@ struct llama_server_context
|
||||
{"temp", slot.sparams.temp},
|
||||
{"top_k", slot.sparams.top_k},
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
{"tfs_z", slot.sparams.tfs_z},
|
||||
{"typical_p", slot.sparams.typical_p},
|
||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||
@@ -1142,6 +1211,7 @@ struct llama_server_context
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
res.error = false;
|
||||
res.stop = false;
|
||||
|
||||
@@ -1167,6 +1237,12 @@ struct llama_server_context
|
||||
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
|
||||
}
|
||||
|
||||
if (slot.oaicompat)
|
||||
{
|
||||
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
|
||||
res.result_json["model"] = slot.oaicompat_model;
|
||||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
@@ -1175,6 +1251,7 @@ struct llama_server_context
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
|
||||
@@ -1214,6 +1291,18 @@ struct llama_server_context
|
||||
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
|
||||
}
|
||||
|
||||
if (slot.oaicompat)
|
||||
{
|
||||
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
|
||||
res.result_json["model"] = slot.oaicompat_model;
|
||||
}
|
||||
|
||||
// parent multitask, if any, needs to be updated
|
||||
if (slot.multitask_id != -1)
|
||||
{
|
||||
update_multi_task(slot.multitask_id, slot.task_id, res);
|
||||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
@@ -1222,6 +1311,7 @@ struct llama_server_context
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
|
||||
@@ -1248,15 +1338,26 @@ struct llama_server_context
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
int request_completion(json data, bool infill, bool embedding)
|
||||
int request_completion(json data, bool infill, bool embedding, int multitask_id)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
task_server task;
|
||||
task.id = id_gen++;
|
||||
task.data = data;
|
||||
task.target_id = 0;
|
||||
task.data = std::move(data);
|
||||
task.infill_mode = infill;
|
||||
task.embedding_mode = embedding;
|
||||
task.type = COMPLETION_TASK;
|
||||
task.multitask_id = multitask_id;
|
||||
|
||||
// when a completion task's prompt array is not a singleton, we split it into multiple requests
|
||||
if (task.data.at("prompt").size() > 1)
|
||||
{
|
||||
lock.unlock(); // entering new func scope
|
||||
return split_multiprompt_task(task);
|
||||
}
|
||||
|
||||
// otherwise, it's a single-prompt task, we actually queue it
|
||||
queue_tasks.push_back(task);
|
||||
return task.id;
|
||||
}
|
||||
@@ -1275,8 +1376,17 @@ struct llama_server_context
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||
if (queue_results[i].multitask_id == task_id)
|
||||
{
|
||||
update_multi_task(task_id, queue_results[i].id, queue_results[i]);
|
||||
queue_results.erase(queue_results.begin() + i);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (queue_results[i].id == task_id)
|
||||
{
|
||||
assert(queue_results[i].multitask_id == -1);
|
||||
task_result res = queue_results[i];
|
||||
queue_results.erase(queue_results.begin() + i);
|
||||
return res;
|
||||
@@ -1366,6 +1476,27 @@ struct llama_server_context
|
||||
queue_tasks.push_back(task);
|
||||
}
|
||||
|
||||
int split_multiprompt_task(task_server& multiprompt_task)
{
    int prompt_count = multiprompt_task.data.at("prompt").size();
    assert(prompt_count > 1);

    int multitask_id = id_gen++;
    std::vector<int> subtask_ids(prompt_count);
    for (int i = 0; i < prompt_count; i++)
    {
        json subtask_data = multiprompt_task.data;
        subtask_data["prompt"] = subtask_data["prompt"][i];

        // subtasks inherit everything else (infill mode, embedding mode, etc.)
        subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
    }

    // queue up the multitask so we can track its subtask progression
    add_multi_task(multitask_id, subtask_ids);
    return multitask_id;
}
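To see what the split above does end to end: a request whose "prompt" field is an array is turned into one queued subtask per prompt plus a parent multitask that tracks the subtask ids. A minimal Python sketch of the same bookkeeping (names and structures here are illustrative, not the backend's):

# Illustrative sketch only: mirrors the id/queue bookkeeping of the C++ above.
import itertools

_id_gen = itertools.count()

def request_completion(data, multitask_id, queue_tasks):
    task_id = next(_id_gen)
    queue_tasks.append({"id": task_id, "data": data, "multitask_id": multitask_id})
    return task_id

def split_multiprompt_task(data, queue_tasks, queue_multitasks):
    prompts = data["prompt"]
    assert len(prompts) > 1
    multitask_id = next(_id_gen)
    # one subtask per prompt; each points back at the parent multitask
    sub_ids = [request_completion({**data, "prompt": p}, multitask_id, queue_tasks)
               for p in prompts]
    queue_multitasks.append({"id": multitask_id,
                             "subtasks_remaining": set(sub_ids),
                             "results": []})
    return multitask_id

tasks, multis = [], []
parent = split_multiprompt_task({"prompt": ["hello", "world"], "temperature": 0.8}, tasks, multis)
print(parent, [t["id"] for t in tasks])  # parent multitask id, then one queued subtask per prompt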
|
||||
|
||||
void process_tasks()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
@@ -1381,7 +1512,7 @@ struct llama_server_context
|
||||
{
|
||||
LOG_TEE("slot unavailable\n");
|
||||
// send error result
|
||||
send_error(task.id, "slot unavailable");
|
||||
send_error(task, "slot unavailable");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1395,11 +1526,12 @@ struct llama_server_context
|
||||
slot->infill = task.infill_mode;
|
||||
slot->embedding = task.embedding_mode;
|
||||
slot->task_id = task.id;
|
||||
slot->multitask_id = task.multitask_id;
|
||||
|
||||
if (!launch_slot_with_data(slot, task.data))
|
||||
{
|
||||
// send error result
|
||||
send_error(task.id, "internal_error");
|
||||
send_error(task, "internal_error");
|
||||
break;
|
||||
}
|
||||
} break;
|
||||
@@ -1415,6 +1547,38 @@ struct llama_server_context
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
|
||||
auto queue_iterator = queue_multitasks.begin();
|
||||
while (queue_iterator != queue_multitasks.end())
|
||||
{
|
||||
if (queue_iterator->subtasks_remaining.empty())
|
||||
{
|
||||
// all subtasks done == multitask is done
|
||||
task_result aggregate_result;
|
||||
aggregate_result.id = queue_iterator->id;
|
||||
aggregate_result.stop = true;
|
||||
aggregate_result.error = false;
|
||||
|
||||
// collect json results into one json result
|
||||
std::vector<json> result_jsons;
|
||||
for (auto& subres : queue_iterator->results)
|
||||
{
|
||||
result_jsons.push_back(subres.result_json);
|
||||
aggregate_result.error = aggregate_result.error && subres.error;
|
||||
}
|
||||
aggregate_result.result_json = json{ "results", result_jsons };
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
queue_results.push_back(aggregate_result);
|
||||
|
||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||
}
|
||||
else
|
||||
{
|
||||
++queue_iterator;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool update_slots() {
|
||||
@@ -1553,11 +1717,40 @@ struct llama_server_context
|
||||
}
|
||||
else
|
||||
{
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||
}
|
||||
|
||||
slot.num_prompt_tokens = prompt_tokens.size();
|
||||
|
||||
if (slot.params.n_keep < 0)
|
||||
{
|
||||
slot.params.n_keep = slot.num_prompt_tokens;
|
||||
}
|
||||
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
||||
|
||||
// if input prompt is too big, truncate it
|
||||
if (slot.num_prompt_tokens >= slot.n_ctx)
|
||||
{
|
||||
const int n_left = slot.n_ctx - slot.params.n_keep;
|
||||
const int n_block_size = n_left / 2;
|
||||
const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
||||
|
||||
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
|
||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
|
||||
|
||||
LOG_VERBOSE("input truncated", {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_keep", slot.params.n_keep},
|
||||
{"n_left", n_left},
|
||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||
});
|
||||
slot.truncated = true;
|
||||
prompt_tokens = new_tokens;
|
||||
|
||||
slot.num_prompt_tokens = prompt_tokens.size();
|
||||
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
||||
}
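The truncation above keeps the first n_keep tokens and drops whole blocks of n_left/2 tokens from the middle of the prompt until it fits. A worked example with made-up numbers (Python, for illustration only):

# Hypothetical numbers: 512-token context, keep the first 64 tokens, 700-token prompt.
n_ctx, n_keep, num_prompt_tokens = 512, 64, 700

n_left = n_ctx - n_keep                                                       # 448
n_block_size = n_left // 2                                                    # 224
erased_blocks = (num_prompt_tokens - n_keep - n_block_size) // n_block_size   # 1

prompt_tokens = list(range(num_prompt_tokens))
new_tokens = prompt_tokens[:n_keep] + prompt_tokens[n_keep + erased_blocks * n_block_size:]
print(len(new_tokens))  # 476, which now fits in the 512-token context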
|
||||
|
||||
if (!slot.params.cache_prompt)
|
||||
{
|
||||
llama_sampling_reset(slot.ctx_sampling);
|
||||
@@ -1567,35 +1760,6 @@ struct llama_server_context
|
||||
}
|
||||
else
|
||||
{
|
||||
if (slot.params.n_keep < 0)
|
||||
{
|
||||
slot.params.n_keep = slot.num_prompt_tokens;
|
||||
}
|
||||
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
||||
|
||||
// if input prompt is too big, truncate it
|
||||
if (slot.num_prompt_tokens >= slot.n_ctx)
|
||||
{
|
||||
const int n_left = slot.n_ctx - slot.params.n_keep;
|
||||
const int n_block_size = n_left / 2;
|
||||
const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
||||
|
||||
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
|
||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
|
||||
|
||||
LOG_VERBOSE("input truncated", {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_keep", slot.params.n_keep},
|
||||
{"n_left", n_left},
|
||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||
});
|
||||
slot.truncated = true;
|
||||
prompt_tokens = new_tokens;
|
||||
|
||||
slot.num_prompt_tokens = prompt_tokens.size();
|
||||
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
||||
}
|
||||
|
||||
// push the prompt into the sampling context (do not apply grammar)
|
||||
for (auto &token : prompt_tokens)
|
||||
{
|
||||
@@ -1630,7 +1794,7 @@ struct llama_server_context
|
||||
const bool has_images = process_images(slot);
|
||||
|
||||
// process the prefix of first image
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
||||
{
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
|
||||
@@ -1750,6 +1914,231 @@ struct llama_server_context
|
||||
};
|
||||
|
||||
|
||||
static std::string random_string()
{
    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

    std::random_device rd;
    std::mt19937 generator(rd());

    std::string result(32, ' ');

    for (int i = 0; i < 32; ++i) {
        result[i] = str[generator() % str.size()];
    }

    return result;
}

static std::string gen_chatcmplid()
{
    std::stringstream chatcmplid;
    chatcmplid << "chatcmpl-" << random_string();
    return chatcmplid.str();
}

std::string format_chatml(std::vector<json> messages)
{
    std::ostringstream chatml_msgs;

    for (auto it = messages.begin(); it != messages.end(); ++it) {
        chatml_msgs << "<|im_start|>"
                    << json_value(*it, "role", std::string("user")) << '\n';
        chatml_msgs << json_value(*it, "content", std::string(""))
                    << "<|im_end|>\n";
    }

    chatml_msgs << "<|im_start|>assistant" << '\n';

    return chatml_msgs.str();
}
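format_chatml above flattens OpenAI-style messages into a ChatML prompt ending with an open assistant turn. A small Python sketch of the string it produces (illustrative reimplementation, not the server code):

def format_chatml(messages):
    # one <|im_start|>role ... <|im_end|> block per message, then an open assistant turn
    out = ""
    for m in messages:
        out += "<|im_start|>" + m.get("role", "user") + "\n"
        out += m.get("content", "") + "<|im_end|>\n"
    return out + "<|im_start|>assistant\n"

print(format_chatml([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi!"},
]))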
|
||||
|
||||
/* llama.cpp completion api semantics */
|
||||
json oaicompat_completion_params_parse(
|
||||
const json &body /* openai api json semantics */)
|
||||
{
|
||||
json llama_params;
|
||||
|
||||
llama_params["__oaicompat"] = true;
|
||||
|
||||
// Map OpenAI parameters to llama.cpp parameters
|
||||
llama_params["model"] = json_value(body, "model", std::string("uknown"));
|
||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
||||
llama_params["top_k"] = json_value(body, "top_k", 40);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 0.95);
|
||||
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
|
||||
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
|
||||
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
|
||||
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
||||
llama_params["seed"] = json_value(body, "seed", 0);
|
||||
llama_params["stream"] = json_value(body, "stream", false);
|
||||
llama_params["mirostat"] = json_value(body, "mirostat", false);
|
||||
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
|
||||
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
|
||||
llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
|
||||
llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
|
||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
|
||||
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
|
||||
|
||||
if (llama_params.count("grammar") != 0) {
|
||||
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
||||
}
|
||||
|
||||
// Handle 'stop' field
|
||||
if (body.contains("stop") && body["stop"].is_string()) {
|
||||
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
|
||||
} else {
|
||||
llama_params["stop"] = json_value(body, "stop", json::array());
|
||||
}
|
||||
|
||||
// Ensure there is ChatML-specific end sequence among stop words
|
||||
llama_params["stop"].push_back("<|im_end|>");
|
||||
|
||||
return llama_params;
|
||||
}
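To make the mapping above concrete, here is an illustrative sketch (Python) of what a typical OpenAI-style body becomes; the defaults follow the json_value calls listed above, and all concrete values are made up:

openai_body = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hi!"}],
    "max_tokens": 128,
    "stop": "###",
}

# Roughly what oaicompat_completion_params_parse would produce for that body
# (only a few of the fields are shown):
llama_params = {
    "__oaicompat": True,
    "model": "gpt-3.5-turbo",
    "prompt": "<|im_start|>user\nHi!<|im_end|>\n<|im_start|>assistant\n",
    "n_predict": 128,               # from max_tokens; defaults to -1 when absent
    "temperature": 0.8,             # default used when the field is missing
    "top_k": 40,
    "top_p": 0.95,
    "stream": False,
    "stop": ["###", "<|im_end|>"],  # a string stop is wrapped in an array, ChatML end token appended
}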
|
||||
|
||||
static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
|
||||
{
|
||||
json result = response.result_json;
|
||||
|
||||
bool stopped_word = result.count("stopped_word") != 0;
|
||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
||||
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
|
||||
std::string content = json_value(result, "content", std::string(""));
|
||||
|
||||
std::string finish_reason = "length";
|
||||
if (stopped_word || stopped_eos) {
|
||||
finish_reason = "stop";
|
||||
}
|
||||
|
||||
json choices =
|
||||
streaming ? json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"delta", json::object()}}})
|
||||
: json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"message", json{{"content", content},
|
||||
{"role", "assistant"}}}}});
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
|
||||
json res =
|
||||
json{{"choices", choices},
|
||||
{"created", t},
|
||||
{"model",
|
||||
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
|
||||
{"usage",
|
||||
json{{"completion_tokens", num_tokens_predicted},
|
||||
{"prompt_tokens", num_prompt_tokens},
|
||||
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
|
||||
{"id", gen_chatcmplid()}};
|
||||
|
||||
if (server_verbose) {
|
||||
res["__verbose"] = result;
|
||||
}
|
||||
|
||||
if (result.contains("completion_probabilities")) {
|
||||
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
|
||||
}
|
||||
|
||||
return res;
|
||||
}
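The non-streaming result built above has the shape of a standard chat.completion object. An illustrative example of the JSON it yields, written as a Python dict (every value below is made up):

final_response_example = {
    "object": "chat.completion",
    "created": 1700000000,
    "id": "chatcmpl-...",           # random 32-character suffix from gen_chatcmplid
    "model": "gpt-3.5-turbo",
    "choices": [{
        "index": 0,
        "finish_reason": "stop",    # "length" when the token limit was hit instead
        "message": {"role": "assistant", "content": "Hello!"},
    }],
    "usage": {"prompt_tokens": 10, "completion_tokens": 3, "total_tokens": 13},
}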
|
||||
|
||||
// return value is vector as there is one case where we might need to generate two responses
|
||||
static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
|
||||
json result = response.result_json;
|
||||
|
||||
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
||||
return std::vector<json>({response.result_json});
|
||||
}
|
||||
|
||||
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
|
||||
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||
|
||||
bool stopped_word = json_value(result, "stopped_word", false);
|
||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||
bool stopped_limit = json_value(result, "stopped_limit", false);
|
||||
std::string content = json_value(result, "content", std::string(""));
|
||||
|
||||
std::string finish_reason;
|
||||
if (stopped_word || stopped_eos) {
|
||||
finish_reason = "stop";
|
||||
}
|
||||
if (stopped_limit) {
|
||||
finish_reason = "length";
|
||||
}
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
|
||||
json choices;
|
||||
|
||||
if (!finish_reason.empty()) {
|
||||
choices = json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"delta", json::object()}}});
|
||||
} else {
|
||||
if (first) {
|
||||
if (content.empty()) {
|
||||
choices = json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{{"role", "assistant"}}}}});
|
||||
} else {
|
||||
// We have to send this as two updates to conform to openai behavior
|
||||
json initial_ret = json{{"choices", json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"role", "assistant"}
|
||||
}}}})},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
json second_ret = json{
|
||||
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"content", content}}}
|
||||
}})},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
return std::vector<json>({initial_ret, second_ret});
|
||||
}
|
||||
} else {
|
||||
// Some idiosyncrasy in task processing logic makes several trailing calls
|
||||
// with empty content, we ignore these at the callee site.
|
||||
if (content.empty()) {
|
||||
return std::vector<json>({json::object()});
|
||||
}
|
||||
|
||||
choices = json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta",
|
||||
json{
|
||||
{"content", content},
|
||||
}},
|
||||
}});
|
||||
}
|
||||
}
|
||||
|
||||
json ret = json{{"choices", choices},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
return std::vector<json>({ret});
|
||||
}
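On the streaming path above, the first token of a reply is delivered as two chat.completion.chunk updates, a role delta followed by a content delta, to match OpenAI behaviour. A sketch of that pair as Python dicts (values made up):

first_chunk = {
    "object": "chat.completion.chunk",
    "id": "chatcmpl-...",
    "created": 1700000000,
    "model": "gpt-3.5-turbo",
    "choices": [{"index": 0, "finish_reason": None, "delta": {"role": "assistant"}}],
}
second_chunk = {
    "object": "chat.completion.chunk",
    "id": "chatcmpl-...",
    "created": 1700000000,
    "model": "gpt-3.5-turbo",
    "choices": [{"index": 0, "finish_reason": None, "delta": {"content": "Hello"}}],
}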
|
||||
|
||||
static json format_partial_response(
|
||||
llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
|
||||
@@ -1782,8 +2171,6 @@ static json format_detokenized_response(std::string content)
|
||||
{"content", content}};
|
||||
}
|
||||
|
||||
|
||||
|
||||
struct token_translator
|
||||
{
|
||||
llama_context * ctx;
|
||||
@@ -1979,7 +2366,7 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
// params.model_alias ??
|
||||
params.model_alias = request->modelfile();
|
||||
params.n_ctx = request->contextsize();
|
||||
params.memory_f16 = request->f16memory();
|
||||
//params.memory_f16 = request->f16memory();
|
||||
params.n_threads = request->threads();
|
||||
params.n_gpu_layers = request->ngpulayers();
|
||||
params.n_batch = request->nbatch();
|
||||
@@ -2086,7 +2473,7 @@ public:
|
||||
}
|
||||
grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
|
||||
json data = parse_options(true, request, llama);
|
||||
const int task_id = llama.request_completion(data, false, false);
|
||||
const int task_id = llama.request_completion(data, false, false, -1);
|
||||
while (true)
|
||||
{
|
||||
task_result result = llama.next_result(task_id);
|
||||
@@ -2122,7 +2509,7 @@ public:
|
||||
|
||||
grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
|
||||
json data = parse_options(false, request, llama);
|
||||
const int task_id = llama.request_completion(data, false, false);
|
||||
const int task_id = llama.request_completion(data, false, false, -1);
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
if (!result.error && result.stop) {
|
||||
|
||||
@@ -5,7 +5,6 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
bert "github.com/go-skynet/LocalAI/pkg/backend/llm/bert"
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
@@ -16,7 +15,7 @@ var (
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &bert.Embeddings{}); err != nil {
|
||||
if err := grpc.StartServer(*addr, &StableDiffusion{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package image
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
@@ -1,4 +1,4 @@
|
||||
package bert
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
@@ -5,8 +5,6 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
rwkv "github.com/go-skynet/LocalAI/pkg/backend/llm/rwkv"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
@@ -17,7 +15,7 @@ var (
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &rwkv.LLM{}); err != nil {
|
||||
if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -5,7 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
@@ -5,7 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
@@ -5,7 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
@@ -1,4 +1,4 @@
|
||||
package gpt4all
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
@@ -5,8 +5,6 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
gpt4all "github.com/go-skynet/LocalAI/pkg/backend/llm/gpt4all"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
@@ -17,7 +15,7 @@ var (
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &gpt4all.LLM{}); err != nil {
|
||||
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -5,7 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
@@ -5,7 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
@@ -1,4 +1,4 @@
|
||||
package langchain
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
backend/go/llm/langchain/main.go (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package llama
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
@@ -3,8 +3,6 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
@@ -15,7 +13,7 @@ var (
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
|
||||
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package llama
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
@@ -1,12 +1,12 @@
|
||||
package main
|
||||
|
||||
// GRPC Falcon server
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
tts "github.com/go-skynet/LocalAI/pkg/backend/tts"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
@@ -17,7 +17,7 @@ var (
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &tts.Piper{}); err != nil {
|
||||
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -5,7 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
@@ -5,7 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
backend/go/llm/rwkv/main.go (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package rwkv
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
@@ -5,7 +5,7 @@ package main
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
backend/go/transcribe/main.go (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &Whisper{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package transcribe
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@@ -1,4 +1,4 @@
|
||||
package transcribe
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
backend/go/tts/main.go (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &Piper{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package tts
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
@@ -1,4 +1,4 @@
.PONY: autogptq
.PHONY: autogptq
autogptq:
	@echo "Creating virtual environment..."
	@conda env create --name autogptq --file autogptq.yml
backend/python/autogptq/backend_pb2.py (new file, 61 lines; diff suppressed because one or more lines are too long)
backend/python/bark/Makefile (new file, 17 lines)
@@ -0,0 +1,17 @@
|
||||
.PHONY: ttsbark
|
||||
ttsbark:
|
||||
@echo "Creating virtual environment..."
|
||||
@conda env create --name ttsbark --file ttsbark.yml
|
||||
@echo "Virtual environment created."
|
||||
|
||||
.PHONY: run
|
||||
run:
|
||||
@echo "Running bark..."
|
||||
bash run.sh
|
||||
@echo "bark run."
|
||||
|
||||
.PHONY: test
|
||||
test:
|
||||
@echo "Testing bark..."
|
||||
bash test.sh
|
||||
@echo "bark tested."
|
||||
backend/python/bark/backend_pb2.py (new file, 61 lines; diff suppressed because one or more lines are too long)
backend/python/bark/test.py (new file, 81 lines)
@@ -0,0 +1,81 @@
|
||||
"""
|
||||
A test script to test the gRPC service
|
||||
"""
|
||||
import unittest
|
||||
import subprocess
|
||||
import time
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
|
||||
import grpc
|
||||
|
||||
|
||||
class TestBackendServicer(unittest.TestCase):
|
||||
"""
|
||||
TestBackendServicer is the class that tests the gRPC service
|
||||
"""
|
||||
def setUp(self):
|
||||
"""
|
||||
This method sets up the gRPC service by starting the server
|
||||
"""
|
||||
self.service = subprocess.Popen(["python3", "ttsbark.py", "--addr", "localhost:50051"])
|
||||
time.sleep(10)
|
||||
|
||||
def tearDown(self) -> None:
|
||||
"""
|
||||
This method tears down the gRPC service by terminating the server
|
||||
"""
|
||||
self.service.terminate()
|
||||
self.service.wait()
|
||||
|
||||
def test_server_startup(self):
|
||||
"""
|
||||
This method tests if the server starts up successfully
|
||||
"""
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.Health(backend_pb2.HealthMessage())
|
||||
self.assertEqual(response.message, b'OK')
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("Server failed to start")
|
||||
finally:
|
||||
self.tearDown()
|
||||
|
||||
def test_load_model(self):
|
||||
"""
|
||||
This method tests if the model is loaded successfully
|
||||
"""
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="v2/en_speaker_4"))
|
||||
self.assertTrue(response.success)
|
||||
self.assertEqual(response.message, "Model loaded successfully")
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("LoadModel service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
|
||||
def test_tts(self):
|
||||
"""
|
||||
This method tests if the embeddings are generated successfully
|
||||
"""
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="v2/en_speaker_4"))
|
||||
self.assertTrue(response.success)
|
||||
tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
|
||||
tts_response = stub.TTS(tts_request)
|
||||
self.assertIsNotNone(tts_response)
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("TTS service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/bin/bash
|
||||
##
|
||||
## A bash script wrapper that runs the huggingface server with conda
|
||||
## A bash script wrapper that runs the bark server with conda
|
||||
|
||||
# Activate conda environment
|
||||
source activate huggingface
|
||||
source activate ttsbark
|
||||
|
||||
# get the directory where the bash script is located
|
||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
|
||||
python -m unittest $DIR/test_huggingface.py
|
||||
python -m unittest $DIR/test.py
|
||||
@@ -1,8 +1,7 @@
|
||||
"""
|
||||
This is the extra gRPC server of LocalAI
|
||||
"""
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
This is an extra gRPC server of LocalAI for Bark TTS
|
||||
"""
|
||||
from concurrent import futures
|
||||
import time
|
||||
import argparse
|
||||
@@ -1,11 +1,14 @@
|
||||
.PONY: diffusers
|
||||
.PHONY: diffusers
|
||||
diffusers:
|
||||
@echo "Creating virtual environment..."
|
||||
@conda env create --name diffusers --file diffusers.yml
|
||||
@echo "Virtual environment created."
|
||||
|
||||
.PONY: run
|
||||
.PHONY: run
|
||||
run:
|
||||
@echo "Running diffusers..."
|
||||
bash run.sh
|
||||
@echo "Diffusers run."
|
||||
@echo "Diffusers run."
|
||||
|
||||
test:
|
||||
bash test.sh
|
||||
@@ -18,9 +18,9 @@ import backend_pb2_grpc
|
||||
import grpc
|
||||
|
||||
from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
|
||||
from diffusers import StableDiffusionImg2ImgPipeline
|
||||
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import safety_checker
|
||||
|
||||
from diffusers.utils import load_image,export_to_video
|
||||
from compel import Compel
|
||||
|
||||
from transformers import CLIPTextModel
|
||||
@@ -30,6 +30,11 @@ from safetensors.torch import load_file
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
COMPEL=os.environ.get("COMPEL", "1") == "1"
|
||||
CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1"
|
||||
SAFETENSORS=os.environ.get("SAFETENSORS", "1") == "1"
|
||||
CHUNK_SIZE=os.environ.get("CHUNK_SIZE", "8")
|
||||
FPS=os.environ.get("FPS", "7")
|
||||
DISABLE_CPU_OFFLOAD=os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
|
||||
FRAMES=os.environ.get("FRAMES", "64")
|
||||
|
||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
@@ -135,8 +140,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
print(f"Loading model {request.Model}...", file=sys.stderr)
|
||||
print(f"Request {request}", file=sys.stderr)
|
||||
torchType = torch.float32
|
||||
variant = None
|
||||
|
||||
if request.F16Memory:
|
||||
torchType = torch.float16
|
||||
variant="fp16"
|
||||
|
||||
local = False
|
||||
modelFile = request.Model
|
||||
@@ -159,15 +167,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
modelFile = request.ModelFile
|
||||
|
||||
fromSingleFile = request.Model.startswith("http") or request.Model.startswith("/") or local
|
||||
|
||||
if request.IMG2IMG and request.PipelineType == "":
|
||||
request.PipelineType == "StableDiffusionImg2ImgPipeline"
|
||||
|
||||
if request.PipelineType == "":
|
||||
request.PipelineType == "StableDiffusionPipeline"
|
||||
|
||||
self.img2vid=False
|
||||
self.txt2vid=False
|
||||
## img2img
|
||||
if request.PipelineType == "StableDiffusionImg2ImgPipeline":
|
||||
if (request.PipelineType == "StableDiffusionImg2ImgPipeline") or (request.IMG2IMG and request.PipelineType == ""):
|
||||
if fromSingleFile:
|
||||
self.pipe = StableDiffusionImg2ImgPipeline.from_single_file(modelFile,
|
||||
torch_dtype=torchType,
|
||||
@@ -177,12 +180,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
torch_dtype=torchType,
|
||||
guidance_scale=cfg_scale)
|
||||
|
||||
if request.PipelineType == "StableDiffusionDepth2ImgPipeline":
|
||||
elif request.PipelineType == "StableDiffusionDepth2ImgPipeline":
|
||||
self.pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(request.Model,
|
||||
torch_dtype=torchType,
|
||||
guidance_scale=cfg_scale)
|
||||
## img2vid
|
||||
elif request.PipelineType == "StableVideoDiffusionPipeline":
|
||||
self.img2vid=True
|
||||
self.pipe = StableVideoDiffusionPipeline.from_pretrained(
|
||||
request.Model, torch_dtype=torchType, variant=variant
|
||||
)
|
||||
if not DISABLE_CPU_OFFLOAD:
|
||||
self.pipe.enable_model_cpu_offload()
|
||||
## text2img
|
||||
if request.PipelineType == "StableDiffusionPipeline":
|
||||
elif request.PipelineType == "AutoPipelineForText2Image" or request.PipelineType == "":
|
||||
self.pipe = AutoPipelineForText2Image.from_pretrained(request.Model,
|
||||
torch_dtype=torchType,
|
||||
use_safetensors=SAFETENSORS,
|
||||
variant=variant,
|
||||
guidance_scale=cfg_scale)
|
||||
elif request.PipelineType == "StableDiffusionPipeline":
|
||||
if fromSingleFile:
|
||||
self.pipe = StableDiffusionPipeline.from_single_file(modelFile,
|
||||
torch_dtype=torchType,
|
||||
@@ -191,13 +208,16 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
self.pipe = StableDiffusionPipeline.from_pretrained(request.Model,
|
||||
torch_dtype=torchType,
|
||||
guidance_scale=cfg_scale)
|
||||
|
||||
if request.PipelineType == "DiffusionPipeline":
|
||||
elif request.PipelineType == "DiffusionPipeline":
|
||||
self.pipe = DiffusionPipeline.from_pretrained(request.Model,
|
||||
torch_dtype=torchType,
|
||||
guidance_scale=cfg_scale)
|
||||
|
||||
if request.PipelineType == "StableDiffusionXLPipeline":
|
||||
elif request.PipelineType == "VideoDiffusionPipeline":
|
||||
self.txt2vid=True
|
||||
self.pipe = DiffusionPipeline.from_pretrained(request.Model,
|
||||
torch_dtype=torchType,
|
||||
guidance_scale=cfg_scale)
|
||||
elif request.PipelineType == "StableDiffusionXLPipeline":
|
||||
if fromSingleFile:
|
||||
self.pipe = StableDiffusionXLPipeline.from_single_file(modelFile,
|
||||
torch_dtype=torchType, use_safetensors=True,
|
||||
@@ -207,21 +227,35 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
request.Model,
|
||||
torch_dtype=torchType,
|
||||
use_safetensors=True,
|
||||
# variant="fp16"
|
||||
variant=variant,
|
||||
guidance_scale=cfg_scale)
|
||||
# https://github.com/huggingface/diffusers/issues/4446
|
||||
# do not use text_encoder in the constructor since then
|
||||
# https://github.com/huggingface/diffusers/issues/3212#issuecomment-1521841481
|
||||
|
||||
if CLIPSKIP and request.CLIPSkip != 0:
|
||||
text_encoder = CLIPTextModel.from_pretrained(clipmodel, num_hidden_layers=request.CLIPSkip, subfolder=clipsubfolder, torch_dtype=torchType)
|
||||
self.pipe.text_encoder=text_encoder
|
||||
self.clip_skip = request.CLIPSkip
|
||||
else:
|
||||
self.clip_skip = 0
|
||||
|
||||
# torch_dtype needs to be customized. float16 for GPU, float32 for CPU
|
||||
# TODO: this needs to be customized
|
||||
if request.SchedulerType != "":
|
||||
self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config)
|
||||
self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
|
||||
|
||||
if not self.img2vid:
|
||||
self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
|
||||
|
||||
|
||||
if request.ControlNet:
|
||||
self.controlnet = ControlNetModel.from_pretrained(
|
||||
request.ControlNet, torch_dtype=torchType, variant=variant
|
||||
)
|
||||
self.pipe.controlnet = self.controlnet
|
||||
else:
|
||||
self.controlnet = None
|
||||
|
||||
if request.CUDA:
|
||||
self.pipe.to('cuda')
|
||||
if self.controlnet:
|
||||
self.controlnet.to('cuda')
|
||||
# Assume directory from request.ModelFile.
|
||||
# Only if request.LoraAdapter it's not an absolute path
|
||||
if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter:
|
||||
@@ -303,17 +337,28 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
prompt = request.positive_prompt
|
||||
|
||||
steps = 1
|
||||
|
||||
if request.step != 0:
|
||||
steps = request.step
|
||||
|
||||
# create a dictionary of values for the parameters
|
||||
options = {
|
||||
"negative_prompt": request.negative_prompt,
|
||||
"width": request.width,
|
||||
"height": request.height,
|
||||
"num_inference_steps": request.step,
|
||||
"num_inference_steps": steps,
|
||||
}
|
||||
|
||||
if request.src != "":
|
||||
if request.src != "" and not self.controlnet and not self.img2vid:
|
||||
image = Image.open(request.src)
|
||||
options["image"] = image
|
||||
elif self.controlnet and request.src:
|
||||
pose_image = load_image(request.src)
|
||||
options["image"] = pose_image
|
||||
|
||||
if CLIPSKIP and self.clip_skip != 0:
|
||||
options["clip_skip"]=self.clip_skip
|
||||
|
||||
# Get the keys that we will build the args for our pipe for
|
||||
keys = options.keys()
|
||||
@@ -333,6 +378,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
request.seed
|
||||
)
|
||||
|
||||
if self.img2vid:
|
||||
# Load the conditioning image
|
||||
image = load_image(request.src)
|
||||
image = image.resize((1024, 576))
|
||||
|
||||
generator = torch.manual_seed(request.seed)
|
||||
frames = self.pipe(image, decode_chunk_size=CHUNK_SIZE, generator=generator).frames[0]
|
||||
export_to_video(frames, request.dst, fps=FPS)
|
||||
return backend_pb2.Result(message="Media generated successfully", success=True)
|
||||
|
||||
if self.txt2vid:
|
||||
video_frames = self.pipe(prompt, num_inference_steps=steps, num_frames=int(FRAMES)).frames
|
||||
export_to_video(video_frames, request.dst)
|
||||
return backend_pb2.Result(message="Media generated successfully", success=True)
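The two video paths above are tuned through environment variables declared near the top of the backend (CHUNK_SIZE, FPS, FRAMES, DISABLE_CPU_OFFLOAD). A small sketch of the same pattern, with the values coerced to int here for clarity (the backend reads them as strings):

import os

# Same environment-driven defaults as the backend above.
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "8"))              # decode_chunk_size for img2vid
FPS = int(os.environ.get("FPS", "7"))                            # frame rate of the exported video
FRAMES = int(os.environ.get("FRAMES", "64"))                     # frame count for txt2vid
DISABLE_CPU_OFFLOAD = os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"

print(CHUNK_SIZE, FPS, FRAMES, DISABLE_CPU_OFFLOAD)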
|
||||
|
||||
image = {}
|
||||
if COMPEL:
|
||||
conditioning = self.compel.build_conditioning_tensor(prompt)
|
||||
@@ -351,7 +411,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# save the result
|
||||
image.save(request.dst)
|
||||
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
return backend_pb2.Result(message="Media generated", success=True)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
backend/python/diffusers/backend_pb2.py (new file, 61 lines; diff suppressed because one or more lines are too long)
@@ -25,15 +25,15 @@ dependencies:
|
||||
- xz=5.4.2=h5eee18b_0
|
||||
- zlib=1.2.13=h5eee18b_0
|
||||
- pip:
|
||||
- accelerate==0.23.0
|
||||
- accelerate>=0.11.0
|
||||
- certifi==2023.7.22
|
||||
- charset-normalizer==3.3.0
|
||||
- compel==2.0.2
|
||||
- diffusers==0.21.4
|
||||
- diffusers==0.24.0
|
||||
- filelock==3.12.4
|
||||
- fsspec==2023.9.2
|
||||
- grpcio==1.59.0
|
||||
- huggingface-hub==0.17.3
|
||||
- huggingface-hub>=0.19.4
|
||||
- idna==3.4
|
||||
- importlib-metadata==6.8.0
|
||||
- jinja2==3.1.2
|
||||
@@ -63,10 +63,9 @@ dependencies:
|
||||
- requests==2.31.0
|
||||
- safetensors==0.4.0
|
||||
- sympy==1.12
|
||||
- tokenizers==0.14.1
|
||||
- torch==2.1.0
|
||||
- tqdm==4.66.1
|
||||
- transformers==4.34.0
|
||||
- transformers>=4.25.1
|
||||
- triton==2.1.0
|
||||
- typing-extensions==4.8.0
|
||||
- urllib3==2.0.6
|
||||
backend/python/diffusers/test.py (new file, 84 lines)
@@ -0,0 +1,84 @@
|
||||
"""
|
||||
A test script to test the gRPC service
|
||||
"""
|
||||
import unittest
|
||||
import subprocess
|
||||
import time
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
|
||||
import grpc
|
||||
|
||||
|
||||
class TestBackendServicer(unittest.TestCase):
|
||||
"""
|
||||
TestBackendServicer is the class that tests the gRPC service
|
||||
"""
|
||||
def setUp(self):
|
||||
"""
|
||||
This method sets up the gRPC service by starting the server
|
||||
"""
|
||||
self.service = subprocess.Popen(["python3", "backend_diffusers.py", "--addr", "localhost:50051"])
|
||||
|
||||
def tearDown(self) -> None:
|
||||
"""
|
||||
This method tears down the gRPC service by terminating the server
|
||||
"""
|
||||
self.service.kill()
|
||||
self.service.wait()
|
||||
|
||||
def test_server_startup(self):
|
||||
"""
|
||||
This method tests if the server starts up successfully
|
||||
"""
|
||||
time.sleep(10)
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.Health(backend_pb2.HealthMessage())
|
||||
self.assertEqual(response.message, b'OK')
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("Server failed to start")
|
||||
finally:
|
||||
self.tearDown()
|
||||
|
||||
def test_load_model(self):
|
||||
"""
|
||||
This method tests if the model is loaded successfully
|
||||
"""
|
||||
time.sleep(10)
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
|
||||
self.assertTrue(response.success)
|
||||
self.assertEqual(response.message, "Model loaded successfully")
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("LoadModel service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
|
||||
def test(self):
|
||||
"""
|
||||
This method tests if the backend can generate images
|
||||
"""
|
||||
time.sleep(10)
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
|
||||
print(response.message)
|
||||
self.assertTrue(response.success)
|
||||
image_req = backend_pb2.GenerateImageRequest(positive_prompt="cat", width=16,height=16, dst="test.jpg")
|
||||
re = stub.GenerateImage(image_req)
|
||||
self.assertTrue(re.success)
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("Image gen service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
backend/python/diffusers/test.sh (new file, 14 lines)
@@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
##
|
||||
## A bash script wrapper that runs the diffusers server with conda
|
||||
|
||||
export PATH=$PATH:/opt/conda/bin
|
||||
|
||||
# Activate conda environment
|
||||
source activate diffusers
|
||||
|
||||
# get the directory where the bash script is located
|
||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
|
||||
python -m unittest $DIR/test.py
|
||||
@@ -1,11 +1,12 @@
|
||||
.PONY: exllama
|
||||
.PHONY: exllama
|
||||
exllama:
|
||||
@echo "Creating virtual environment..."
|
||||
@conda env create --name exllama --file exllama.yml
|
||||
@echo "Virtual environment created."
|
||||
bash install.sh
|
||||
|
||||
.PONY: run
|
||||
.PHONY: run
|
||||
run:
|
||||
@echo "Running exllama..."
|
||||
bash run.sh
|
||||
@echo "exllama run."
|
||||
@echo "exllama run."
|
||||
backend/python/exllama/backend_pb2.py (new file, 61 lines; diff suppressed because one or more lines are too long)
@@ -13,9 +13,10 @@ from pathlib import Path
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import version as torch_version
|
||||
from exllama.generator import ExLlamaGenerator
|
||||
from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
|
||||
from exllama.tokenizer import ExLlamaTokenizer
|
||||
|
||||
from tokenizer import ExLlamaTokenizer
|
||||
from generator import ExLlamaGenerator
|
||||
from model import ExLlama, ExLlamaCache, ExLlamaConfig
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
@@ -63,6 +64,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
config = ExLlamaConfig(model_config_path) # create config from config.json
|
||||
config.model_path = model_path # supply path to model weights file
|
||||
if (request.ContextSize):
|
||||
config.max_seq_len = request.ContextSize # override max sequence length
|
||||
config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2.
|
||||
# https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
|
||||
|
||||
# Set Rope scaling.
|
||||
if (request.RopeFreqScale):
|
||||
# Alpha value for Rope scaling.
|
||||
# Higher value increases context but adds perplexity.
|
||||
# alpha_value and compress_pos_emb are mutually exclusive.
|
||||
# https://github.com/turboderp/exllama/issues/115
|
||||
config.alpha_value = request.RopeFreqScale
|
||||
config.calculate_rotary_embedding_base()
|
||||
|
||||
model = ExLlama(config) # create ExLlama instance and load the weights
|
||||
tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file
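A quick illustration of the context and RoPE settings introduced above, with hypothetical request values (Python, not taken from any real configuration):

# Hypothetical request values; mirrors the exllama config fields set in the hunk above.
context_size = 4096
rope_freq_scale = 2.0                     # used as the exllama alpha value

max_seq_len = context_size
max_attention_size = context_size ** 2    # 16777216, per the upstream recommendation
alpha_value = rope_freq_scale             # higher alpha -> longer context, slightly more perplexity
print(max_seq_len, max_attention_size, alpha_value)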
|
||||
@@ -33,6 +33,7 @@ dependencies:
|
||||
- mpmath==1.3.0
|
||||
- networkx==3.1
|
||||
- ninja==1.11.1
|
||||
- protobuf==4.24.4
|
||||
- nvidia-cublas-cu12==12.1.3.1
|
||||
- nvidia-cuda-cupti-cu12==12.1.105
|
||||
- nvidia-cuda-nvrtc-cu12==12.1.105
|
||||
@@ -45,11 +46,11 @@ dependencies:
|
||||
- nvidia-nccl-cu12==2.18.1
|
||||
- nvidia-nvjitlink-cu12==12.2.140
|
||||
- nvidia-nvtx-cu12==12.1.105
|
||||
- protobuf==4.24.4
|
||||
- safetensors==0.3.2
|
||||
- sentencepiece==0.1.99
|
||||
- sympy==1.12
|
||||
- torch==2.1.0
|
||||
- triton==2.1.0
|
||||
- typing-extensions==4.8.0
|
||||
- numpy
|
||||
prefix: /opt/conda/envs/exllama
|
||||
backend/python/exllama/install.sh (new executable file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
|
||||
|
||||
##
|
||||
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
|
||||
export PATH=$PATH:/opt/conda/bin
|
||||
|
||||
# Activate conda environment
|
||||
source activate exllama
|
||||
|
||||
echo $CONDA_PREFIX
|
||||
|
||||
|
||||
git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd
|
||||
|
||||
cp -rfv $CONDA_PREFIX/exllama/* ./
|
||||
backend/python/exllama2/Makefile (new file, 12 lines)
@@ -0,0 +1,12 @@
|
||||
.PHONY: exllama2
|
||||
exllama2:
|
||||
@echo "Creating virtual environment..."
|
||||
@conda env create --name exllama2 --file exllama2.yml
|
||||
@echo "Virtual environment created."
|
||||
bash install.sh
|
||||
|
||||
.PHONY: run
|
||||
run:
|
||||
@echo "Running exllama2..."
|
||||
bash run.sh
|
||||
@echo "exllama2 run."
|
||||
backend/python/exllama2/backend_pb2.py (new file, 61 lines; diff suppressed because one or more lines are too long)
Some files were not shown because too many files have changed in this diff.