⬆️ Update ggerganov/llama.cpp (#1655 )

Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>
feat(startup): fetch model definition remotely (#1654 )
2026-02-03 03:02:38 -05:00 · 2024-01-28 09:24:44 +01:00 · 2024-01-28 00:14:16 +01:00 · 2024-01-27 00:13:38 +01:00 · 2024-01-27 00:13:19 +01:00 · 2024-01-26 18:35:33 +01:00
344 changed files with 45519 additions and 2261 deletions
--- a/.env
+++ b/.env
@@ -66,4 +66,24 @@ MODELS_PATH=/models
 ### Python backends GRPC max workers
 ### Default number of workers for GRPC Python backends.
 ### This actually controls wether a backend can process multiple requests or not.
-# PYTHON_GRPC_MAX_WORKERS=1
+# PYTHON_GRPC_MAX_WORKERS=1
+
+### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
+# LLAMACPP_PARALLEL=1
+
+### Enable to run parallel requests
+# PARALLEL_REQUESTS=true
+
+### Watchdog settings
+###
+# Enables watchdog to kill backends that are inactive for too long
+# WATCHDOG_IDLE=true
+#
+# Enables watchdog to kill backends that are busy for too long
+# WATCHDOG_BUSY=true
+#
+# Time in duration format (e.g. 1h30m) after which a backend is considered idle
+# WATCHDOG_IDLE_TIMEOUT=5m
+#
+# Time in duration format (e.g. 1h30m) after which a backend is considered busy
+# WATCHDOG_BUSY_TIMEOUT=5m
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -2,9 +2,7 @@
 name: Bug report
 about: Create a report to help us improve
 title: ''
-labels: bug
-assignees: mudler
-
+labels: bug, unconfirmed, up-for-grabs
 ---

 <!-- Thanks for helping us to improve LocalAI! We welcome all bug reports. Please fill out each area of the template so we can better help you. Comments like this will be hidden when you post but you can delete them if you wish. -->
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -2,9 +2,7 @@
 name: Feature request
 about: Suggest an idea for this project
 title: ''
-labels: enhancement
-assignees: mudler
-
+labels: enhancement, up-for-grabs
 ---

 <!-- Thanks for helping us to improve LocalAI! We welcome all feature requests. Please fill out each area of the template so we can better help you. Comments like this will be hidden when you post but you can delete them if you wish. -->
--- a/.github/bump_docs.sh
+++ b/.github/bump_docs.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -xe
+REPO=$1
+
+LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.name')
+
+cat <<< $(jq ".version = \"$LATEST_TAG\"" docs/data/version.json) > docs/data/version.json
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,6 +12,9 @@ jobs:
          - repository: "go-skynet/go-llama.cpp"
            variable: "GOLLAMA_VERSION"
            branch: "master"
+          - repository: "ggerganov/llama.cpp"
+            variable: "CPPLLAMA_VERSION"
+            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
@@ -41,7 +44,7 @@ jobs:
            branch: "master"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -0,0 +1,31 @@
+name: Bump docs version
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  bump:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: "mudler/LocalAI"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Bump docs version 🔧
+        run: |
+          bash .github/bump_docs.sh ${{ matrix.repository }}
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v5
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Update docs version ${{ matrix.repository }}'
+          title: ':arrow_up: Update docs version ${{ matrix.repository }}'
+          branch: "update/docs"
+          body: Bump of ${{ matrix.repository }} version inside docs
+          signoff: true
+
+
+
--- a/.github/workflows/disabled/test-gpu.yml
+++ b/.github/workflows/disabled/test-gpu.yml
@@ -0,0 +1,63 @@
+---
+name: 'GPU tests'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-gpu-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-latest:
+    runs-on: gpu
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y make wget
+      - name: Build
+        run: |
+          if [ ! -e /run/systemd/system ]; then
+            sudo mkdir /run/systemd/system
+          fi
+          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
+          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            BUILD_TYPE=cublas \
+            prepare-e2e run-e2e-image test-e2e
+      - name: Release space from worker ♻
+        if: always()
+        run: |
+          sudo rm -rf build || true
+          sudo rm -rf bin || true
+          sudo rm -rf dist || true
+          sudo docker logs $(sudo docker ps -q --filter ancestor=localai-tests) > logs.txt
+          sudo cat logs.txt || true
+          sudo rm -rf logs.txt
+          make clean || true
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            teardown-e2e || true
+          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }} || true
+          docker system prune -f -a --volumes || true
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -0,0 +1,86 @@
+---
+name: 'build container images tests'
+
+on:
+  pull_request:
+
+concurrency:
+  group: ci-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  extras-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      # Pushing with all jobs in parallel
+      # eats the bandwidth of all the nodes
+      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+  core-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -2,7 +2,6 @@
 name: 'build container images'

 on:
-  pull_request:
  push:
    branches:
      - master
@@ -14,128 +13,148 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  docker:
+  extras-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
    strategy:
+      # Pushing with all jobs in parallel
+      # eats the bandwidth of all the nodes
+      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
      matrix:
        include:
          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
          - build-type: 'cublas'
-            cuda-major-version: 11
-            cuda-minor-version: 7
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda11'
            ffmpeg: ''
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
          - build-type: 'cublas'
-            cuda-major-version: 12
-            cuda-minor-version: 1
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12'
            ffmpeg: ''
-          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg'
-            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
          - build-type: 'cublas'
-            cuda-major-version: 11
-            cuda-minor-version: 7
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda11-ffmpeg'
            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
          - build-type: 'cublas'
-            cuda-major-version: 12
-            cuda-minor-version: 1
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg'
            ffmpeg: 'true'
-
-    runs-on: ubuntu-latest
-    steps:
-      - name: Release space from worker
-        run: |
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          df -h
-          echo
-          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
-          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
-          sudo rm -rf /usr/local/lib/android
-          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
-          sudo rm -rf /usr/share/dotnet
-          sudo apt-get remove -y '^mono-.*' || true
-          sudo apt-get remove -y '^ghc-.*' || true
-          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
-          sudo apt-get remove -y 'php.*' || true
-          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
-          sudo apt-get remove -y '^google-.*' || true
-          sudo apt-get remove -y azure-cli || true
-          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
-          sudo apt-get remove -y '^gfortran-.*' || true
-          sudo apt-get autoremove -y
-          sudo apt-get clean
-          echo
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          sudo rm -rfv build || true
-          df -h
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: quay.io/go-skynet/local-ai
-          tags: |
-            type=ref,event=branch
-            type=semver,pattern={{raw}}
-            type=sha
-          flavor: |
-            latest=${{ matrix.tag-latest }}
-            suffix=${{ matrix.tag-suffix }}
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@master
-        with:
-          platforms: all
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@master
-
-      - name: Login to DockerHub
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          registry: quay.io
-          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-
-      - name: Build and push
-        uses: docker/build-push-action@v5
-        with:
-          builder: ${{ steps.buildx.outputs.name }}
-          build-args: |
-            BUILD_TYPE=${{ matrix.build-type }}
-            CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
-            CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
-            FFMPEG=${{ matrix.ffmpeg }}
-          context: .
-          file: ./Dockerfile
-          platforms: ${{ matrix.platforms }}
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+          - build-type: ''
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: ''
+            ffmpeg: ''
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+  core-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+          - build-type: 'cublas'
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11-core'
+            ffmpeg: ''
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-core'
+            ffmpeg: ''
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+          - build-type: 'cublas'
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -0,0 +1,160 @@
+---
+name: 'build container images (reusable)'
+
+on:
+  workflow_call:
+    inputs:
+      build-type:
+        description: 'Build type'
+        default: ''
+        type: string
+      cuda-major-version:
+        description: 'CUDA major version'
+        default: "11"
+        type: string
+      cuda-minor-version:
+        description: 'CUDA minor version'
+        default: "7"
+        type: string
+      platforms:
+        description: 'Platforms'
+        default: ''
+        type: string
+      tag-latest:
+        description: 'Tag latest'
+        default: ''
+        type: string
+      tag-suffix:
+        description: 'Tag suffix'
+        default: ''
+        type: string
+      ffmpeg:
+        description: 'FFMPEG'
+        default: ''
+        type: string
+      image-type:
+        description: 'Image type'
+        default: ''
+        type: string
+      runs-on:
+        description: 'Runs on'
+        required: true
+        default: ''
+        type: string
+    secrets:
+      dockerUsername:
+        required: true
+      dockerPassword:
+        required: true
+      quayUsername:
+        required: true
+      quayPassword:
+        required: true
+jobs:
+  reusable_image-build:
+    runs-on: ${{ inputs.runs-on }}
+    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+      - name: Checkout
+        uses: actions/checkout@v4
+      # - name: Release space from worker
+      #   run: |
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     df -h
+      #     echo
+      #     sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+      #     sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+      #     sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+      #     sudo rm -rf /usr/local/lib/android
+      #     sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+      #     sudo rm -rf /usr/share/dotnet
+      #     sudo apt-get remove -y '^mono-.*' || true
+      #     sudo apt-get remove -y '^ghc-.*' || true
+      #     sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+      #     sudo apt-get remove -y 'php.*' || true
+      #     sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+      #     sudo apt-get remove -y '^google-.*' || true
+      #     sudo apt-get remove -y azure-cli || true
+      #     sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+      #     sudo apt-get remove -y '^gfortran-.*' || true
+      #     sudo apt-get remove -y microsoft-edge-stable || true
+      #     sudo apt-get remove -y firefox || true
+      #     sudo apt-get remove -y powershell || true
+      #     sudo apt-get remove -y r-base-core || true
+      #     sudo apt-get autoremove -y
+      #     sudo apt-get clean
+      #     echo
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     sudo rm -rfv build || true
+      #     df -h
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/local-ai
+            localai/localai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.tag-suffix }}
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.dockerUsername }}
+          password: ${{ secrets.dockerPassword }}
+
+      - name: Login to quay.io
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.quayUsername }}
+          password: ${{ secrets.quayPassword }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BUILD_TYPE=${{ inputs.build-type }}
+            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
+            FFMPEG=${{ inputs.ffmpeg }}
+            IMAGE_TYPE=${{ inputs.image-type }}
+          context: .
+          file: ./Dockerfile
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+      - name: job summary
+        run: |
+          echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -5,6 +5,10 @@ on: push
 permissions:
  contents: write

+concurrency:
+  group: ci-releases-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
 jobs:
  build-linux:
    strategy:
@@ -19,7 +23,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
@@ -29,6 +33,24 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
+
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v3
+        with:
+          path: grpc
+          key: ${{ runner.os }}-grpc
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make -j12
+      - name: Install gRPC
+        run: |
+          cd grpc && cd cmake/build && sudo make -j12 install
+
      - name: Build
        id: build
        env:
@@ -60,18 +82,23 @@ jobs:
    runs-on: macOS-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
        with:
          go-version: '>=1.21.0'
+      - name: Dependencies
+        run: |
+          brew install protobuf grpc
      - name: Build
        id: build
        env:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
          make dist
      - uses: actions/upload-artifact@v3
        with:
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -0,0 +1,277 @@
+---
+name: 'Tests extras backends'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-tests-extra-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  tests-transformers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test transformers
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make -C backend/python/transformers
+           make -C backend/python/transformers test
+
+  tests-sentencetransformers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test sentencetransformers
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make -C backend/python/sentencetransformers
+           make -C backend/python/sentencetransformers test
+
+  tests-diffusers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test diffusers
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make -C backend/python/diffusers
+           make -C backend/python/diffusers test
+
+
+  tests-transformers-musicgen:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test transformers-musicgen
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make -C backend/python/transformers-musicgen
+           make -C backend/python/transformers-musicgen test
+
+
+
+  tests-petals:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test petals
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make -C backend/python/petals
+           make -C backend/python/petals test
+
+           
+
+  tests-bark:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test bark
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make -C backend/python/bark
+           make -C backend/python/bark test
+
+           
+  # The tests below need a GPU. Commented out for now
+  # TODO: Re-enable as soon as we have GPU nodes
+  # tests-vllm:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v4
+  #       with: 
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential ffmpeg
+  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+  #            sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+  #             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+  #            sudo apt-get update && \
+  #            sudo apt-get install -y conda
+  #         sudo apt-get install -y ca-certificates cmake curl patch
+  #         sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+  #         sudo rm -rfv /usr/bin/conda || true
+  #     - name: Test vllm
+  #       run: |
+  #          export PATH=$PATH:/opt/conda/bin
+  #          make -C backend/python/vllm
+  #          make -C backend/python/vllm test
+  tests-vallex:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2    
+          sudo rm -rfv /usr/bin/conda || true
+      - name: Test vall-e-x
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make -C backend/python/vall-e-x
+           make -C backend/python/vall-e-x test
+
+  tests-coqui:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng          
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test coqui
+        run: |
+           export PATH=$PATH:/opt/conda/bin
+           make -C backend/python/coqui
+           make -C backend/python/coqui test
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -14,7 +14,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  ubuntu-latest:
+  tests-linux:
    runs-on: ubuntu-latest
    strategy:
      matrix:
@@ -53,7 +53,7 @@ jobs:
          sudo rm -rfv build || true
          df -h
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -67,38 +67,53 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          sudo pip install -r extra/requirements.txt
+          
+          sudo rm -rfv /usr/bin/conda || true
+          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers

-          sudo mkdir /build && sudo chmod -R 777 /build && cd /build && \
-          curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v1.11.0.tar.gz" | \
-          tar -xzvf - && \
-          mkdir -p "spdlog-1.11.0/build" && \
-          cd "spdlog-1.11.0/build" && \
-          cmake ..  && \
-          make -j8 && \
-          sudo cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
-          cd /build && \
-          mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
-          curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v1.0.0/libpiper_phonemize-amd64.tar.gz" | \
-          tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
-          sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+          # Pre-build piper before we start tests in order to have shared libraries in place
+          make sources/go-piper && \
+          GO_TAGS="tts" make -C sources/go-piper piper.o && \
+          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
+          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncnn)
+          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v3
+        with:
+          path: grpc
+          key: ${{ runner.os }}-grpc
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make -j12
+      - name: Install gRPC
+        run: |
+          cd grpc && cd cmake/build && sudo make -j12 install
      - name: Test
        run: |
-          ESPEAK_DATA="/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data" GO_TAGS="tts stablediffusion" make test
+          GO_TAGS="stablediffusion tts" make test

-  macOS-latest:
+  tests-apple:
    runs-on: macOS-latest
    strategy:
      matrix:
        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -108,6 +123,11 @@ jobs:
      # You can test your matrix by printing the current Go version
      - name: Display Go version
        run: go version
+      - name: Dependencies
+        run: |
+          brew install protobuf grpc
      - name: Test
        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
--- a/.gitignore
+++ b/.gitignore
@@ -1,15 +1,11 @@
 # go-llama build artifacts
-go-llama
-go-llama-stable
-/gpt4all
-go-stable-diffusion
-go-piper
-/go-bert
-go-ggllm
-/piper
+/sources/
 __pycache__/
 *.a
 get-sources
+prepare-sources
+/backend/cpp/llama/grpc-server
+/backend/cpp/llama/llama.cpp

 go-ggml-transformers
 go-gpt2
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "docs/themes/hugo-theme-relearn"]
+	path = docs/themes/hugo-theme-relearn
+	url = https://github.com/McShelby/hugo-theme-relearn.git
+[submodule "docs/themes/lotusdocs"]
+	path = docs/themes/lotusdocs
+	url = https://github.com/colinwilson/lotusdocs
--- a/188
+++ b/188
@@ -1,22 +1,28 @@
 ARG GO_VERSION=1.21-bullseye
+ARG IMAGE_TYPE=extras
+# extras or core

-FROM golang:$GO_VERSION as requirements
+
+FROM golang:$GO_VERSION as requirements-core

 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
 ARG CUDA_MINOR_VERSION=7
-ARG SPDLOG_VERSION="1.11.0"
-ARG PIPER_PHONEMIZE_VERSION='1.0.0'
 ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py,vall-e-x:/build/extra/grpc/vall-e-x/ttsvalle.py,vllm:/build/extra/grpc/vllm/backend_vllm.py"
-ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
-ARG GO_TAGS="stablediffusion tts"
+
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
+
+ARG GO_TAGS="stablediffusion tinydream tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates cmake curl patch pip
+    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
+
+
+COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
+RUN update-ca-certificates

 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
@@ -30,66 +36,49 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm -f cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
    ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}

-# Extras requirements
-COPY extra/requirements.txt /build/extra/requirements.txt
-ENV PATH="/root/.cargo/bin:${PATH}"
-RUN pip install --upgrade pip
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN if [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
-    fi
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
-    fi
-RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+# OpenBLAS requirements and stable diffusion
+RUN apt-get install -y \
+    libopenblas-dev \
+    libopencv-dev \ 
+    && apt-get clean

-# Vall-e-X
-RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
+# Set up OpenCV
+RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

 WORKDIR /build

-# OpenBLAS requirements
-RUN apt-get install -y libopenblas-dev
-
-# Stable Diffusion requirements
-RUN apt-get install -y libopencv-dev && \
-    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-
-
-# piper requirements
-# Use pre-compiled Piper phonemization library (includes onnxruntime)
-#RUN if echo "${GO_TAGS}" | grep -q "tts"; then \
 RUN test -n "$TARGETARCH" \
    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')

-RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz" | \
-    tar -xzvf - && \
-    mkdir -p "spdlog-${SPDLOG_VERSION}/build" && \
-    cd "spdlog-${SPDLOG_VERSION}/build" && \
-    cmake ..  && \
-    make -j8 && \
-    cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
-    cd /build && \
-    mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
-    curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH:-$(go env GOARCH)}${TARGETVARIANT}.tar.gz" | \
-    tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
-    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
-# \
-#    ; fi
+# Extras requirements
+FROM requirements-core as requirements-extras
+
+RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
+    apt-get update && \
+    apt-get install -y conda && apt-get clean
+
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN pip install --upgrade pip
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+RUN apt-get install -y espeak-ng espeak && apt-get clean

 ###################################
 ###################################

-FROM requirements as builder
+FROM requirements-${IMAGE_TYPE} as builder

 ARG GO_TAGS="stablediffusion tts"
-
+ARG GRPC_BACKENDS
+ARG BUILD_GRPC=true
+ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
@@ -97,31 +86,51 @@ ENV NVIDIA_VISIBLE_DEVICES=all

 WORKDIR /build

-COPY Makefile .
-RUN make get-sources
-COPY go.mod .
-RUN make prepare
 COPY . .
 COPY .git .
+RUN make prepare

-RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build
+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+RUN if [ "${BUILD_GRPC}" = "true" ]; then \
+    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+      -DgRPC_BUILD_TESTS=OFF \
+       ../.. && make -j12 install \
+    ; fi
+
+# Rebuild with the default backends
+RUN make build
+
+RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
+    mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ && \
+    touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
+    ; fi

 ###################################
 ###################################

-FROM requirements
+FROM requirements-${IMAGE_TYPE}

 ARG FFMPEG
 ARG BUILD_TYPE
 ARG TARGETARCH
+ARG IMAGE_TYPE=extras

 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

+ARG CUDA_MAJOR_VERSION=11
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV PIP_CACHE_PURGE=true
+
 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
-    apt-get install -y ffmpeg \
+    apt-get install -y ffmpeg && apt-get clean \
    ; fi

 WORKDIR /build
@@ -131,16 +140,65 @@ WORKDIR /build
 # see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
 # https://github.com/go-skynet/LocalAI/pull/434
 COPY . .
-RUN make prepare-sources
+
+COPY --from=builder /build/sources ./sources/
+COPY --from=builder /build/grpc ./grpc/
+
+RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf /build/grpc
+
+# Copy the binary
 COPY --from=builder /build/local-ai ./

-# Copy VALLE-X as it's not a real "lib"
-RUN cp -rfv /usr/lib/vall-e-x/* ./
+# Copy shared libraries for piper
+COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
+
+# do not let stablediffusion rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+
+## Duplicated from Makefile to avoid having a big layer that's hard to push
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/mamba \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C backend/python/coqui \
+    ; fi
+
+# Make sure the models directory exists
+RUN mkdir -p /build/models

-# To resolve exllama import error
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
-        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
-    fi
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
--- a/Entitlements.plist
+++ b/Entitlements.plist
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>com.apple.security.network.client</key>
+    <true/>
+    <key>com.apple.security.network.server</key>
+    <true/>
+</dict>
+</plist>
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Ettore Di Giacinto
+Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/Makefile
+++ b/Makefile
@@ -4,10 +4,12 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=1676dcd7a139b6cdfbaea5fd67f46dc25d9d8bcf
+GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

+CPPLLAMA_VERSION?=6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855
+
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
@@ -17,34 +19,36 @@ GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
-RWKV_VERSION?=c898cd0f62df8f2a7830e53d1d513bef4f6f792b
+RWKV_VERSION?=633c5a3485c403cb2520693dc0991a25dace9f0f

 # whisper.cpp version
-WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273
+WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346

 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

 # go-piper version
-PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7
-
-# go-bloomz version
-BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f
+PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07

 # stablediffusion version
-STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
+STABLEDIFFUSION_VERSION?=902db5f066fd137697e3b69d0fa10d4782bd2c2f

-# Go-ggllm
-GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b
+# tinydream version
+TINYDREAM_VERSION?=772a9c0d9aaf768290e63cca3c904fe69faf677a

 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export CMAKE_ARGS?=
+
 CGO_LDFLAGS?=
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=git

+TEST_DIR=/tmp/test
+
+RANDOM := $(shell bash -c 'echo $$RANDOM')
+
 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
 LD_FLAGS?=
@@ -52,7 +56,6 @@ override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION
 override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"

 OPTIONAL_TARGETS?=
-ESPEAK_DATA?=

 OS := $(shell uname -s)
 ARCH := $(shell uname -m)
@@ -62,33 +65,46 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

+# Host address on Docker's default bridge network; test-e2e uses it to reach the API published on port 5390.
+E2E_BRIDGE_IP?=172.17.0.1
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif

-ifeq ($(UNAME_S),Darwin)
+ifeq ($(OS),Darwin)
 	CGO_LDFLAGS += -lcblas -framework Accelerate
-ifneq ($(BUILD_TYPE),metal)
-    # explicit disable metal if on Darwin and metal is disabled
-	CMAKE_ARGS+=-DLLAMA_METAL=OFF
-endif
+	ifeq ($(OSX_SIGNING_IDENTITY),)
+		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
+	endif
+
+	# On macOS, default to the Metal backend when BUILD_TYPE is left blank.
+	ifeq ($(BUILD_TYPE),)
+		BUILD_TYPE=metal
+	# Any other explicit BUILD_TYPE turns Metal off by appending -DLLAMA_METAL=OFF to CMAKE_ARGS.
+	else ifneq ($(BUILD_TYPE),metal)
+		CMAKE_ARGS+=-DLLAMA_METAL=OFF
+	endif
 endif

 ifeq ($(BUILD_TYPE),openblas)
 	CGO_LDFLAGS+=-lopenblas
+	export WHISPER_OPENBLAS=1
 endif

 ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
+	export WHISPER_CUBLAS=1
 endif

 ifeq ($(BUILD_TYPE),hipblas)
 	ROCM_HOME ?= /opt/rocm
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
-	# Llama-stable has no hipblas support, so override it here.
+	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
+	export WHISPER_HIPBLAS=1
 	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
@@ -98,10 +114,12 @@ endif
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	export LLAMA_METAL=1
+	export WHISPER_METAL=1
 endif

 ifeq ($(BUILD_TYPE),clblas)
 	CGO_LDFLAGS+=-lOpenCL -lclblast
+	export WHISPER_CLBLAST=1
 endif

 # glibc-static or glibc-devel-static required
@@ -114,149 +132,157 @@ ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
 	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
 endif

+ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
+#	OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
+	OPTIONAL_GRPC+=backend-assets/grpc/tinydream
+endif
+
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
+	PIPER_CGO_CXXFLAGS+=-I$(CURDIR)/sources/go-piper/piper/src/cpp -I$(CURDIR)/sources/go-piper/piper/build/fi/include -I$(CURDIR)/sources/go-piper/piper/build/pi/include -I$(CURDIR)/sources/go-piper/piper/build/si/include
+	PIPER_CGO_LDFLAGS+=-L$(CURDIR)/sources/go-piper/piper/build/fi/lib -L$(CURDIR)/sources/go-piper/piper/build/pi/lib -L$(CURDIR)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-ggml backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS)
+
+# If empty, then we build all
+ifeq ($(GRPC_BACKENDS),)
+	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
+endif
+
+ifeq ($(BUILD_API_ONLY),true)
+	GRPC_BACKENDS=
+endif

 .PHONY: all test build vendor

 all: help

 ## GPT4ALL
-gpt4all:
-	git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
-	cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
-
-## go-ggllm
-go-ggllm:
-	git clone --recurse-submodules https://github.com/mudler/go-ggllm.cpp go-ggllm
-	cd go-ggllm && git checkout -b build $(GOGGLLM_VERSION) && git submodule update --init --recursive --depth 1
-
-go-ggllm/libggllm.a: go-ggllm
-	$(MAKE) -C go-ggllm BUILD_TYPE=$(BUILD_TYPE) libggllm.a
+sources/gpt4all:
+	git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
+	cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1

 ## go-piper
-go-piper:
-	git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
-	cd go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-piper:
+	git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
+	cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1

 ## BERT embeddings
-go-bert:
-	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp go-bert
-	cd go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-bert:
+	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
+	cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1

 ## stable diffusion
-go-stable-diffusion:
-	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion go-stable-diffusion
-	cd go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-stable-diffusion:
+	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
+	cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1

-go-stable-diffusion/libstablediffusion.a:
-	$(MAKE) -C go-stable-diffusion libstablediffusion.a
+sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
+	$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
+
+## tiny-dream
+sources/go-tiny-dream:
+	git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
+	cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
+
+sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
+	$(MAKE) -C sources/go-tiny-dream libtinydream.a

 ## RWKV
-go-rwkv:
-	git clone --recurse-submodules $(RWKV_REPO) go-rwkv
-	cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-rwkv:
+	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
+	cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1

-go-rwkv/librwkv.a: go-rwkv
-	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
+sources/go-rwkv/librwkv.a: sources/go-rwkv
+	cd sources/go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

-## bloomz
-bloomz:
-	git clone --recurse-submodules https://github.com/go-skynet/bloomz.cpp bloomz
-	cd bloomz && git checkout -b build $(BLOOMZ_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-bert/libgobert.a: sources/go-bert
+	$(MAKE) -C sources/go-bert libgobert.a

-bloomz/libbloomz.a: bloomz
-	cd bloomz && make libbloomz.a
-
-go-bert/libgobert.a: go-bert
-	$(MAKE) -C go-bert libgobert.a
-
-backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
+backend-assets/gpt4all: sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	mkdir -p backend-assets/gpt4all
-	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
-	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
-	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

-backend-assets/espeak-ng-data:
+backend-assets/espeak-ng-data: sources/go-piper
 	mkdir -p backend-assets/espeak-ng-data
-ifdef ESPEAK_DATA
-	@cp -rf $(ESPEAK_DATA)/. backend-assets/espeak-ng-data
-else
-	@echo "ESPEAK_DATA not set, skipping tts. Note that this will break the tts functionality."
-	@touch backend-assets/espeak-ng-data/keep
-endif
+	$(MAKE) -C sources/go-piper piper.o
+	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data

-gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
-	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a
+sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

 ## CEREBRAS GPT
-go-ggml-transformers:
-	git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp go-ggml-transformers
-	cd go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-ggml-transformers:
+	git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp sources/go-ggml-transformers
+	cd sources/go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1

-go-ggml-transformers/libtransformers.a: go-ggml-transformers
-	$(MAKE) -C go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a
+sources/go-ggml-transformers/libtransformers.a: sources/go-ggml-transformers
+	$(MAKE) -C sources/go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a

-whisper.cpp:
-	git clone https://github.com/ggerganov/whisper.cpp.git
-	cd whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
+sources/whisper.cpp:
+	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
+	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1

-whisper.cpp/libwhisper.a: whisper.cpp
-	cd whisper.cpp && make libwhisper.a
+sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
+	$(MAKE) -C sources/whisper.cpp libwhisper.a

-go-llama:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
-	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-llama:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama
+	cd sources/go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

-go-llama-stable:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
-	cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-llama-ggml:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
+	cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

-go-llama/libbinding.a: go-llama
-	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+sources/go-llama/libbinding.a: sources/go-llama
+	$(MAKE) -C sources/go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

-go-llama-stable/libbinding.a: go-llama-stable
-	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
+sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
+	$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

-go-piper/libpiper_binding.a:
-	$(MAKE) -C go-piper libpiper_binding.a example/main
+sources/go-piper/libpiper_binding.a: sources/go-piper
+	$(MAKE) -C sources/go-piper libpiper_binding.a example/main

-get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+backend/cpp/llama/llama.cpp:
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
+
+get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/go-ggml-transformers sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
 	touch $@

 replace:
-	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
-	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
-	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
-	$(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz
-	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
-	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
-	$(GOCMD) mod edit -replace github.com/mudler/go-ggllm.cpp=$(shell pwd)/go-ggllm
+	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(CURDIR)/sources/go-ggml-transformers
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
+	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
+	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
+	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
+	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
+	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
+	touch $@

 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C go-llama clean
-	$(MAKE) -C go-llama-stable clean
-	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
-	$(MAKE) -C go-ggml-transformers clean
-	$(MAKE) -C go-rwkv clean
-	$(MAKE) -C whisper.cpp clean
-	$(MAKE) -C go-stable-diffusion clean
-	$(MAKE) -C go-bert clean
-	$(MAKE) -C bloomz clean
-	$(MAKE) -C go-piper clean
-	$(MAKE) -C go-ggllm clean
+	$(MAKE) -C sources/go-llama clean
+	$(MAKE) -C sources/go-llama-ggml clean
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
+	$(MAKE) -C sources/go-ggml-transformers clean
+	$(MAKE) -C sources/go-rwkv clean
+	$(MAKE) -C sources/whisper.cpp clean
+	$(MAKE) -C sources/go-stable-diffusion clean
+	$(MAKE) -C sources/go-bert clean
+	$(MAKE) -C sources/go-piper clean
+	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build

 prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -265,36 +291,29 @@ prepare: prepare-sources $(OPTIONAL_TARGETS)
 clean: ## Remove build related file
 	$(GOCMD) clean -cache
 	rm -f prepare
-	rm -rf ./go-llama
-	rm -rf ./gpt4all
-	rm -rf ./go-llama-stable
-	rm -rf ./go-gpt2
-	rm -rf ./go-stable-diffusion
-	rm -rf ./go-ggml-transformers
-	rm -rf ./backend-assets
-	rm -rf ./go-rwkv
-	rm -rf ./go-bert
-	rm -rf ./bloomz
-	rm -rf ./whisper.cpp
-	rm -rf ./go-piper
-	rm -rf ./go-ggllm
+	rm -rf ./sources
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
+	rm -rf backend-assets
+	$(MAKE) -C backend/cpp/grpc clean
+	$(MAKE) -C backend/cpp/llama clean

 ## Build:

-build: grpcs prepare ## Build the project
+build: backend-assets grpcs prepare ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
 	$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
-
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

 dist: build
 	mkdir -p release
 	cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)

+osx-signed: build
+	codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"
+
 ## Run
 run: prepare ## run local-ai
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
@@ -302,12 +321,12 @@ run: prepare ## run local-ai
 test-models/testmodel:
 	mkdir test-models
 	mkdir test-dir
-	wget https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
-	wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
-	wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O test-models/bert
-	wget https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
-	wget https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
-	wget https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
+	wget -q https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
+	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
+	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
+	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
+	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
+	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models

 prepare-test: grpcs
@@ -318,14 +337,34 @@ test: prepare test-models/testmodel grpcs
 	@echo 'Running tests'
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts 5 --fail-fast -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

+prepare-e2e:
+	mkdir -p $(TEST_DIR)
+	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
+	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
+	docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
+
+run-e2e-image:
+	ls -liah $(abspath ./tests/e2e-fixtures)
+	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
+
+test-e2e:
+	@echo 'Running e2e tests'
+	BUILD_TYPE=$(BUILD_TYPE) \
+	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
+
+teardown-e2e:
+	rm -rf $(TEST_DIR) || true
+	ids=$$(docker ps -q --filter ancestor=localai-tests); [ -z "$$ids" ] || docker stop $$ids
+
 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
@@ -365,102 +404,167 @@ help: ## Show this help.
 protogen: protogen-go protogen-python

 protogen-go:
-	protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative \
-    pkg/grpc/proto/backend.proto
+	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
+    backend/backend.proto

 protogen-python:
-	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
-	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/autogptq/ --grpc_python_out=extra/grpc/autogptq/ pkg/grpc/proto/backend.proto
-	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
-	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
-	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
-	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
-	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/coqui/ --grpc_python_out=backend/python/coqui/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/mamba/ --grpc_python_out=backend/python/mamba/ backend/backend.proto
+	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto

 ## GRPC
+# Note: it is duplicated in the Dockerfile
+prepare-extra-conda-environments:
+	$(MAKE) -C backend/python/autogptq
+	$(MAKE) -C backend/python/bark
+	$(MAKE) -C backend/python/coqui
+	$(MAKE) -C backend/python/diffusers
+	$(MAKE) -C backend/python/vllm
+	$(MAKE) -C backend/python/mamba
+	$(MAKE) -C backend/python/sentencetransformers
+	$(MAKE) -C backend/python/transformers
+	$(MAKE) -C backend/python/transformers-musicgen
+	$(MAKE) -C backend/python/vall-e-x
+	$(MAKE) -C backend/python/exllama
+	$(MAKE) -C backend/python/petals
+	$(MAKE) -C backend/python/exllama2
+
+prepare-test-extra:
+	$(MAKE) -C backend/python/transformers
+	$(MAKE) -C backend/python/diffusers
+
+test-extra: prepare-test-extra
+	$(MAKE) -C backend/python/transformers test
+	$(MAKE) -C backend/python/diffusers test
+
+backend-assets:
+	mkdir -p backend-assets
+ifeq ($(BUILD_API_ONLY),true)
+	touch backend-assets/keep
+endif

 backend-assets/grpc:
 	mkdir -p backend-assets/grpc

-backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggllm LIBRARY_PATH=$(shell pwd)/go-ggllm \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/
-
-backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
-# TODO: every binary should have its own folder instead, so can have different metal implementations
+backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
+# TODO: every binary should have its own folder instead, so that each can have a different metal implementation
 ifeq ($(BUILD_TYPE),metal)
-	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
+	cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
 endif

-backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/
+## BACKEND CPP LLAMA START
+# Sets the variables in case it has to build the gRPC locally.
+INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
+INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
+ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
+                 -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
+                 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
+                 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
+                 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include

-backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
+backend/cpp/llama/grpc-server: # builds the llama.cpp gRPC server; the bundled gRPC is built first when BUILD_GRPC_FOR_BACKEND_LLAMA is defined
+ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
+	$(MAKE) -C backend/cpp/grpc build
+	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/protoc && \
+	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
+	export PATH="${INSTALLED_PACKAGES}/bin:${PATH}" && \
+	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+else
+	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+endif
+## BACKEND CPP LLAMA END

-backend-assets/grpc/dolly: backend-assets/grpc go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./cmd/grpc/dolly/
+##
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
+	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+# TODO: every binary should have its own folder instead, so can have different metal implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
+endif

-backend-assets/grpc/gpt2: backend-assets/grpc go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./cmd/grpc/gpt2/
+backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/

-backend-assets/grpc/gptj: backend-assets/grpc go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./cmd/grpc/gptj/
+backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/

-backend-assets/grpc/gptneox: backend-assets/grpc go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./cmd/grpc/gptneox/
+backend-assets/grpc/dolly: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./backend/go/llm/dolly/

-backend-assets/grpc/mpt: backend-assets/grpc go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./cmd/grpc/mpt/
+backend-assets/grpc/gpt2: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./backend/go/llm/gpt2/

-backend-assets/grpc/replit: backend-assets/grpc go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./cmd/grpc/replit/
+backend-assets/grpc/gptj: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./backend/go/llm/gptj/

-backend-assets/grpc/falcon-ggml: backend-assets/grpc go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./cmd/grpc/falcon-ggml/
+backend-assets/grpc/gptneox: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./backend/go/llm/gptneox/

-backend-assets/grpc/starcoder: backend-assets/grpc go-ggml-transformers/libtransformers.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./cmd/grpc/starcoder/
+backend-assets/grpc/mpt: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./backend/go/llm/mpt/

-backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/
+backend-assets/grpc/replit: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./backend/go/llm/replit/

-backend-assets/grpc/bloomz: backend-assets/grpc bloomz/libbloomz.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/bloomz LIBRARY_PATH=$(shell pwd)/bloomz \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bloomz ./cmd/grpc/bloomz/
+backend-assets/grpc/falcon-ggml: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./backend/go/llm/falcon-ggml/

-backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/
+backend-assets/grpc/starcoder: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-ggml-transformers LIBRARY_PATH=$(CURDIR)/sources/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./backend/go/llm/starcoder/
+
+backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
+
+backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/

 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/

-backend-assets/grpc/stablediffusion: backend-assets/grpc go-stable-diffusion/libstablediffusion.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/
+backend-assets/grpc/stablediffusion: backend-assets/grpc
+	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
+		$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
+		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-stable-diffusion/ LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
+		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion; \
+	fi

-backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/
+backend-assets/grpc/tinydream: backend-assets/grpc sources/go-tiny-dream/libtinydream.a # LIBRARY_PATH must point at the directory holding the libtinydream.a prerequisite
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-tiny-dream \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream

-backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/
+backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
+	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
+
+backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/

 grpcs: prepare $(GRPC_BACKENDS)
--- a/README.md
+++ b/README.md
@@ -20,17 +20,15 @@
 </a>
 </p>

-> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
-> 
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/)
+[<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker">](https://hub.docker.com/r/localai/localai)
+[<img src="https://img.shields.io/badge/quay.io-images-important.svg?">](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)

+> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
+>
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format, pytorch and more. Does not require GPU.
-
-<p align="center"><b>Follow LocalAI </b></p>
-
 <p align="center">
 <a href="https://twitter.com/LocalAI_API" target="blank">
 <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
@@ -39,47 +37,33 @@
 <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
 </a>

-<p align="center"><b>Connect with the Creator </b></p>
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that’s compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs and generate images, audio, and more, locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU.

-<p align="center">
-<a href="https://twitter.com/mudler_it" target="blank">
-<img src="https://img.shields.io/twitter/follow/mudler_it?label=Follow: mudler_it&style=social" alt="Follow mudler_it"/>
-</a>
-<a href='https://github.com/mudler'>
-<img alt="Follow on Github" src="https://img.shields.io/badge/Follow-mudler-black?logo=github&link=https%3A%2F%2Fgithub.com%2Fmudler">
-</a>
-</p>
+## 🔥🔥 Hot topics / Roadmap

-<p align="center"><b>Share LocalAI Repository</b></p>
+[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

-<p align="center">
+- Mamba support: https://github.com/mudler/LocalAI/pull/1589
+- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
+- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
+- Inline templates: https://github.com/mudler/LocalAI/pull/1452
+- Mixtral: https://github.com/mudler/LocalAI/pull/1449
+- Img2vid: https://github.com/mudler/LocalAI/pull/1442
+- Musicgen: https://github.com/mudler/LocalAI/pull/1387

-<a href="https://twitter.com/intent/tweet?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI&hashtags=LocalAI,AI" target="blank">
-<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a> 
-<a href="https://t.me/share/url?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Telegram&logo=Telegram&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Telegram"/></a>
-<a href="https://api.whatsapp.com/send?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%20https://github.com/go-skynet/LocalAI"><img src="https://img.shields.io/twitter/url?label=whatsapp&logo=whatsapp&style=social&url=https://github.com/go-skynet/LocalAI" /></a> <a href="https://www.reddit.com/submit?url=https://github.com/go-skynet/LocalAI&title=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.
-" target="blank">
-<img src="https://img.shields.io/twitter/url?label=Reddit&logo=Reddit&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Reddit"/>
-</a> <a href="mailto:?subject=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%3A%0Ahttps://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Gmail&logo=Gmail&style=social&url=https://github.com/go-skynet/LocalAI"/></a> <a href="https://www.buymeacoffee.com/mudler" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="23" width="100" style="border-radius:1px"></a>
+Hot topics (looking for contributors):
+- Backends v2: https://github.com/mudler/LocalAI/issues/1126
+- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373

-</p>
+If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

-<hr>
+## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)

-In a nutshell:
+For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide. For those in a hurry, here's a straightforward one-liner to launch a LocalAI instance with [phi-2](https://huggingface.co/microsoft/phi-2) using `docker`:

- Local, OpenAI drop-in alternative REST API. You own your data.
- NO GPU required. NO Internet access is required either
-  - Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html). 
- Supports multiple models
- 🏃 Once loaded the first time, it keep models loaded in memory for faster inference
- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance.
-
-LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! 
-
-Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
-
-## 🔥🔥 [Hot topics / Roadmap](https://localai.io/#-hot-topics--roadmap)
+```
+docker run -ti -p 8080:8080 localai/localai:v2.5.1-ffmpeg-core phi-2
+```

 ## 🚀 [Features](https://localai.io/features/)

@@ -91,7 +75,45 @@ Note that this started just as a [fun weekend project](https://localai.io/#backs
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
 - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
+- 🆕 [Vision API](https://localai.io/features/gpt-vision/)

+## 💻 Usage
+
+Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
+
+### 🔗 Community and integrations
+
+Build and deploy custom containers:
+- https://github.com/sozercan/aikit
+
+WebUIs:
+- https://github.com/Jirubizu/localai-admin
+- https://github.com/go-skynet/LocalAI-frontend
+
+Model galleries:
+- https://github.com/go-skynet/model-gallery
+
+Auto Docker / Model setup:
+- https://io.midori-ai.xyz/howtos/easy-localai-installer/
+- https://io.midori-ai.xyz/howtos/easy-model-installer/
+
+Other:
+- Helm chart https://github.com/go-skynet/helm-charts
+- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
+- Local Smart assistant https://github.com/mudler/LocalAGI
+- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation
+- Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
+- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
+- Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
+- Examples: https://github.com/mudler/LocalAI/tree/master/examples/
+
+### 🔗 Resources
+
+- 🆕 New! [LLM finetuning guide](https://localai.io/advanced/fine-tuning/)
+- [How to build locally](https://localai.io/basics/build/index.html)
+- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
+- [Projects integrating LocalAI](https://localai.io/integrations/)
+- [How tos section](https://io.midori-ai.xyz/howtos/) (curated by our community)

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

@@ -100,21 +122,6 @@ Note that this started just as a [fun weekend project](https://localai.io/#backs
 - [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
 - [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)

-## 💻 Usage
-
-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
-
-### 💡 Example: Use Luna-AI Llama model
-
-See the [documentation](https://localai.io/basics/getting_started)
-
-### 🔗 Resources
-
- [How to build locally](https://localai.io/basics/build/index.html)
- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
- [Projects integrating LocalAI](https://localai.io/integrations/)
- [How tos section](https://localai.io/howtos/) (curated by our community)
-  
 ## Citation

 If you utilize this repository, data in a downstream project, please consider citing it with:
@@ -137,12 +144,12 @@ Support the project by becoming [a backer or sponsor](https://github.com/sponsor

 A huge thank you to our generous sponsors who support this project:

-| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) | 
+| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) |
 |:-----------------------------------------------:|
-|  [Spectro Cloud](https://www.spectrocloud.com/)  |  
+|  [Spectro Cloud](https://www.spectrocloud.com/)  |
 |  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs!  |

-And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project. 
+And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.

 - [Sponsor list](https://github.com/sponsors/mudler)
 - JDAM00 (donating HW for the CI)
--- a/api/api.go
+++ b/api/api.go
@@ -1,8 +1,10 @@
 package api

 import (
+	"encoding/json"
 	"errors"
 	"fmt"
+	"os"
 	"strings"

 	config "github.com/go-skynet/LocalAI/api/config"
@@ -11,7 +13,10 @@ import (
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/internal"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/assets"
+	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/startup"

 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/cors"
@@ -32,6 +37,8 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
 	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
 	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())

+	startup.PreloadModelsConfigurations(options.ModelLibraryURL, options.Loader.ModelPath, options.ModelsURL...)
+
 	cl := config.NewConfigLoader()
 	if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
 		log.Error().Msgf("error loading config files: %s", err.Error())
@@ -43,6 +50,22 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
 		}
 	}

+	if err := cl.Preload(options.Loader.ModelPath); err != nil {
+		log.Error().Msgf("error downloading models: %s", err.Error())
+	}
+
+	if options.PreloadJSONModels != "" {
+		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	if options.PreloadModelsFromPath != "" {
+		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
 	if options.Debug {
 		for _, v := range cl.ListConfigs() {
 			cfg, _ := cl.GetConfig(v)
@@ -59,18 +82,6 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
 		}
 	}

-	if options.PreloadJSONModels != "" {
-		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
-			return nil, nil, err
-		}
-	}
-
-	if options.PreloadModelsFromPath != "" {
-		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
-			return nil, nil, err
-		}
-	}
-
 	// turn off any process that was started by GRPC if the context is canceled
 	go func() {
 		<-options.Context.Done()
@@ -78,6 +89,22 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
 		options.Loader.StopAllGRPC()
 	}()

+	if options.WatchDog {
+		wd := model.NewWatchDog(
+			options.Loader,
+			options.WatchDogBusyTimeout,
+			options.WatchDogIdleTimeout,
+			options.WatchDogBusy,
+			options.WatchDogIdle)
+		options.Loader.SetWatchDog(wd)
+		go wd.Run()
+		go func() {
+			<-options.Context.Done()
+			log.Debug().Msgf("Context canceled, shutting down")
+			wd.Shutdown()
+		}()
+	}
+
 	return options, cl, nil
 }

@@ -120,31 +147,52 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 	// Default middleware config
 	app.Use(recover.New())
+	if options.Metrics != nil {
+		app.Use(metrics.APIMiddleware(options.Metrics))
+	}

 	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
 	auth := func(c *fiber.Ctx) error {
-		if len(options.ApiKeys) > 0 {
-			authHeader := c.Get("Authorization")
-			if authHeader == "" {
-				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
-			}
-			authHeaderParts := strings.Split(authHeader, " ")
-			if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
-				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
+		if len(options.ApiKeys) == 0 {
+			return c.Next()
+		}
+
+		// Validate against a per-request copy: appending the file keys to the shared
+		// options.ApiKeys grew it on every request and raced between concurrent requests.
+		apiKeys := append([]string{}, options.ApiKeys...)
+
+		// Check for api_keys.json file
+		fileContent, err := os.ReadFile("api_keys.json")
+		if err == nil {
+			// Parse JSON content from the file
+			var fileKeys []string
+			err := json.Unmarshal(fileContent, &fileKeys)
+			if err != nil {
+				return c.Status(fiber.StatusInternalServerError).JSON(fiber.Map{"message": "Error parsing api_keys.json"})
 			}

-			apiKey := authHeaderParts[1]
-			validApiKey := false
-			for _, key := range options.ApiKeys {
-				if apiKey == key {
-					validApiKey = true
-				}
-			}
-			if !validApiKey {
-				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
+			// Keys from the file are accepted in addition to the configured ones
+			apiKeys = append(apiKeys, fileKeys...)
+		}
+
+		authHeader := c.Get("Authorization")
+		if authHeader == "" {
+			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
+		}
+		authHeaderParts := strings.Split(authHeader, " ")
+		if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
+			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
+		}
+
+		apiKey := authHeaderParts[1]
+		for _, key := range apiKeys {
+			if apiKey == key {
+				return c.Next()
 			}
 		}
-		return c.Next()
+
+		return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
+
 	}

 	if options.CORS {
@@ -168,6 +216,11 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		}{Version: internal.PrintableVersion()})
 	})

+	// Make sure directories exists
+	os.MkdirAll(options.ImageDir, 0755)
+	os.MkdirAll(options.AudioDir, 0755)
+	os.MkdirAll(options.Loader.ModelPath, 0755)
+
 	modelGalleryService := localai.CreateModelGalleryService(options.Galleries, options.Loader.ModelPath, galleryService)
 	app.Post("/models/apply", auth, modelGalleryService.ApplyModelGalleryEndpoint())
 	app.Get("/models/available", auth, modelGalleryService.ListModelFromGalleryEndpoint())
@@ -229,5 +282,7 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
 	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))

+	app.Get("/metrics", metrics.MetricsHandler())
+
 	return app, nil
 }
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -15,9 +15,10 @@ import (

 	. "github.com/go-skynet/LocalAI/api"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/metrics"
+	"github.com/go-skynet/LocalAI/pkg/downloader"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -60,7 +61,7 @@ func getModelStatus(url string) (response map[string]interface{}) {
 }

 func getModels(url string) (response []gallery.GalleryModel) {
-	utils.GetURI(url, func(url string, i []byte) error {
+	downloader.GetURI(url, func(url string, i []byte) error {
 		// Unmarshal YAML data into a struct
 		return json.Unmarshal(i, &response)
 	})
@@ -162,8 +163,12 @@ var _ = Describe("API test", func() {
 				},
 			}

+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
+					options.WithMetrics(metricsService),
 					options.WithContext(c),
 					options.WithGalleries(galleries),
 					options.WithModelLoader(modelLoader), options.WithBackendAssets(backendAssets), options.WithBackendAssetsOutput(tmpdir))...)
@@ -289,14 +294,14 @@ var _ = Describe("API test", func() {
 				Expect(content["backend"]).To(Equal("bert-embeddings"))
 			})

-			It("runs openllama", Label("llama"), func() {
+			It("runs openllama(llama-ggml backend)", Label("llama"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 					Name:      "openllama_3b",
-					Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
+					Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -357,9 +362,10 @@ var _ = Describe("API test", func() {
 				Expect(res["location"]).To(Equal("San Francisco, California, United States"), fmt.Sprint(res))
 				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+
 			})

-			It("runs openllama gguf", Label("llama-gguf"), func() {
+			It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
@@ -452,7 +458,7 @@ var _ = Describe("API test", func() {
 				Eventually(func() bool {
 					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
 					return response["processed"].(bool)
-				}, "360s", "10s").Should(Equal(true))
+				}, "960s", "10s").Should(Equal(true))

 				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-j", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "How are you?"}}})
 				Expect(err).ToNot(HaveOccurred())
@@ -479,9 +485,13 @@ var _ = Describe("API test", func() {
 				},
 			}

+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithContext(c),
+					options.WithMetrics(metricsService),
 					options.WithAudioDir(tmpdir),
 					options.WithImageDir(tmpdir),
 					options.WithGalleries(galleries),
@@ -583,12 +593,15 @@ var _ = Describe("API test", func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 			c, cancel = context.WithCancel(context.Background())

-			var err error
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
 					options.WithContext(c),
 					options.WithModelLoader(modelLoader),
+					options.WithMetrics(metricsService),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
 			go app.Listen("127.0.0.1:9090")
@@ -675,7 +688,7 @@ var _ = Describe("API test", func() {
 					Input: []string{"sun", "cat"},
 				},
 			)
-			Expect(err).ToNot(HaveOccurred())
+			Expect(err).ToNot(HaveOccurred(), err)
 			Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
 			Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))

@@ -692,7 +705,7 @@ var _ = Describe("API test", func() {
 		})

 		Context("External gRPC calls", func() {
-			It("calculate embeddings with huggingface", func() {
+			It("calculate embeddings with sentencetransformers", func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
@@ -792,10 +805,13 @@ var _ = Describe("API test", func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 			c, cancel = context.WithCancel(context.Background())

-			var err error
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithContext(c),
+					options.WithMetrics(metricsService),
 					options.WithModelLoader(modelLoader),
 					options.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
 			)
--- a/api/backend/embeddings.go
+++ b/api/backend/embeddings.go
@@ -41,7 +41,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.

 	var fn func() ([]float32, error)
 	switch model := inferenceModel.(type) {
-	case *grpc.Client:
+	case grpc.Backend:
 		fn = func() ([]float32, error) {
 			predictOptions := gRPCPredictOpts(c, loader.ModelPath)
 			if len(tokens) > 0 {
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -16,16 +16,18 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 		model.WithContext(o.Context),
 		model.WithModel(c.Model),
 		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          c.Diffusers.CUDA,
+			CUDA:          c.CUDA || c.Diffusers.CUDA,
 			SchedulerType: c.Diffusers.SchedulerType,
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,
 			LoraAdapter:   c.LoraAdapter,
+			LoraScale:     c.LoraScale,
 			LoraBase:      c.LoraBase,
 			IMG2IMG:       c.Diffusers.IMG2IMG,
 			CLIPModel:     c.Diffusers.ClipModel,
 			CLIPSubfolder: c.Diffusers.ClipSubFolder,
 			CLIPSkip:      int32(c.Diffusers.ClipSkip),
+			ControlNet:    c.Diffusers.ControlNet,
 		}),
 	})

--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -26,12 +26,12 @@ type TokenUsage struct {
 	Completion int
 }

-func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model

 	grpcOpts := gRPCModelOpts(c)

-	var inferenceModel *grpc.Client
+	var inferenceModel grpc.Backend
 	var err error

 	opts := modelOpts(c, o, []model.Option{
@@ -72,6 +72,7 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c
 	fn := func() (LLMResponse, error) {
 		opts := gRPCPredictOpts(c, loader.ModelPath)
 		opts.Prompt = s
+		opts.Images = images

 		tokenUsage := TokenUsage{}

@@ -158,6 +159,9 @@ func Finetune(config config.Config, input, prediction string) string {
 	for _, c := range config.TrimSpace {
 		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
 	}
-	return prediction

+	for _, c := range config.TrimSuffix {
+		prediction = strings.TrimSpace(strings.TrimSuffix(prediction, c))
+	}
+	return prediction
 }
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -16,6 +16,10 @@ func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.
 		opts = append(opts, model.WithSingleActiveBackend())
 	}

+	if o.ParallelBackendRequests {
+		opts = append(opts, model.EnableParallelRequests)
+	}
+
 	if c.GRPC.Attempts != 0 {
 		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
 	}
@@ -38,29 +42,38 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 	}

 	return &pb.ModelOptions{
-		ContextSize:   int32(c.ContextSize),
-		Seed:          int32(c.Seed),
-		NBatch:        int32(b),
-		NoMulMatQ:     c.NoMulMatQ,
-		DraftModel:    c.DraftModel,
-		AudioPath:     c.VallE.AudioPath,
-		Quantization:  c.Quantization,
-		LoraAdapter:   c.LoraAdapter,
-		LoraBase:      c.LoraBase,
-		NGQA:          c.NGQA,
-		RMSNormEps:    c.RMSNormEps,
-		F16Memory:     c.F16,
-		MLock:         c.MMlock,
-		RopeFreqBase:  c.RopeFreqBase,
-		RopeFreqScale: c.RopeFreqScale,
-		NUMA:          c.NUMA,
-		Embeddings:    c.Embeddings,
-		LowVRAM:       c.LowVRAM,
-		NGPULayers:    int32(c.NGPULayers),
-		MMap:          c.MMap,
-		MainGPU:       c.MainGPU,
-		Threads:       int32(c.Threads),
-		TensorSplit:   c.TensorSplit,
+		ContextSize:    int32(c.ContextSize),
+		Seed:           int32(c.Seed),
+		NBatch:         int32(b),
+		NoMulMatQ:      c.NoMulMatQ,
+		CUDA:           c.CUDA, // diffusers, transformers
+		DraftModel:     c.DraftModel,
+		AudioPath:      c.VallE.AudioPath,
+		Quantization:   c.Quantization,
+		MMProj:         c.MMProj,
+		YarnExtFactor:  c.YarnExtFactor,
+		YarnAttnFactor: c.YarnAttnFactor,
+		YarnBetaFast:   c.YarnBetaFast,
+		YarnBetaSlow:   c.YarnBetaSlow,
+		LoraAdapter:    c.LoraAdapter,
+		LoraBase:       c.LoraBase,
+		LoraScale:      c.LoraScale,
+		NGQA:           c.NGQA,
+		RMSNormEps:     c.RMSNormEps,
+		F16Memory:      c.F16,
+		MLock:          c.MMlock,
+		RopeFreqBase:   c.RopeFreqBase,
+		RopeScaling:    c.RopeScaling,
+		Type:           c.ModelType,
+		RopeFreqScale:  c.RopeFreqScale,
+		NUMA:           c.NUMA,
+		Embeddings:     c.Embeddings,
+		LowVRAM:        c.LowVRAM,
+		NGPULayers:     int32(c.NGPULayers),
+		MMap:           c.MMap,
+		MainGPU:        c.MainGPU,
+		Threads:        int32(c.Threads),
+		TensorSplit:    c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
 		Device:           c.AutoGPTQ.Device,
--- a/api/backend/tts.go
+++ b/api/backend/tts.go
@@ -59,9 +59,13 @@ func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *opt
 	// If the model file is not empty, we pass it joined with the model path
 	modelPath := ""
 	if modelFile != "" {
-		modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
-		if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
-			return "", nil, err
+		if bb != model.TransformersMusicGen {
+			modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
+			if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
+				return "", nil, err
+			}
+		} else {
+			modelPath = modelFile
 		}
 	}

--- a/api/config/config.go
+++ b/api/config/config.go
@@ -1,6 +1,7 @@
 package api_config

 import (
+	"errors"
 	"fmt"
 	"io/fs"
 	"os"
@@ -8,6 +9,9 @@ import (
 	"strings"
 	"sync"

+	"github.com/go-skynet/LocalAI/pkg/downloader"
+	"github.com/go-skynet/LocalAI/pkg/utils"
+	"github.com/rs/zerolog/log"
 	"gopkg.in/yaml.v3"
 )

@@ -38,14 +42,28 @@ type Config struct {

 	// Diffusers
 	Diffusers Diffusers `yaml:"diffusers"`
-
-	Step int `yaml:"step"`
+	Step      int       `yaml:"step"`

 	// GRPC Options
 	GRPC GRPC `yaml:"grpc"`

 	// Vall-e-x
 	VallE VallE `yaml:"vall-e"`
+
+	// CUDA
+	// Explicitly enable CUDA or not (some backends might need it)
+	CUDA bool `yaml:"cuda"`
+
+	DownloadFiles []File `yaml:"download_files"`
+
+	Description string `yaml:"description"`
+	Usage       string `yaml:"usage"`
+}
+
+type File struct { // one entry of a model configuration's download_files list
+	Filename string `yaml:"filename" json:"filename"` // destination name, joined onto the models path by Preload
+	SHA256   string `yaml:"sha256" json:"sha256"`     // expected checksum, passed to downloader.DownloadFile
+	URI      string `yaml:"uri" json:"uri"`           // source location, passed to downloader.DownloadFile
+}

 type VallE struct {
@@ -65,15 +83,16 @@ type GRPC struct {
 }

 type Diffusers struct {
+	CUDA             bool    `yaml:"cuda"` // ORed with the top-level Config.CUDA when image models are loaded
 	PipelineType     string  `yaml:"pipeline_type"`
 	SchedulerType    string  `yaml:"scheduler_type"`
-	CUDA             bool    `yaml:"cuda"`
 	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
 	CFGScale         float32 `yaml:"cfg_scale"`         // Classifier-Free Guidance Scale
 	IMG2IMG          bool    `yaml:"img2img"`           // Image to Image Diffuser
 	ClipSkip         int     `yaml:"clip_skip"`         // Skip every N frames
 	ClipModel        string  `yaml:"clip_model"`        // Clip model to use
 	ClipSubFolder    string  `yaml:"clip_subfolder"`    // Subfolder to use for clip model
+	ControlNet       string  `yaml:"control_net"`       // Forwarded as ModelOptions.ControlNet for image generation
 }

 type LLMConfig struct {
@@ -96,14 +115,26 @@ type LLMConfig struct {
 	StopWords       []string `yaml:"stopwords"`
 	Cutstrings      []string `yaml:"cutstrings"`
 	TrimSpace       []string `yaml:"trimspace"`
-	ContextSize     int      `yaml:"context_size"`
-	NUMA            bool     `yaml:"numa"`
-	LoraAdapter     string   `yaml:"lora_adapter"`
-	LoraBase        string   `yaml:"lora_base"`
-	NoMulMatQ       bool     `yaml:"no_mulmatq"`
-	DraftModel      string   `yaml:"draft_model"`
-	NDraft          int32    `yaml:"n_draft"`
-	Quantization    string   `yaml:"quantization"`
+	TrimSuffix      []string `yaml:"trimsuffix"`
+
+	ContextSize  int     `yaml:"context_size"`
+	NUMA         bool    `yaml:"numa"`
+	LoraAdapter  string  `yaml:"lora_adapter"`
+	LoraBase     string  `yaml:"lora_base"`
+	LoraScale    float32 `yaml:"lora_scale"`
+	NoMulMatQ    bool    `yaml:"no_mulmatq"`
+	DraftModel   string  `yaml:"draft_model"`
+	NDraft       int32   `yaml:"n_draft"`
+	Quantization string  `yaml:"quantization"`
+	MMProj       string  `yaml:"mmproj"`
+
+	RopeScaling string `yaml:"rope_scaling"`
+	ModelType   string `yaml:"type"`
+
+	YarnExtFactor  float32 `yaml:"yarn_ext_factor"`
+	YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
+	YarnBetaFast   float32 `yaml:"yarn_beta_fast"`
+	YarnBetaSlow   float32 `yaml:"yarn_beta_slow"`
 }

 type AutoGPTQ struct {
@@ -252,6 +283,67 @@ func (cm *ConfigLoader) ListConfigs() []string {
 	return res
 }

+// Preload makes the models referenced by the loaded configurations available locally.
+// For every configuration it fetches the files listed under download_files (handing
+// their expected SHA256 to the downloader) and, when the model itself looks like a URL,
+// downloads it into modelPath under the MD5 of that URL and rewrites the configuration
+// to use that local name. The first failing check or download is returned as-is.
+func (cm *ConfigLoader) Preload(modelPath string) error {
+	cm.Lock()
+	defer cm.Unlock()
+
+	// Progress callback handed to every download
+	progress := func(fileName, current, total string, percent float64) {
+		utils.DisplayDownloadFunction(fileName, current, total, percent)
+	}
+
+	log.Info().Msgf("Preloading models from %s", modelPath)
+
+	for name, cfg := range cm.configs {
+		// Additional files first: each name is verified against modelPath before fetching
+		for _, f := range cfg.DownloadFiles {
+			log.Debug().Msgf("Checking %q exists and matches SHA", f.Filename)
+			if err := utils.VerifyPath(f.Filename, modelPath); err != nil {
+				return err
+			}
+
+			target := filepath.Join(modelPath, f.Filename)
+			if err := downloader.DownloadFile(f.URI, target, f.SHA256, progress); err != nil {
+				return err
+			}
+		}
+
+		modelURL := downloader.ConvertURL(cfg.PredictionOptions.Model)
+		if downloader.LooksLikeURL(modelURL) {
+			// Remote models are stored under the MD5 of their URL
+			localName := utils.MD5(modelURL)
+			localPath := filepath.Join(modelPath, localName)
+
+			// Download only when the file is reported as missing
+			if _, err := os.Stat(localPath); errors.Is(err, os.ErrNotExist) {
+				if err := downloader.DownloadFile(modelURL, localPath, "", progress); err != nil {
+					return err
+				}
+			}
+
+			// Write the modified copy back so the stored configuration uses the local name
+			cfg.PredictionOptions.Model = localName
+			cm.configs[name] = cfg
+		}
+
+		if cfg.Name != "" {
+			log.Info().Msgf("Model name: %s", cfg.Name)
+		}
+		if cfg.Description != "" {
+			log.Info().Msgf("Model description: %s", cfg.Description)
+		}
+		if cfg.Usage != "" {
+			log.Info().Msgf("Model usage: \n%s", cfg.Usage)
+		}
+	}
+	return nil
+}
+
 func (cm *ConfigLoader) LoadConfigs(path string) error {
 	cm.Lock()
 	defer cm.Unlock()
@@ -269,7 +361,7 @@ func (cm *ConfigLoader) LoadConfigs(path string) error {
 	}
 	for _, file := range files {
 		// Skip templates, YAML and .keep files
-		if !strings.Contains(file.Name(), ".yaml") {
+		if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
 			continue
 		}
 		c, err := ReadConfig(filepath.Join(path, file.Name()))
--- a/api/localai/backend_monitor.go
+++ b/api/localai/backend_monitor.go
@@ -123,13 +123,12 @@ func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
 			return err
 		}

-		client := bm.options.Loader.CheckIsLoaded(backendId)
-
-		if client == nil {
+		model := bm.options.Loader.CheckIsLoaded(backendId)
+		if model == "" {
 			return fmt.Errorf("backend %s is not currently loaded", backendId)
 		}

-		status, rpcErr := client.Status(context.TODO())
+		status, rpcErr := model.GRPC(false, nil).Status(context.TODO())
 		if rpcErr != nil {
 			log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
 			val, slbErr := bm.SampleLocalBackendProcess(backendId)
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -130,6 +130,12 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
 					continue
 				}

+				err = cm.Preload(g.modelPath)
+				if err != nil {
+					updateError(err)
+					continue
+				}
+
 				g.updateStatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
 			}
 		}
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -81,6 +81,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
 		}

+		if input.ResponseFormat.Type == "json_object" {
+			input.Grammar = grammar.JSONBNF
+		}
+
 		// process functions if we have any defined or if we have a function call string
 		if len(input.Functions) > 0 && config.ShouldUseFunctions() {
 			log.Debug().Msgf("Response needs to process functions")
@@ -140,14 +144,14 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 				}
 			}
 			r := config.Roles[role]
-			contentExists := i.Content != nil && *i.Content != ""
+			contentExists := i.Content != nil && i.StringContent != ""
 			// First attempt to populate content via a chat message specific template
 			if config.TemplateConfig.ChatMessage != "" {
 				chatMessageData := model.ChatMessageTemplateData{
 					SystemPrompt: config.SystemPrompt,
 					Role:         r,
 					RoleName:     role,
-					Content:      *i.Content,
+					Content:      i.StringContent,
 					MessageIndex: messageIndex,
 				}
 				templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
@@ -166,7 +170,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			if content == "" {
 				if r != "" {
 					if contentExists {
-						content = fmt.Sprint(r, " ", *i.Content)
+						content = fmt.Sprint(r, i.StringContent)
 					}
 					if i.FunctionCall != nil {
 						j, err := json.Marshal(i.FunctionCall)
@@ -180,7 +184,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					}
 				} else {
 					if contentExists {
-						content = fmt.Sprint(*i.Content)
+						content = fmt.Sprint(i.StringContent)
 					}
 					if i.FunctionCall != nil {
 						j, err := json.Marshal(i.FunctionCall)
@@ -215,7 +219,12 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			c.Set("Transfer-Encoding", "chunked")
 		}

-		templateFile := config.Model
+		templateFile := ""
+
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
+			templateFile = config.Model
+		}

 		if config.TemplateConfig.Chat != "" && !processFunctions {
 			templateFile = config.TemplateConfig.Chat
@@ -225,18 +234,19 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			templateFile = config.TemplateConfig.Functions
 		}

-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
-			SystemPrompt:         config.SystemPrompt,
-			SuppressSystemPrompt: suppressConfigSystemPrompt,
-			Input:                predInput,
-			Functions:            funcs,
-		})
-		if err == nil {
-			predInput = templatedInput
-			log.Debug().Msgf("Template found, input modified to: %s", predInput)
-		} else {
-			log.Debug().Msgf("Template failed loading: %s", err.Error())
+		if templateFile != "" {
+			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
+				SystemPrompt:         config.SystemPrompt,
+				SuppressSystemPrompt: suppressConfigSystemPrompt,
+				Input:                predInput,
+				Functions:            funcs,
+			})
+			if err == nil {
+				predInput = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", predInput)
+			} else {
+				log.Debug().Msgf("Template failed loading: %s", err.Error())
+			}
 		}

 		log.Debug().Msgf("Prompt (after templating): %s", predInput)
@@ -334,7 +344,11 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					// Otherwise ask the LLM to understand the JSON output and the context, and return a message
 					// Note: This costs (in term of CPU) another computation
 					config.Grammar = ""
-					predFunc, err := backend.ModelInference(input.Context, predInput, o.Loader, *config, o, nil)
+					images := []string{}
+					for _, m := range input.Messages {
+						images = append(images, m.StringImages...)
+					}
+					predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil)
 					if err != nil {
 						log.Error().Msgf("inference error: %s", err.Error())
 						return
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -12,6 +12,7 @@ import (
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/go-skynet/LocalAI/pkg/grammar"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
@@ -64,6 +65,10 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}

+		if input.ResponseFormat.Type == "json_object" {
+			input.Grammar = grammar.JSONBNF
+		}
+
 		log.Debug().Msgf("Parameter Config: %+v", config)

 		if input.Stream {
@@ -76,7 +81,12 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			c.Set("Transfer-Encoding", "chunked")
 		}

-		templateFile := config.Model
+		templateFile := ""
+
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
+			templateFile = config.Model
+		}

 		if config.TemplateConfig.Completion != "" {
 			templateFile = config.TemplateConfig.Completion
@@ -89,13 +99,14 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe

 			predInput := config.PromptStrings[0]

-			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
-				Input: predInput,
-			})
-			if err == nil {
-				predInput = templatedInput
-				log.Debug().Msgf("Template found, input modified to: %s", predInput)
+			if templateFile != "" {
+				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
+					Input: predInput,
+				})
+				if err == nil {
+					predInput = templatedInput
+					log.Debug().Msgf("Template found, input modified to: %s", predInput)
+				}
 			}

 			responses := make(chan schema.OpenAIResponse)
@@ -140,14 +151,16 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		totalTokenUsage := backend.TokenUsage{}

 		for k, i := range config.PromptStrings {
-			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
-				SystemPrompt: config.SystemPrompt,
-				Input:        i,
-			})
-			if err == nil {
-				i = templatedInput
-				log.Debug().Msgf("Template found, input modified to: %s", i)
+			if templateFile != "" {
+				// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
+					SystemPrompt: config.SystemPrompt,
+					Input:        i,
+				})
+				if err == nil {
+					i = templatedInput
+					log.Debug().Msgf("Template found, input modified to: %s", i)
+				}
 			}

 			r, tokenUsage, err := ComputeChoices(
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -30,7 +30,12 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		log.Debug().Msgf("Parameter Config: %+v", config)

-		templateFile := config.Model
+		templateFile := ""
+
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
+			templateFile = config.Model
+		}

 		if config.TemplateConfig.Edit != "" {
 			templateFile = config.TemplateConfig.Edit
@@ -40,15 +45,16 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		totalTokenUsage := backend.TokenUsage{}

 		for _, i := range config.InputStrings {
-			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
-				Input:        i,
-				Instruction:  input.Instruction,
-				SystemPrompt: config.SystemPrompt,
-			})
-			if err == nil {
-				i = templatedInput
-				log.Debug().Msgf("Template found, input modified to: %s", i)
+			if templateFile != "" {
+				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
+					Input:        i,
+					Instruction:  input.Instruction,
+					SystemPrompt: config.SystemPrompt,
+				})
+				if err == nil {
+					i = templatedInput
+					log.Debug().Msgf("Template found, input modified to: %s", i)
+				}
 			}

 			r, tokenUsage, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -5,6 +5,8 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
+	"io"
+	"net/http"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -22,6 +24,26 @@ import (
 	"github.com/rs/zerolog/log"
 )

+// downloadFile saves the body served at url into a new temporary file and returns its path.
+// Non-2xx responses are reported as errors; removing the returned file is left to the caller.
+func downloadFile(url string) (string, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+	// http.Get only fails on transport errors: reject error pages explicitly
+	if resp.StatusCode < 200 || resp.StatusCode > 299 {
+		return "", fmt.Errorf("unexpected status %q", resp.Status)
+	}
+	out, err := os.CreateTemp("", "image")
+	if err != nil {
+		return "", err
+	}
+	defer out.Close()
+	_, err = io.Copy(out, resp.Body)
+	return out.Name(), err
+
 // https://platform.openai.com/docs/api-reference/images/create

 /*
@@ -56,12 +78,31 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx

 		src := ""
 		if input.File != "" {
-			//base 64 decode the file and write it somewhere
-			// that we will cleanup
-			decoded, err := base64.StdEncoding.DecodeString(input.File)
-			if err != nil {
-				return err
+
+			fileData := []byte{}
+			// check if input.File is an URL, if so download it and save it
+			// to a temporary file
+			if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") {
+				out, err := downloadFile(input.File)
+				if err != nil {
+					return fmt.Errorf("failed downloading file:%w", err)
+				}
+				defer os.RemoveAll(out)
+
+				fileData, err = os.ReadFile(out)
+				if err != nil {
+					return fmt.Errorf("failed reading file:%w", err)
+				}
+
+			} else {
+				// base 64 decode the file and write it somewhere
+				// that we will cleanup
+				fileData, err = base64.StdEncoding.DecodeString(input.File)
+				if err != nil {
+					return err
+				}
 			}
+
 			// Create a temporary file
 			outputFile, err := os.CreateTemp(o.ImageDir, "b64")
 			if err != nil {
@@ -69,7 +110,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			}
 			// write the base64 result
 			writer := bufio.NewWriter(outputFile)
-			_, err = writer.Write(decoded)
+			_, err = writer.Write(fileData)
 			if err != nil {
 				outputFile.Close()
 				return err
@@ -81,8 +122,12 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx

 		log.Debug().Msgf("Parameter Config: %+v", config)

-		// XXX: Only stablediffusion is supported for now
-		if config.Backend == "" {
+		switch config.Backend {
+		case "stablediffusion":
+			config.Backend = model.StableDiffusionBackend
+		case "tinydream":
+			config.Backend = model.TinyDreamBackend
+		case "":
 			config.Backend = model.StableDiffusionBackend
 		}

@@ -100,7 +145,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 		}

 		b64JSON := false
-		if input.ResponseFormat == "b64_json" {
+		if input.ResponseFormat.Type == "b64_json" {
 			b64JSON = true
 		}
 		// src and clip_skip
--- a/api/openai/inference.go
+++ b/api/openai/inference.go
@@ -23,8 +23,13 @@ func ComputeChoices(
 		n = 1
 	}

+	images := []string{}
+	for _, m := range req.Messages {
+		images = append(images, m.StringImages...)
+	}
+
 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(req.Context, predInput, loader, *config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, images, loader, *config, o, tokenCallback)
 	if err != nil {
 		return result, backend.TokenUsage{}, err
 	}
--- a/api/openai/request.go
+++ b/api/openai/request.go
@@ -2,8 +2,11 @@ package openai

 import (
 	"context"
+	"encoding/base64"
 	"encoding/json"
 	"fmt"
+	"io/ioutil"
+	"net/http"
 	"os"
 	"path/filepath"
 	"strings"
@@ -24,7 +27,7 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *sche
 	input.Cancel = cancel
 	// Get input data from the request body
 	if err := c.BodyParser(input); err != nil {
-		return "", nil, err
+		return "", nil, fmt.Errorf("failed parsing request body: %w", err)
 	}

 	modelFile := input.Model
@@ -61,6 +64,37 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *sche
 	return modelFile, input, nil
 }

+// getBase64Image returns the base64 payload of an image reference: http(s) URLs are downloaded
+// in memory and encoded, "data:<media type>;base64," URIs are returned without their prefix.
+func getBase64Image(s string) (string, error) {
+	if strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://") {
+		resp, err := http.Get(s)
+		if err != nil {
+			return "", err
+		}
+		defer resp.Body.Close()
+
+		// an error page must not be encoded and passed on as if it were an image
+		if resp.StatusCode < 200 || resp.StatusCode > 299 {
+			return "", fmt.Errorf("unexpected status %q downloading image", resp.Status)
+		}
+
+		// read the image data into memory
+		data, err := ioutil.ReadAll(resp.Body)
+		if err != nil {
+			return "", err
+		}
+		// encode the image data in base64 and return it
+		return base64.StdEncoding.EncodeToString(data), nil
+	}
+
+	// data URIs of any media type (jpeg, png, ...) carry the payload after ";base64,"
+	if i := strings.Index(s, ";base64,"); i >= 0 && strings.HasPrefix(s, "data:") {
+		return s[i+len(";base64,"):], nil
+	}
+	return "", fmt.Errorf("not valid string")
+}
+
 func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
 	if input.Echo {
 		config.Echo = input.Echo
@@ -129,6 +163,35 @@ func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
 		}
 	}

+	// Decode each request's message content into StringContent / StringImages
+	index := 0
+	for i, m := range input.Messages {
+		switch content := m.Content.(type) {
+		case string:
+			input.Messages[i].StringContent = content
+		case []interface{}:
+			dat, _ := json.Marshal(content)
+			c := []schema.Content{}
+			json.Unmarshal(dat, &c)
+			input.Messages[i].StringContent = "" // rebuilt below from the parts
+			for _, pp := range c {
+				if pp.Type == "text" {
+					// append: a text part following an image must keep its placeholder
+					input.Messages[i].StringContent += pp.Text
+				} else if pp.Type == "image_url" {
+					base64, err := getBase64Image(pp.ImageURL.URL)
+					if err == nil {
+						input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
+						input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
+						index++
+					} else {
+						fmt.Println("Failed encoding image:", err)
+					}
+				}
+			}
+		}
+	}
+
 	if input.RepeatPenalty != 0 {
 		config.RepeatPenalty = input.RepeatPenalty
 	}
--- a/api/options/options.go
+++ b/api/options/options.go
@@ -4,7 +4,9 @@ import (
 	"context"
 	"embed"
 	"encoding/json"
+	"time"

+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
@@ -24,6 +26,9 @@ type Option struct {
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
 	ApiKeys                             []string
+	Metrics                             *metrics.Metrics
+
+	ModelLibraryURL string

 	Galleries []gallery.Gallery

@@ -34,7 +39,16 @@ type Option struct {

 	AutoloadGalleries bool

-	SingleBackend bool
+	SingleBackend           bool
+	ParallelBackendRequests bool
+
+	WatchDogIdle bool
+	WatchDogBusy bool
+	WatchDog     bool
+
+	ModelsURL []string
+
+	WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
 }

 type AppOption func(*Option)
@@ -54,16 +68,58 @@ func NewOptions(o ...AppOption) *Option {
 	return opt
 }

+func WithModelsURL(urls ...string) AppOption {
+	return func(o *Option) {
+		o.ModelsURL = urls
+	}
+}
+
 func WithCors(b bool) AppOption {
 	return func(o *Option) {
 		o.CORS = b
 	}
 }

+func WithModelLibraryURL(url string) AppOption {
+	return func(o *Option) {
+		o.ModelLibraryURL = url
+	}
+}
+
+var EnableWatchDog = func(o *Option) {
+	o.WatchDog = true
+}
+
+var EnableWatchDogIdleCheck = func(o *Option) {
+	o.WatchDog = true
+	o.WatchDogIdle = true
+}
+
+var EnableWatchDogBusyCheck = func(o *Option) {
+	o.WatchDog = true
+	o.WatchDogBusy = true
+}
+
+func SetWatchDogBusyTimeout(t time.Duration) AppOption {
+	return func(o *Option) {
+		o.WatchDogBusyTimeout = t
+	}
+}
+
+func SetWatchDogIdleTimeout(t time.Duration) AppOption {
+	return func(o *Option) {
+		o.WatchDogIdleTimeout = t
+	}
+}
+
 var EnableSingleBackend = func(o *Option) {
 	o.SingleBackend = true
 }

+var EnableParallelBackendRequests = func(o *Option) {
+	o.ParallelBackendRequests = true
+}
+
 var EnableGalleriesAutoload = func(o *Option) {
 	o.AutoloadGalleries = true
 }
@@ -198,3 +254,9 @@ func WithApiKeys(apiKeys []string) AppOption {
 		o.ApiKeys = apiKeys
 	}
 }
+
+func WithMetrics(meter *metrics.Metrics) AppOption {
+	return func(o *Option) {
+		o.Metrics = meter
+	}
+}
--- a/api/schema/openai.go
+++ b/api/schema/openai.go
@@ -55,11 +55,25 @@ type Choice struct {
 	Text         string   `json:"text,omitempty"`
 }

+type Content struct {
+	Type     string     `json:"type" yaml:"type"`
+	Text     string     `json:"text" yaml:"text"`
+	ImageURL ContentURL `json:"image_url" yaml:"image_url"`
+}
+
+type ContentURL struct {
+	URL string `json:"url" yaml:"url"`
+}
+
 type Message struct {
 	// The message role
 	Role string `json:"role,omitempty" yaml:"role"`
 	// The message content
-	Content *string `json:"content" yaml:"content"`
+	Content interface{} `json:"content" yaml:"content"`
+
+	StringContent string   `json:"string_content,omitempty" yaml:"string_content,omitempty"`
+	StringImages  []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
+
 	// A result of a function call
 	FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
 }
@@ -69,6 +83,12 @@ type OpenAIModel struct {
 	Object string `json:"object"`
 }

+type ChatCompletionResponseFormatType string
+
+type ChatCompletionResponseFormat struct {
+	Type ChatCompletionResponseFormatType `json:"type,omitempty"`
+}
+
 type OpenAIRequest struct {
 	config.PredictionOptions

@@ -78,7 +98,7 @@ type OpenAIRequest struct {
 	// whisper
 	File string `json:"file" validate:"required"`
 	//whisper/image
-	ResponseFormat string `json:"response_format"`
+	ResponseFormat ChatCompletionResponseFormat `json:"response_format"`
 	// image
 	Size string `json:"size"`
 	// Prompt is read only by completion/image API calls
--- a/pkg/grpc/proto/backend.proto
+++ b/pkg/grpc/proto/backend.proto
@@ -64,6 +64,7 @@ message PredictOptions {
  float NegativePromptScale = 39;
  string NegativePrompt = 40;
  int32 NDraft = 41;
+  repeated string Images = 42;
 }

 // The response message containing the result
@@ -109,13 +110,15 @@ message ModelOptions {
  string CLIPModel = 31;
  string CLIPSubfolder = 32;
  int32 CLIPSkip = 33;
+  string ControlNet = 48;

-  // RWKV
  string Tokenizer = 34;

  // LLM (llama.cpp)
  string LoraBase = 35;
  string LoraAdapter = 36;
+  float LoraScale = 42;
+
  bool NoMulMatQ = 37;
  string DraftModel = 39;
  
@@ -123,6 +126,16 @@ message ModelOptions {

  // vllm
  string Quantization = 40;
+
+  string MMProj = 41;
+
+  string RopeScaling = 43;
+  float YarnExtFactor = 44;
+  float YarnAttnFactor = 45;
+  float YarnBetaFast = 46;
+  float YarnBetaSlow = 47;
+
+  string Type = 49;
 }

 message Result {
--- a/backend/backend_grpc.pb.go
+++ b/backend/backend_grpc.pb.go
@@ -0,0 +1,457 @@
+// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
+// versions:
+// - protoc-gen-go-grpc v1.2.0
+// - protoc             v4.23.4
+// source: backend/backend.proto
+
+package proto
+
+import (
+	context "context"
+	grpc "google.golang.org/grpc"
+	codes "google.golang.org/grpc/codes"
+	status "google.golang.org/grpc/status"
+)
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the grpc package it is being compiled against.
+// Requires gRPC-Go v1.32.0 or later.
+const _ = grpc.SupportPackageIsVersion7
+
+// BackendClient is the client API for Backend service.
+//
+// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
+type BackendClient interface {
+	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
+	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
+	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
+	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error)
+	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
+	GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error)
+	AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error)
+	TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error)
+	TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error)
+	Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error)
+}
+
+type backendClient struct {
+	cc grpc.ClientConnInterface
+}
+
+func NewBackendClient(cc grpc.ClientConnInterface) BackendClient {
+	return &backendClient{cc}
+}
+
+func (c *backendClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
+	out := new(Reply)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Health", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
+	out := new(Reply)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Predict", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/LoadModel", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error) {
+	stream, err := c.cc.NewStream(ctx, &Backend_ServiceDesc.Streams[0], "/backend.Backend/PredictStream", opts...)
+	if err != nil {
+		return nil, err
+	}
+	x := &backendPredictStreamClient{stream}
+	if err := x.ClientStream.SendMsg(in); err != nil {
+		return nil, err
+	}
+	if err := x.ClientStream.CloseSend(); err != nil {
+		return nil, err
+	}
+	return x, nil
+}
+
+type Backend_PredictStreamClient interface {
+	Recv() (*Reply, error)
+	grpc.ClientStream
+}
+
+type backendPredictStreamClient struct {
+	grpc.ClientStream
+}
+
+func (x *backendPredictStreamClient) Recv() (*Reply, error) {
+	m := new(Reply)
+	if err := x.ClientStream.RecvMsg(m); err != nil {
+		return nil, err
+	}
+	return m, nil
+}
+
+func (c *backendClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
+	out := new(EmbeddingResult)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Embedding", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/GenerateImage", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error) {
+	out := new(TranscriptResult)
+	err := c.cc.Invoke(ctx, "/backend.Backend/AudioTranscription", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/TTS", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error) {
+	out := new(TokenizationResponse)
+	err := c.cc.Invoke(ctx, "/backend.Backend/TokenizeString", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error) {
+	out := new(StatusResponse)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Status", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// BackendServer is the server API for Backend service.
+// All implementations must embed UnimplementedBackendServer
+// for forward compatibility
+type BackendServer interface {
+	Health(context.Context, *HealthMessage) (*Reply, error)
+	Predict(context.Context, *PredictOptions) (*Reply, error)
+	LoadModel(context.Context, *ModelOptions) (*Result, error)
+	PredictStream(*PredictOptions, Backend_PredictStreamServer) error
+	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
+	GenerateImage(context.Context, *GenerateImageRequest) (*Result, error)
+	AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error)
+	TTS(context.Context, *TTSRequest) (*Result, error)
+	TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error)
+	Status(context.Context, *HealthMessage) (*StatusResponse, error)
+	mustEmbedUnimplementedBackendServer()
+}
+
+// UnimplementedBackendServer must be embedded to have forward compatible implementations.
+type UnimplementedBackendServer struct {
+}
+
+func (UnimplementedBackendServer) Health(context.Context, *HealthMessage) (*Reply, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
+}
+func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
+}
+func (UnimplementedBackendServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
+}
+func (UnimplementedBackendServer) PredictStream(*PredictOptions, Backend_PredictStreamServer) error {
+	return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
+}
+func (UnimplementedBackendServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
+}
+func (UnimplementedBackendServer) GenerateImage(context.Context, *GenerateImageRequest) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method GenerateImage not implemented")
+}
+func (UnimplementedBackendServer) AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method AudioTranscription not implemented")
+}
+func (UnimplementedBackendServer) TTS(context.Context, *TTSRequest) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method TTS not implemented")
+}
+func (UnimplementedBackendServer) TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method TokenizeString not implemented")
+}
+func (UnimplementedBackendServer) Status(context.Context, *HealthMessage) (*StatusResponse, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Status not implemented")
+}
+func (UnimplementedBackendServer) mustEmbedUnimplementedBackendServer() {}
+
+// UnsafeBackendServer may be embedded to opt out of forward compatibility for this service.
+// Use of this interface is not recommended, as added methods to BackendServer will
+// result in compilation errors.
+type UnsafeBackendServer interface {
+	mustEmbedUnimplementedBackendServer()
+}
+
+func RegisterBackendServer(s grpc.ServiceRegistrar, srv BackendServer) {
+	s.RegisterService(&Backend_ServiceDesc, srv)
+}
+
+func _Backend_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(HealthMessage)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Health(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Health",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Health(ctx, req.(*HealthMessage))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(PredictOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Predict(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Predict",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Predict(ctx, req.(*PredictOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(ModelOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).LoadModel(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/LoadModel",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).LoadModel(ctx, req.(*ModelOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
+	m := new(PredictOptions)
+	if err := stream.RecvMsg(m); err != nil {
+		return err
+	}
+	return srv.(BackendServer).PredictStream(m, &backendPredictStreamServer{stream})
+}
+
+type Backend_PredictStreamServer interface {
+	Send(*Reply) error
+	grpc.ServerStream
+}
+
+type backendPredictStreamServer struct {
+	grpc.ServerStream
+}
+
+func (x *backendPredictStreamServer) Send(m *Reply) error {
+	return x.ServerStream.SendMsg(m)
+}
+
+func _Backend_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(PredictOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Embedding(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Embedding",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Embedding(ctx, req.(*PredictOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_GenerateImage_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(GenerateImageRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).GenerateImage(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/GenerateImage",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).GenerateImage(ctx, req.(*GenerateImageRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_AudioTranscription_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(TranscriptRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).AudioTranscription(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/AudioTranscription",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).AudioTranscription(ctx, req.(*TranscriptRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_TTS_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(TTSRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).TTS(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/TTS",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).TTS(ctx, req.(*TTSRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_TokenizeString_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(PredictOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).TokenizeString(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/TokenizeString",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).TokenizeString(ctx, req.(*PredictOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_Status_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(HealthMessage)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Status(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Status",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Status(ctx, req.(*HealthMessage))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+// Backend_ServiceDesc is the grpc.ServiceDesc for Backend service.
+// It's only intended for direct use with grpc.RegisterService,
+// and not to be introspected or modified (even as a copy)
+var Backend_ServiceDesc = grpc.ServiceDesc{
+	ServiceName: "backend.Backend",
+	HandlerType: (*BackendServer)(nil),
+	Methods: []grpc.MethodDesc{
+		{
+			MethodName: "Health",
+			Handler:    _Backend_Health_Handler,
+		},
+		{
+			MethodName: "Predict",
+			Handler:    _Backend_Predict_Handler,
+		},
+		{
+			MethodName: "LoadModel",
+			Handler:    _Backend_LoadModel_Handler,
+		},
+		{
+			MethodName: "Embedding",
+			Handler:    _Backend_Embedding_Handler,
+		},
+		{
+			MethodName: "GenerateImage",
+			Handler:    _Backend_GenerateImage_Handler,
+		},
+		{
+			MethodName: "AudioTranscription",
+			Handler:    _Backend_AudioTranscription_Handler,
+		},
+		{
+			MethodName: "TTS",
+			Handler:    _Backend_TTS_Handler,
+		},
+		{
+			MethodName: "TokenizeString",
+			Handler:    _Backend_TokenizeString_Handler,
+		},
+		{
+			MethodName: "Status",
+			Handler:    _Backend_Status_Handler,
+		},
+	},
+	Streams: []grpc.StreamDesc{
+		{
+			StreamName:    "PredictStream",
+			Handler:       _Backend_PredictStream_Handler,
+			ServerStreams: true,
+		},
+	},
+	Metadata: "backend/backend.proto",
+}
--- a/backend/cpp/grpc/.gitignore
+++ b/backend/cpp/grpc/.gitignore
@@ -0,0 +1,3 @@
+installed_packages/
+grpc_build/
+grpc_repo/
--- a/backend/cpp/grpc/Makefile
+++ b/backend/cpp/grpc/Makefile
@@ -0,0 +1,66 @@
+# Basic platform detection
+HOST_SYSTEM = $(shell uname | cut -f 1 -d_)
+SYSTEM ?= $(HOST_SYSTEM)
+
+TAG_LIB_GRPC?=v1.59.0
+GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
+GIT_CLONE_DEPTH?=1
+NUM_BUILD_THREADS?=$(shell nproc --ignore=1)
+
+INSTALLED_PACKAGES=installed_packages
+GRPC_REPO=grpc_repo
+GRPC_BUILD=grpc_build
+
+export CMAKE_ARGS?=
+CMAKE_ARGS+=-DCMAKE_BUILD_TYPE=Release
+CMAKE_ARGS+=-DgRPC_INSTALL=ON
+CMAKE_ARGS+=-DEXECUTABLE_OUTPUT_PATH=../$(INSTALLED_PACKAGES)/grpc/bin
+CMAKE_ARGS+=-DLIBRARY_OUTPUT_PATH=../$(INSTALLED_PACKAGES)/grpc/lib
+CMAKE_ARGS+=-DgRPC_BUILD_TESTS=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_CSHARP_EXT=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_CPP_PLUGIN=ON
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF
+CMAKE_ARGS+=-Dprotobuf_WITH_ZLIB=ON
+CMAKE_ARGS+=-DRE2_BUILD_TESTING=OFF
+CMAKE_ARGS+=-DCMAKE_INSTALL_PREFIX=../$(INSTALLED_PACKAGES)
+
+# Windows needs OPENSSL_NO_ASM set. This results in slower crypto performance, but the build fails otherwise.
+# May be resolvable, but for now it's set. More info: https://stackoverflow.com/a/75240504/480673
+ifeq ($(SYSTEM),MSYS)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),MINGW64)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),MINGW32)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),CYGWIN)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+
+$(INSTALLED_PACKAGES): grpc_build
+
+$(GRPC_REPO):
+	git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
+	cd $(GRPC_REPO)/grpc && git submodule update --init --recursive --depth $(GIT_CLONE_DEPTH)
+
+$(GRPC_BUILD): $(GRPC_REPO)
+	mkdir -p $(GRPC_BUILD)
+	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . -- -j ${NUM_BUILD_THREADS} && cmake --build . --target install -- -j ${NUM_BUILD_THREADS}
+
+build: $(INSTALLED_PACKAGES)
+
+rebuild:
+	rm -rf grpc_build
+	$(MAKE) grpc_build
+
+clean:
+	rm -rf grpc_build
+	rm -rf grpc_repo
+	rm -rf installed_packages
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -0,0 +1,82 @@
+
+## XXX: In some versions of CMake clip wasn't being built before llama.
+## This is a hack for now, but it should be fixed in the future.
+set(TARGET myclip)
+add_library(${TARGET} clip.cpp clip.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+
+set(TARGET grpc-server)
+# END CLIP hack
+set(CMAKE_CXX_STANDARD 17)
+cmake_minimum_required(VERSION 3.15)
+set(TARGET grpc-server)
+set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+set(_REFLECTION grpc++_reflection)
+
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    # Set correct Homebrew install folder for Apple Silicon and Intel Macs
+    if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
+        set(HOMEBREW_DEFAULT_PREFIX "/opt/homebrew")
+    else()
+        set(HOMEBREW_DEFAULT_PREFIX "/usr/local")
+    endif()
+
+    link_directories("${HOMEBREW_DEFAULT_PREFIX}/lib")
+    include_directories("${HOMEBREW_DEFAULT_PREFIX}/include")
+endif()
+
+find_package(absl CONFIG REQUIRED)
+find_package(Protobuf CONFIG REQUIRED)
+find_package(gRPC CONFIG REQUIRED)
+
+find_program(_PROTOBUF_PROTOC protoc)
+set(_GRPC_GRPCPP grpc++)
+find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${Protobuf_INCLUDE_DIRS})
+
+message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
+
+# Proto file
+get_filename_component(hw_proto "../../../../../../backend/backend.proto" ABSOLUTE)
+get_filename_component(hw_proto_path "${hw_proto}" PATH)
+
+# Generated sources
+set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
+set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
+set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
+set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
+
+add_custom_command(
+      OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
+      COMMAND ${_PROTOBUF_PROTOC}
+      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+        -I "${hw_proto_path}"
+        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+        "${hw_proto}"
+      DEPENDS "${hw_proto}")
+
+# hw_grpc_proto
+add_library(hw_grpc_proto
+  ${hw_grpc_srcs}
+  ${hw_grpc_hdrs}
+  ${hw_proto_srcs}
+  ${hw_proto_hdrs} )
+
+add_executable(${TARGET} grpc-server.cpp json.hpp )
+target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+  absl::flags_parse
+  gRPC::${_REFLECTION}
+  gRPC::${_GRPC_GRPCPP}
+  protobuf::${_PROTOBUF_LIBPROTOBUF})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -0,0 +1,53 @@
+
+LLAMA_VERSION?=
+
+CMAKE_ARGS?=
+BUILD_TYPE?=
+
+# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
+# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblast)
+	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If it's hipblas we also have to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
+endif
+
+llama.cpp:
+	if [ -z "$(LLAMA_VERSION)" ]; then \
+		echo "LLAMA_VERSION must be set before cloning llama.cpp" >&2; exit 1; \
+	fi
+	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
+	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
+
+llama.cpp/examples/grpc-server:
+	mkdir -p llama.cpp/examples/grpc-server
+	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+## XXX: In some versions of CMake, clip wasn't being built before llama.
+## This is a hack for now, but it should be fixed in the future.
+	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+
+rebuild:
+	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	rm -rf grpc-server
+	$(MAKE) grpc-server
+
+clean:
+	rm -rf llama.cpp
+	rm -rf grpc-server
+
+grpc-server: llama.cpp llama.cpp/examples/grpc-server
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
--- a/backend/cpp/llama/json.hpp
+++ b/backend/cpp/llama/json.hpp
--- a/backend/go/image/stablediffusion/main.go
+++ b/backend/go/image/stablediffusion/main.go
@@ -5,8 +5,6 @@ package main
 import (
 	"flag"

-	rwkv "github.com/go-skynet/LocalAI/pkg/backend/llm/rwkv"
-
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )

@@ -17,7 +15,7 @@ var (
 func main() {
 	flag.Parse()

-	if err := grpc.StartServer(*addr, &rwkv.LLM{}); err != nil {
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/image/stablediffusion/stablediffusion.go
+++ b/backend/go/image/stablediffusion/stablediffusion.go
@@ -1,4 +1,4 @@
-package image
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
@@ -8,20 +8,20 @@ import (
 	"github.com/go-skynet/LocalAI/pkg/stablediffusion"
 )

-type StableDiffusion struct {
+type Image struct {
 	base.SingleThread
 	stablediffusion *stablediffusion.StableDiffusion
 }

-func (sd *StableDiffusion) Load(opts *pb.ModelOptions) error {
+func (image *Image) Load(opts *pb.ModelOptions) error {
 	var err error
 	// Note: the Model here is a path to a directory containing the model files
-	sd.stablediffusion, err = stablediffusion.New(opts.ModelFile)
+	image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
 	return err
 }

-func (sd *StableDiffusion) GenerateImage(opts *pb.GenerateImageRequest) error {
-	return sd.stablediffusion.GenerateImage(
+func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
+	return image.stablediffusion.GenerateImage(
 		int(opts.Height),
 		int(opts.Width),
 		int(opts.Mode),
--- a/backend/go/image/tinydream/main.go
+++ b/backend/go/image/tinydream/main.go
@@ -5,7 +5,6 @@ package main
 import (
 	"flag"

-	bert "github.com/go-skynet/LocalAI/pkg/backend/llm/bert"
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )

@@ -16,7 +15,7 @@ var (
 func main() {
 	flag.Parse()

-	if err := grpc.StartServer(*addr, &bert.Embeddings{}); err != nil {
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/image/tinydream/tinydream.go
+++ b/backend/go/image/tinydream/tinydream.go
@@ -0,0 +1,32 @@
+package main
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/tinydream"
+)
+
+type Image struct {
+	base.SingleThread
+	tinydream *tinydream.TinyDream
+}
+
+func (image *Image) Load(opts *pb.ModelOptions) error {
+	var err error
+	// Note: the Model here is a path to a directory containing the model files
+	image.tinydream, err = tinydream.New(opts.ModelFile)
+	return err
+}
+
+func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
+	return image.tinydream.GenerateImage(
+		int(opts.Height),
+		int(opts.Width),
+		int(opts.Step),
+		int(opts.Seed),
+		opts.PositivePrompt,
+		opts.NegativePrompt,
+		opts.Dst)
+}
--- a/pkg/backend/llm/bert/bert.go
+++ b/pkg/backend/llm/bert/bert.go
@@ -1,4 +1,4 @@
-package bert
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
--- a/backend/go/llm/bert/main.go
+++ b/backend/go/llm/bert/main.go
@@ -5,8 +5,6 @@ package main
 import (
 	"flag"

-	bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"
-
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )

@@ -17,7 +15,7 @@ var (
 func main() {
 	flag.Parse()

-	if err := grpc.StartServer(*addr, &bloomz.LLM{}); err != nil {
+	if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/llm/dolly/main.go
+++ b/backend/go/llm/dolly/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/backend/go/llm/falcon-ggml/main.go
+++ b/backend/go/llm/falcon-ggml/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/backend/go/llm/gpt2/main.go
+++ b/backend/go/llm/gpt2/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/pkg/backend/llm/gpt4all/gpt4all.go
+++ b/pkg/backend/llm/gpt4all/gpt4all.go
@@ -1,4 +1,4 @@
-package gpt4all
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
--- a/backend/go/llm/gpt4all/main.go
+++ b/backend/go/llm/gpt4all/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/llm/gptj/main.go
+++ b/backend/go/llm/gptj/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/backend/go/llm/gptneox/main.go
+++ b/backend/go/llm/gptneox/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/pkg/backend/llm/langchain/langchain.go
+++ b/pkg/backend/llm/langchain/langchain.go
@@ -1,4 +1,4 @@
-package langchain
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
--- a/backend/go/llm/langchain/main.go
+++ b/backend/go/llm/langchain/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/pkg/backend/llm/llama-stable/llama.go
+++ b/pkg/backend/llm/llama-stable/llama.go
@@ -1,4 +1,4 @@
-package llama
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
--- a/backend/go/llm/llama-ggml/main.go
+++ b/backend/go/llm/llama-ggml/main.go
@@ -3,8 +3,6 @@ package main
 import (
 	"flag"

-	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"
-
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )

@@ -15,7 +13,7 @@ var (
 func main() {
 	flag.Parse()

-	if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/pkg/backend/llm/llama/llama.go
+++ b/pkg/backend/llm/llama/llama.go
@@ -1,4 +1,4 @@
-package llama
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
--- a/backend/go/llm/llama/main.go
+++ b/backend/go/llm/llama/main.go
@@ -1,12 +1,12 @@
 package main

+// GRPC llama server
+
 // Note: this is started internally by LocalAI and a server is allocated for each model

 import (
 	"flag"

-	tts "github.com/go-skynet/LocalAI/pkg/backend/tts"
-
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )

@@ -17,7 +17,7 @@ var (
 func main() {
 	flag.Parse()

-	if err := grpc.StartServer(*addr, &tts.Piper{}); err != nil {
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/llm/mpt/main.go
+++ b/backend/go/llm/mpt/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/backend/go/llm/replit/main.go
+++ b/backend/go/llm/replit/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/backend/go/llm/rwkv/main.go
+++ b/backend/go/llm/rwkv/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/pkg/backend/llm/rwkv/rwkv.go
+++ b/pkg/backend/llm/rwkv/rwkv.go
@@ -1,4 +1,4 @@
-package rwkv
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
--- a/backend/go/llm/starcoder/main.go
+++ b/backend/go/llm/starcoder/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/pkg/backend/llm/transformers/dolly.go
+++ b/pkg/backend/llm/transformers/dolly.go
--- a/pkg/backend/llm/transformers/falcon.go
+++ b/pkg/backend/llm/transformers/falcon.go
--- a/pkg/backend/llm/transformers/gpt2.go
+++ b/pkg/backend/llm/transformers/gpt2.go
--- a/pkg/backend/llm/transformers/gptj.go
+++ b/pkg/backend/llm/transformers/gptj.go
--- a/pkg/backend/llm/transformers/gptneox.go
+++ b/pkg/backend/llm/transformers/gptneox.go
--- a/pkg/backend/llm/transformers/mpt.go
+++ b/pkg/backend/llm/transformers/mpt.go
--- a/pkg/backend/llm/transformers/predict.go
+++ b/pkg/backend/llm/transformers/predict.go
--- a/pkg/backend/llm/transformers/replit.go
+++ b/pkg/backend/llm/transformers/replit.go
--- a/pkg/backend/llm/transformers/starcoder.go
+++ b/pkg/backend/llm/transformers/starcoder.go
--- a/backend/go/transcribe/main.go
+++ b/backend/go/transcribe/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Whisper{}); err != nil {
+		panic(err)
+	}
+}
--- a/pkg/backend/transcribe/transcript.go
+++ b/pkg/backend/transcribe/transcript.go
@@ -1,4 +1,4 @@
-package transcribe
+package main

 import (
 	"fmt"
--- a/pkg/backend/transcribe/whisper.go
+++ b/pkg/backend/transcribe/whisper.go
@@ -1,4 +1,4 @@
-package transcribe
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
--- a/backend/go/tts/main.go
+++ b/backend/go/tts/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Piper{}); err != nil {
+		panic(err)
+	}
+}
--- a/pkg/backend/tts/piper.go
+++ b/pkg/backend/tts/piper.go
@@ -1,4 +1,4 @@
-package tts
+package main

 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
--- a/backend/python/README.md
+++ b/backend/python/README.md
@@ -0,0 +1,38 @@
+# Common conda environment commands
+
+## Create a new empty conda environment
+
+```
+conda create --name <env-name> python=<your version> -y
+
+conda create --name autogptq python=3.11 -y
+```
+
+## To activate the environment
+
+As of conda 4.4
+```
+conda activate autogptq
+```
+
+For conda versions older than 4.4
+
+```
+source activate autogptq
+```
+
+## Install the packages to your environment
+
+Sometimes you need to install the packages from the conda-forge channel
+
+By using `conda`
+```
+conda install <your-package-name>
+
+conda install -c conda-forge <your package-name>
+```
+
+Or by using `pip`
+```
+pip install <your-package-name>
+```
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -0,0 +1,4 @@
+.PHONY: autogptq
+autogptq:
+	$(MAKE) -C ../common-env/transformers
+
--- a/backend/python/autogptq/README.md
+++ b/backend/python/autogptq/README.md
@@ -0,0 +1,5 @@
+# Creating a separate environment for the autogptq project
+
+```
+make autogptq
+```
--- a/backend/python/autogptq/autogptq.py
+++ b/backend/python/autogptq/autogptq.py
@@ -1,15 +1,15 @@
 #!/usr/bin/env python3
-import grpc
 from concurrent import futures
-import time
-import backend_pb2
-import backend_pb2_grpc
 import argparse
 import signal
 import sys
 import os
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-from pathlib import Path
+import time
+
+import grpc
+import backend_pb2
+import backend_pb2_grpc
+from auto_gptq import AutoGPTQForCausalLM
 from transformers import AutoTokenizer
 from transformers import TextGenerationPipeline

--- a/backend/python/autogptq/autogptq.yml
+++ b/backend/python/autogptq/autogptq.yml
@@ -0,0 +1,86 @@
+name: autogptq
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.08.22=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.11=h7f8727e_2
+  - pip=23.2.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.0.0=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.2=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+      - accelerate==0.23.0
+      - aiohttp==3.8.5
+      - aiosignal==1.3.1
+      - async-timeout==4.0.3
+      - attrs==23.1.0
+      - auto-gptq==0.4.2
+      - certifi==2023.7.22
+      - charset-normalizer==3.3.0
+      - datasets==2.14.5
+      - dill==0.3.7
+      - filelock==3.12.4
+      - frozenlist==1.4.0
+      - fsspec==2023.6.0
+      - grpcio==1.59.0
+      - huggingface-hub==0.16.4
+      - idna==3.4
+      - jinja2==3.1.2
+      - markupsafe==2.1.3
+      - mpmath==1.3.0
+      - multidict==6.0.4
+      - multiprocess==0.70.15
+      - networkx==3.1
+      - numpy==1.26.0
+      - nvidia-cublas-cu12==12.1.3.1
+      - nvidia-cuda-cupti-cu12==12.1.105
+      - nvidia-cuda-nvrtc-cu12==12.1.105
+      - nvidia-cuda-runtime-cu12==12.1.105
+      - nvidia-cudnn-cu12==8.9.2.26
+      - nvidia-cufft-cu12==11.0.2.54
+      - nvidia-curand-cu12==10.3.2.106
+      - nvidia-cusolver-cu12==11.4.5.107
+      - nvidia-cusparse-cu12==12.1.0.106
+      - nvidia-nccl-cu12==2.18.1
+      - nvidia-nvjitlink-cu12==12.2.140
+      - nvidia-nvtx-cu12==12.1.105
+      - packaging==23.2
+      - pandas==2.1.1
+      - peft==0.5.0
+      - protobuf==4.24.4
+      - psutil==5.9.5
+      - pyarrow==13.0.0
+      - python-dateutil==2.8.2
+      - pytz==2023.3.post1
+      - pyyaml==6.0.1
+      - regex==2023.10.3
+      - requests==2.31.0
+      - rouge==1.0.1
+      - safetensors==0.3.3
+      - six==1.16.0
+      - sympy==1.12
+      - tokenizers==0.14.0
+      - torch==2.1.0
+      - tqdm==4.66.1
+      - transformers==4.34.0
+      - triton==2.1.0
+      - typing-extensions==4.8.0
+      - tzdata==2023.3
+      - urllib3==2.0.6
+      - xxhash==3.4.1
+      - yarl==1.9.2
--- a/backend/python/autogptq/backend_pb2.py
+++ b/backend/python/autogptq/backend_pb2.py
--- a/backend/python/autogptq/backend_pb2_grpc.py
+++ b/backend/python/autogptq/backend_pb2_grpc.py
--- a/backend/python/autogptq/run.sh
+++ b/backend/python/autogptq/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+##
+## A bash script wrapper that runs the autogptq server with conda
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+# Quote the script path and "$@" so paths and arguments containing spaces are passed through intact
+python "$DIR/autogptq.py" "$@"
--- a/backend/python/bark/Makefile
+++ b/backend/python/bark/Makefile
@@ -0,0 +1,15 @@
+.PHONY: ttsbark
+ttsbark:
+	$(MAKE) -C ../common-env/transformers
+
+.PHONY: run
+run:
+	@echo "Running bark..."
+	bash run.sh
+	@echo "bark run."
+
+.PHONY: test
+test:
+	@echo "Testing bark..."
+	bash test.sh
+	@echo "bark tested."
--- a/backend/python/bark/README.md
+++ b/backend/python/bark/README.md
@@ -0,0 +1,16 @@
+# Creating a separate environment for the ttsbark project
+
+```
+make ttsbark
+```
+
+# Testing the gRPC server
+
+```
+<The path of your python interpreter> -m unittest test.py
+```
+
+For example
+```
+/opt/conda/envs/transformers/bin/python -m unittest backend/python/bark/test.py
+```
--- a/backend/python/bark/backend_pb2.py
+++ b/backend/python/bark/backend_pb2.py
--- a/backend/python/bark/backend_pb2_grpc.py
+++ b/backend/python/bark/backend_pb2_grpc.py
--- a/backend/python/bark/run.sh
+++ b/backend/python/bark/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+##
+## A bash script wrapper that runs the ttsbark server with conda
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+# Quote the script path and "$@" so paths and arguments containing spaces are passed through intact
+python "$DIR/ttsbark.py" "$@"
--- a/backend/python/bark/test.py
+++ b/backend/python/bark/test.py
@@ -0,0 +1,76 @@
+"""
+A test script to test the gRPC service
+"""
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service
+
+    unittest runs setUp() before and tearDown() after every test method, so the
+    tests must not call them again: a second setUp() would spawn another server
+    on the same port and leak the first process.
+    """
+    def setUp(self):
+        """
+        This method starts the gRPC server and waits 10 seconds for it to come up
+        """
+        self.service = subprocess.Popen(["python3", "ttsbark.py", "--addr", "localhost:50051"])
+        time.sleep(10)
+
+    def tearDown(self) -> None:
+        """
+        This method tears down the gRPC service by terminating the server
+        """
+        self.service.terminate()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts up successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="v2/en_speaker_4"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+
+    def test_tts(self):
+        """
+        This method tests if the TTS request returns a response after the model is loaded
+        """
+        try:
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="v2/en_speaker_4"))
+                self.assertTrue(response.success)
+                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
+                tts_response = stub.TTS(tts_request)
+                self.assertIsNotNone(tts_response)
+        except Exception as err:
+            print(err)
+            self.fail("TTS service failed")
--- a/backend/python/bark/test.sh
+++ b/backend/python/bark/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+##
+## A bash script wrapper that runs the bark tests with conda
+
+# Activate conda environment
+source activate transformers
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python -m unittest "$DIR/test.py"
--- a/backend/python/bark/ttsbark.py
+++ b/backend/python/bark/ttsbark.py
@@ -1,18 +1,22 @@
 #!/usr/bin/env python3
-import grpc
+"""
+This is an extra gRPC server of LocalAI for Bark TTS
+"""
 from concurrent import futures
 import time
-import backend_pb2
-import backend_pb2_grpc
 import argparse
 import signal
 import sys
 import os
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-from pathlib import Path
-from bark import SAMPLE_RATE, generate_audio, preload_models
 from scipy.io.wavfile import write as write_wav

+import backend_pb2
+import backend_pb2_grpc
+from bark import SAMPLE_RATE, generate_audio, preload_models
+
+import grpc
+
+
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
@@ -20,6 +24,9 @@ MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    BackendServicer is the class that implements the gRPC service
+    """
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
@@ -83,4 +90,4 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()

-    serve(args.addr)
+    serve(args.addr)
--- a/Show More
+++ b/Show More