Compare commits

183 Commits

Author SHA1 Message Date
LocalAI [bot]
f69de3be0d models(gallery): ⬆️ update checksum (#2278)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-09 12:21:24 +00:00
Ettore Di Giacinto
650ae620c5 ci: get latest git version
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 11:33:16 +02:00
Ettore Di Giacinto
6a209cbef6 ci: get file name correctly in checksum_checker.sh
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 10:57:23 +02:00
Ettore Di Giacinto
9786bb826d ci: try to fix checksum_checker.sh
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 09:34:07 +02:00
Ettore Di Giacinto
9b4c6f348a Update checksum_checker.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 00:57:22 +02:00
Ettore Di Giacinto
cb6ddb21ec Update checksum_checker.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 00:55:48 +02:00
Ettore Di Giacinto
0baacca605 Update checksum_checker.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 00:54:35 +02:00
Ettore Di Giacinto
222d714ec7 Update checksum_checker.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 00:51:57 +02:00
Ettore Di Giacinto
fd2d89d37b Update checksum_checker.sh
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 00:43:16 +02:00
Ettore Di Giacinto
6440b608dc Update checksum_checker.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 00:42:48 +02:00
Ettore Di Giacinto
1937118eab Update checksum_checker.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-09 00:34:56 +02:00
Ettore Di Giacinto
bc272d1e4b ci: add checksum checker pipeline (#2274)
Signed-off-by: mudler <mudler@localai.io>
2024-05-09 00:31:27 +02:00
LocalAI [bot]
d651f390cd ⬆️ Update ggerganov/whisper.cpp (#2273)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-08 22:11:10 +00:00
Ettore Di Giacinto
ea777f8716 models(gallery): update SHA for einstein
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-08 23:40:58 +02:00
LocalAI [bot]
eca5200fbd ⬆️ Update ggerganov/llama.cpp (#2272)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-08 21:34:56 +00:00
Ettore Di Giacinto
0809e9e7a0 models(gallery): fix openbiollm typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-08 23:19:43 +02:00
LocalAI [bot]
b66baa3db6 ⬆️ Update docs version mudler/LocalAI (#2271)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-08 21:10:30 +00:00
Ettore Di Giacinto
6eb77f0d3a models(gallery): add tiamat (#2269)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-08 19:59:42 +02:00
Ettore Di Giacinto
b20354b3ad models(gallery): add aurora (#2270)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-08 19:55:39 +02:00
Ettore Di Giacinto
d6f76c75e1 models(gallery): add kunocchini (#2268)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-08 19:52:08 +02:00
Ettore Di Giacinto
ed4f412f1c models(gallery): add lumimaid variant (#2267)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-08 19:51:53 +02:00
Ettore Di Giacinto
5bf56e01aa models(gallery): add tess (#2266)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-08 19:51:44 +02:00
Ettore Di Giacinto
5ff5f0b393 fix(ux): fix small glitches (#2265)
also drop duplicates for displaying in-progress model ops

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-08 19:34:33 +02:00
Ettore Di Giacinto
6559ac11b1 feat(ui): prompt for chat, support vision, enhancements (#2259)
* feat(ui): allow to set system prompt for chat

Make also the models in the index clickable, and display as table

Fixes #2257

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(vision): support also png with base64 input

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(ui): support vision and upload of files

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* display the processed image

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* make trust remote code stand out

Signed-off-by: mudler <mudler@localai.io>

* feat(ui): track in progress job across index/model gallery

Signed-off-by: mudler <mudler@localai.io>

* minor fixups

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
2024-05-08 00:42:34 +02:00
Ettore Di Giacinto
02ec546dd6 models(gallery): Add Soliloquy (#2260)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-08 00:14:19 +02:00
LocalAI [bot]
995aa5ed21 ⬆️ Update ggerganov/llama.cpp (#2263)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-07 21:39:12 +00:00
Michael Mior
e28ba4b807 Add missing Homebrew dependencies (#2256)
Signed-off-by: Michael Mior <michael.mior@gmail.com>
Signed-off-by: Michael Mior <mmior@mail.rit.edu>
2024-05-07 16:34:30 +00:00
Daniel
d1e3436de5 Update readme: add ShellOracle to community integrations (#2254)
Signed-off-by: Daniel Copley <djcopley@users.noreply.github.com>
2024-05-07 08:39:58 +02:00
Dave
d3ddc9e4aa UI: flag trust_remote_code to users // favicon support (#2253)
* attempt to indicate trust_remote_code in some way

* bonus: favicon support!

---------

Signed-off-by: Dave Lee <dave@gray101.com>
2024-05-07 08:39:23 +02:00
fakezeta
fea9522982 fix: OpenVINO winograd always disabled (#2252)
Winograd convolutions were always disabled, giving an error when the inference device was CPU.
This commit implements logic to disable Winograd convolutions only when the declared device is CPU or NPU.
2024-05-07 08:38:58 +02:00
Ettore Di Giacinto
fe055d4b36 feat(webui): ux improvements (#2247)
* ux: change welcome when there are no models installed

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* ux: filter

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* ux: show tags in filter

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* wip

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* make tags clickable

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* allow to delete models from the list

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* ui: display icon of installed models

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* gallery: remove gallery file when removing model

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(gallery): show a re-install button

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* make filter buttons, rename Gallery field

Signed-off-by: mudler <mudler@localai.io>

* show again buttons at end of operations

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
2024-05-07 01:17:07 +02:00
LocalAI [bot]
581b894789 ⬆️ Update ggerganov/llama.cpp (#2255)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-06 21:28:07 +00:00
Ettore Di Giacinto
477655f6e6 models(gallery): average_norrmie reupload
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-06 19:56:24 +02:00
fakezeta
169d8d21ff gallery: Added some OpenVINO models (#2249)
* Added some OpenVINO models

Added Phi-3 trust_remote_code: true
Added Hermes 2 Pro Llama3
Added Multilingual-E5-base embedding model with OpenVINO acceleration (CPU and XPU)
Added all-MiniLM-L6-v2 with OpenVINO acceleration (CPU and XPU)

* Added Remote Code for phi, fixed error on Yamllint

* update openvino.yaml

I need to go to rest: today is not my day...
2024-05-06 10:52:05 +02:00
LocalAI [bot]
c5475020fe ⬆️ Update ggerganov/llama.cpp (#2251)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-05 21:16:00 +00:00
Dave
b52ff1249f test: check the response URL during image gen in app_test.go (#2248)
test: actually check the response URL from image gen

Signed-off-by: Dave Lee <dave@gray101.com>
2024-05-05 18:46:33 +00:00
Ettore Di Giacinto
c5798500cb feat(single-build): generate single binaries for releases (#2246)
* feat(single-build): generate single binaries for releases

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* drop old targets

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-05 17:20:51 +02:00
Ettore Di Giacinto
67ad3532ec Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-05 15:45:55 +02:00
Ettore Di Giacinto
5cb96fe7df models(gallery): add openbiollm (#2245)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-05 15:19:46 +02:00
Ettore Di Giacinto
810e8e5855 models(gallery): add lumimaid (#2244)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-05 15:19:33 +02:00
Ettore Di Giacinto
f3bcc648e7 models(gallery): add icon for instruct-coder
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-05 12:20:06 +02:00
Ettore Di Giacinto
3096566333 models(gallery): poppy porpoise fix
correct mmproj URL

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-05 11:56:07 +02:00
Ettore Di Giacinto
f50c6a4e88 models(gallery): update poppy porpoise (#2243)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-05 11:19:09 +02:00
Ettore Di Giacinto
ab4ee54855 models(gallery): add llama3-instruct-coder (#2242)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-05 11:18:50 +02:00
Ettore Di Giacinto
f2d35062d4 models(gallery): moondream2 fixups
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-05 10:49:04 +02:00
Ettore Di Giacinto
b69ff46c7e feat(startup): show CPU/GPU information with --debug (#2241)
Signed-off-by: mudler <mudler@localai.io>
2024-05-05 09:10:23 +02:00
Ettore Di Giacinto
117c9873e1 fix(webui): display small navbar with smaller screens (#2240)
Signed-off-by: mudler <mudler@localai.io>
2024-05-04 23:38:39 +02:00
LocalAI [bot]
17e94fbcb1 ⬆️ Update ggerganov/llama.cpp (#2239)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-04 21:26:22 +00:00
Ettore Di Giacinto
92f7feb874 models(gallery): add llama3-llava (#2238)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-04 22:43:11 +02:00
Ettore Di Giacinto
b70e2bffa3 models(gallery): add moondream2 (#2237)
* models(gallery): add moondream2

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* models(gallery): fix typo for TTS models

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* models(gallery): add base config for moondream2 and icon

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* linter fixes

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-04 18:29:04 +02:00
nold
06c43ca285 fix(gallery): hermes-2-pro-llama3 models checksum changed (#2236)
fix(gallery): hermes-2-pro-llama3 models checksum

Signed-off-by: Gerrit Pannek <nold@gnu.one>
2024-05-04 17:59:54 +02:00
Ettore Di Giacinto
530bec9c64 feat(llama.cpp): do not specify backends to autoload and add llama.cpp variants (#2232)
* feat(initializer): do not specify backends to autoload

We can simply try to autoload the backends extracted in the asset dir.
This will allow building variants of the same backend (e.g. with different instruction sets),
so that a single binary can cover all the variants.

Signed-off-by: mudler <mudler@localai.io>

* refactor(prepare): refactor out llama.cpp prepare steps

Make them idempotent so that we can re-build

Signed-off-by: mudler <mudler@localai.io>

* [TEST] feat(build): build noavx version along

Signed-off-by: mudler <mudler@localai.io>

* build: make build parallel

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* build: do not override CMAKE_ARGS

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* build: add fallback variant

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(huggingface-langchain): fail if no token is set

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(huggingface-langchain): rename

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: do not autoload local-store

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: give priority between the listed backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: mudler <mudler@localai.io>
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-04 17:56:12 +02:00
fakezeta
fa10302dd2 docs: updated Transformer parameters description (#2234)
updated Transformer parameters
2024-05-04 10:45:25 +02:00
Ettore Di Giacinto
54faaa87ea fix(webui): correct documentation URL for text2img (#2233)
Signed-off-by: mudler <mudler@localai.io>
Co-authored-by: Dave <dave@gray101.com>
2024-05-04 00:25:13 +00:00
dependabot[bot]
daba8a85f9 build(deps): bump tqdm from 4.65.0 to 4.66.3 in /examples/langchain/langchainpy-localai-example in the pip group across 1 directory (#2231)
build(deps): bump tqdm

Bumps the pip group with 1 update in the /examples/langchain/langchainpy-localai-example directory: [tqdm](https://github.com/tqdm/tqdm).


Updates `tqdm` from 4.65.0 to 4.66.3
- [Release notes](https://github.com/tqdm/tqdm/releases)
- [Commits](https://github.com/tqdm/tqdm/compare/v4.65.0...v4.66.3)

---
updated-dependencies:
- dependency-name: tqdm
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-05-03 23:15:06 +00:00
LocalAI [bot]
ac0f3d6e82 ⬆️ Update ggerganov/whisper.cpp (#2230)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-03 22:16:26 +00:00
LocalAI [bot]
da0b6a89ae ⬆️ Update ggerganov/llama.cpp (#2229)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-03 21:39:28 +00:00
LocalAI [bot]
929a68c06d ⬆️ Update docs version mudler/LocalAI (#2228)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-03 21:18:11 +00:00
cryptk
a0aa5d01a1 feat: update ROCM and use smaller image (#2196)
* feat: update ROCM and use smaller image

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: add call to ldconfig to fix AMDs broken library packages

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

---------

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-05-03 18:46:49 +02:00
Ettore Di Giacinto
dc834cc9d2 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-03 09:42:35 +02:00
Ettore Di Giacinto
b58274b8a2 feat(ui): support multilineand style ul (#2226)
* feat(ui/chat): handle multiline in the input field

Signed-off-by: mudler <mudler@localai.io>

* feat(ui/chat): correctly display multiline messages

Signed-off-by: mudler <mudler@localai.io>

* feat(ui/chat): add list style

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: mudler <mudler@localai.io>
2024-05-03 00:43:02 +02:00
Ettore Di Giacinto
a31d00d904 feat(aio): switch to llama3-based for LLM (#2225)
Signed-off-by: mudler <mudler@localai.io>
2024-05-03 00:41:45 +02:00
LocalAI [bot]
2cc1bd85af ⬆️ Update ggerganov/llama.cpp (#2224)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-02 21:23:40 +00:00
Ettore Di Giacinto
2c5a46bc34 feat(ux): Add chat, tts, and image-gen pages to the WebUI (#2222)
* feat(webui): Add chat page

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(webui): Add image-gen page

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(webui): Add tts page

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-02 21:14:10 +02:00
Ettore Di Giacinto
f7f8b4804b models(gallery): Add Hermes-2-Pro-Llama-3-8B-GGUF (#2218)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-02 18:31:13 +02:00
Ettore Di Giacinto
e5bd9a76c7 models(gallery): add wizardlm2 (#2209)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-05-02 18:31:02 +02:00
fakezeta
4690b534e0 feat: user defined inference device for CUDA and OpenVINO (#2212)
user defined inference device

configuration via main_gpu parameter
2024-05-02 09:54:29 +02:00
LocalAI [bot]
6a7a7996bb ⬆️ Update ggerganov/llama.cpp (#2213)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-05-01 21:19:44 +00:00
Ettore Di Giacinto
962ebbaf77 models(gallery): fixup phi-3 sha
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-05-01 23:06:58 +02:00
LocalAI [bot]
f90d56d371 ⬆️ Update ggerganov/llama.cpp (#2203)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-30 21:53:31 +00:00
Ettore Di Giacinto
445cfd4db3 models(gallery): add guillaumetell (#2195)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-30 23:24:41 +02:00
Ettore Di Giacinto
b24d44dc56 models(gallery): add suzume-llama-3-8B-multilingual-gguf (#2194)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-30 23:24:28 +02:00
Ettore Di Giacinto
cd31f8d865 models(gallery): add lexifun (#2193)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-30 23:24:13 +02:00
Chris Jowett
970cb3a219 chore: update go-stablediffusion to latest commit with Make jobserver fix
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-30 15:59:28 -05:00
cryptk
f7aabf1b50 fix: bring everything onto the same GRPC version to fix tests (#2199)
fix: more places where we are installing grpc that need a version specified
fix: attempt to fix metal tests
fix: metal/brew is forcing an update, they don't have 1.58 available anymore

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-30 19:12:15 +00:00
fakezeta
e38610e521 feat: OpenVINO acceleration for embeddings in transformer backend (#2190)
OpenVINO acceleration for embeddings

New argument type: OVModelForFeatureExtraction
2024-04-30 10:13:04 +02:00
cryptk
3754f154ee feat: organize Dockerfile into distinct sections (#2181)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-30 10:12:19 +02:00
LocalAI [bot]
29d7812344 ⬆️ Update ggerganov/whisper.cpp (#2188)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-29 22:16:04 +00:00
cryptk
5fd46175dc fix: ensure GNUMake jobserver is passed through to whisper.cpp build (#2187)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-29 16:40:50 -05:00
LocalAI [bot]
52a268c38c ⬆️ Update ggerganov/llama.cpp (#2189)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-29 21:36:30 +00:00
dependabot[bot]
53c3842bc2 build(deps): bump dependabot/fetch-metadata from 2.0.0 to 2.1.0 (#2186)
Bumps [dependabot/fetch-metadata](https://github.com/dependabot/fetch-metadata) from 2.0.0 to 2.1.0.
- [Release notes](https://github.com/dependabot/fetch-metadata/releases)
- [Commits](https://github.com/dependabot/fetch-metadata/compare/v2.0.0...v2.1.0)

---
updated-dependencies:
- dependency-name: dependabot/fetch-metadata
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-04-29 21:12:37 +00:00
Dave
c4f958e11b refactor(application): introduce application global state (#2072)
* start breaking up the giant channel refactor now that it's better understood - easier to merge bites

Signed-off-by: Dave Lee <dave@gray101.com>

* add concurrency and base64 back in, along with new base64 tests.

Signed-off-by: Dave Lee <dave@gray101.com>

* Automatic rename of whisper.go's Result to TranscriptResult

Signed-off-by: Dave Lee <dave@gray101.com>

* remove pkg/concurrency - significant changes coming in split 2

Signed-off-by: Dave Lee <dave@gray101.com>

* fix comments

Signed-off-by: Dave Lee <dave@gray101.com>

* add list_model service as another low-risk service to get it out of the way

Signed-off-by: Dave Lee <dave@gray101.com>

* split backend config loader into a separate file from the actual config struct. No changes yet, just reduce cognitive load with smaller files of logical blocks

Signed-off-by: Dave Lee <dave@gray101.com>

* rename state.go ==> application.go

Signed-off-by: Dave Lee <dave@gray101.com>

* fix lost import?

Signed-off-by: Dave Lee <dave@gray101.com>

---------

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-29 17:42:37 +00:00
Ettore Di Giacinto
147440b39b docs: add reference for concurrent requests
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-29 18:31:50 +02:00
Ettore Di Giacinto
baff5ff8c2 models(gallery): add openvino models (#2184)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-29 18:17:47 +02:00
Ettore Di Giacinto
ea13863221 models(gallery): add llama3-32k (#2183)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-29 18:17:39 +02:00
cryptk
93ca56086e update go-tinydream to latest commit (#2182)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-29 15:17:09 +02:00
Dave
11c48a0004 fix: security scanner warning noise: error handlers part 2 (#2145)
check off a few more error handlers

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-29 15:11:42 +02:00
fakezeta
b7ea9602f5 fix: undefined symbol: iJIT_NotifyEvent in import torch ##2153 (#2179)
* add  extra index to Intel repository

* Update install.sh
2024-04-29 15:11:09 +02:00
Dave
982dc6a2bd fix: github bump_docs.sh regex to drop emoji and other text (#2180)
fix: bump_docs regex

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-29 03:55:29 +00:00
Sijia Lu
74d903acca [Documentations] Removed invalid numberings from troubleshooting mac (#2174)
* updated troubleshooting mac

Signed-off-by: LeonSijiaLu <leonsijialu1@gmail.com>

* prepend -

Signed-off-by: LeonSijiaLu <leonsijialu1@gmail.com>

---------

Signed-off-by: LeonSijiaLu <leonsijialu1@gmail.com>
2024-04-29 02:21:51 +00:00
LocalAI [bot]
5fef3b0ff1 ⬆️ Update ggerganov/whisper.cpp (#2177)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-28 22:32:45 +00:00
Ettore Di Giacinto
0674893649 Update .env
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-28 23:56:10 +02:00
Ettore Di Giacinto
e8d44447ad feat(gallery): support model deletion (#2173)
* feat(gallery): op now supports deletion of models

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Wire things with WebUI(WIP)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* minor improvements

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-28 23:42:46 +02:00
Ettore Di Giacinto
a24cd4fda0 docs: enhance and condense few sections (#2178)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-28 23:41:59 +02:00
LocalAI [bot]
01860674c4 ⬆️ Update ggerganov/llama.cpp (#2176)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-28 21:41:12 +00:00
cryptk
987b7ad42d feat: only keep the build artifacts from the grpc build (#2172)
* feat: only keep the build artifacts from the grpc build

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* feat: remove separate Cache GRPC build step

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* feat: remove docker inspect step, it is leftover from previous debugging

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

---------

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-28 19:24:16 +00:00
cryptk
21974fe1d3 fix: swap to WHISPER_CUDA per deprecation message from whisper.cpp (#2170)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-28 17:51:53 +00:00
Sijia Lu
26e1892521 Issue-1720: Updated Build on mac documentations (#2171)
updated build on macs documentation

Signed-off-by: LeonSijiaLu <leonsijialu1@gmail.com>
2024-04-28 19:38:02 +02:00
Ettore Di Giacinto
a78cd67737 Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-28 19:30:23 +02:00
Ettore Di Giacinto
5e243ceaeb docs: update gallery, add rerankers (#2166)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-28 15:54:15 +02:00
QuinnPiers
1a0a6f60a7 docs: update model-gallery.md with correct gallery file (#2163)
* Update model-gallery.md with correct gallery file

The readme points to a file that hasn't been updated in months, so when there are announcements about new models, users won't get them if they're pointing to the old file. Point to the updated files instead.

Signed-off-by: QuinnPiers <167640194+QuinnPiers@users.noreply.github.com>

* Update model-gallery.md

second pass with more understanding

Signed-off-by: QuinnPiers <167640194+QuinnPiers@users.noreply.github.com>

* Update model-gallery.md

Signed-off-by: QuinnPiers <167640194+QuinnPiers@users.noreply.github.com>

* Update model-gallery.md

Signed-off-by: QuinnPiers <167640194+QuinnPiers@users.noreply.github.com>

---------

Signed-off-by: QuinnPiers <167640194+QuinnPiers@users.noreply.github.com>
2024-04-28 12:34:15 +02:00
Ettore Di Giacinto
3179c019af Revert "⬆️ Update docs version mudler/LocalAI" (#2165)
* Revert "⬆️ Update docs version mudler/LocalAI (#2149)"

This reverts commit 56d843c263.

* Apply suggestions from code review

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-28 12:32:49 +02:00
Ettore Di Giacinto
a8089494fd models(gallery): add biomistral-7b (#2161)
* models(gallery): add biomistral-7b

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

* add <|end_of_text|> to llama3 as stopword

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-28 12:04:04 +02:00
Ettore Di Giacinto
a248ede222 models(gallery): add Undi95/Llama-3-LewdPlay-8B-evo-GGUF (#2160)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-28 12:02:50 +02:00
Ettore Di Giacinto
0f0ae13ad0 models(gallery): add poppy porpoise (#2158)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-28 12:01:01 +02:00
Ettore Di Giacinto
773d5d23d5 models(gallery): add solana (#2157)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-28 11:57:22 +02:00
LocalAI [bot]
c3982212f9 ⬆️ Update ggerganov/llama.cpp (#2159)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-27 21:32:43 +00:00
Ettore Di Giacinto
7e6bf6e7a1 ci: add auto-label rule for gallery in labeler.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-27 19:52:26 +02:00
cryptk
9fc0135991 feat: cleanup Dockerfile and make final image a little smaller (#2146)
* feat: cleanup Dockerfile and make final image a little smaller

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: add build-essential to final stage

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: more GRPC cache misses

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: correct for another cause of GRPC cache misses

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* feat: generate new GRPC cache automatically if needed

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: use new GRPC_MAKEFLAGS build arg in GRPC cache generation

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

---------

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-27 19:48:20 +02:00
Ettore Di Giacinto
164be58445 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-27 18:10:58 +02:00
Ettore Di Giacinto
1f8461767d models(gallery): add average_normie (#2155)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-27 17:59:31 +02:00
Ettore Di Giacinto
935f4c23f6 models(gallery): add SOVL (#2154)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-27 17:53:56 +02:00
Ettore Di Giacinto
4c97406f2b models(gallery): add Einstein v6.1 (#2152)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-27 12:30:15 +02:00
Ettore Di Giacinto
fb2a05ff43 feat(gallery): display job status also during navigation (#2151)
* feat(gallery): keep showing progress also when refreshing

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(intel-gpu): better defaults

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: make it thread-safe

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
2024-04-27 09:08:33 +02:00
LocalAI [bot]
030d555995 ⬆️ Update ggerganov/llama.cpp (#2150)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-27 02:18:28 +00:00
LocalAI [bot]
56d843c263 ⬆️ Update docs version mudler/LocalAI (#2149)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-26 23:03:10 +00:00
Dave
2dc1fa2474 fix: config_file_watcher.go - root all file reads for safety (#2144)
callHandler() now has all file access rooted within DynamicConfigDir

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-26 16:46:35 +00:00
fakezeta
c9451cb604 Bump oneapi-basekit, optimum and openvino (#2139)
* Bump oneapi-basekit, optimum and openvino

* Changed PERFORMANCE HINT to CUMULATIVE_THROUGHPUT

Minor latency change for first token but about 10-15% speedup on token generation.
2024-04-26 16:20:43 +02:00
Dave
006306b183 fix: use bluemonday as recommended by blackfriday (#2142)
use bluemonday as recommended by blackfriday

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-26 10:34:50 +02:00
Dave
2cd4936c99 fix: security scanner warning noise: error handlers part 1 (#2141)
first group of error handlers to reduce security scanner warning noise level

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-26 10:34:31 +02:00
Dave
44bc540bb5 fix: security scanner dislikes runCommand function arguments (#2140)
runCommand ==> ffmpegCommand. No functional changes, but makes it clear to the security scanner and future developers that this function cannot run arbitrary commands

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-26 10:33:12 +02:00
Ettore Di Giacinto
6b411ae212 models(gallery): add variants of llama3 70b (#2138)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-26 00:48:06 +02:00
Dave
eed285f9de fix: update langchainjs (#2136)
quick update of the langchainjs example to quiet down some dependency security scanner noise

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-26 00:47:35 +02:00
Dave
c8dd8e5ef4 fix: reduce chmod permissions for created files and directories (#2137)
quiet more security scanner issues: pass one of chmod restrictions to remove group and other permissions

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-26 00:47:06 +02:00
LocalAI [bot]
365ef92530 ⬆️ Update mudler/go-stable-diffusion (#2134)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-25 21:41:38 +00:00
LocalAI [bot]
5fceb876c4 ⬆️ Update ggerganov/llama.cpp (#2133)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-25 21:40:41 +00:00
cryptk
d98063e80e fix: api key polling was not using correct filepath (#2132)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-25 20:06:22 +00:00
Dave
45761f8be2 fix: yamlint warnings and errors (#2131)
fix yamlint warnings and errors

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-25 17:25:56 +00:00
LocalAI [bot]
4ae4e44506 feat(swagger): update swagger (#2128)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-25 16:10:08 +02:00
Ettore Di Giacinto
2ada13b1ad models(gallery): add more models (#2129)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-25 16:06:18 +02:00
Ettore Di Giacinto
5d170e9264 Update yaml-check.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-25 16:05:02 +02:00
Ettore Di Giacinto
1b0a64aa46 Update yaml-check.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-25 15:57:06 +02:00
Ettore Di Giacinto
aa8e1c63d5 Create yaml-check.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-25 15:52:52 +02:00
Ettore Di Giacinto
60690c9fc4 ci: add swagger pipeline
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-25 15:11:01 +02:00
dependabot[bot]
758b0c9042 build(deps): bump pydantic from 1.10.7 to 1.10.13 in /examples/langchain/langchainpy-localai-example in the pip group across 1 directory (#2125)
build(deps): bump pydantic

Bumps the pip group with 1 update in the /examples/langchain/langchainpy-localai-example directory: [pydantic](https://github.com/pydantic/pydantic).


Updates `pydantic` from 1.10.7 to 1.10.13
- [Release notes](https://github.com/pydantic/pydantic/releases)
- [Changelog](https://github.com/pydantic/pydantic/blob/main/HISTORY.md)
- [Commits](https://github.com/pydantic/pydantic/compare/v1.10.7...v1.10.13)

---
updated-dependencies:
- dependency-name: pydantic
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-04-25 06:49:29 +00:00
Ettore Di Giacinto
48d0aa2f6d models(gallery): add new models to the gallery (#2124)
* models: add reranker and parler-tts-mini

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: chatml im_end should not have a newline

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* models(noromaid): add

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* models(llama3): add 70b, add dolphin2.9

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* models(llama3): add unholy-8b

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* models(llama3): add therapyllama3, aura

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-25 01:28:02 +02:00
Ettore Di Giacinto
b664edde29 feat(rerankers): Add new backend, support jina rerankers API (#2121)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-25 00:19:02 +02:00
LocalAI [bot]
e16658b7ec ⬆️ Update ggerganov/llama.cpp (#2123)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-24 22:00:17 +00:00
LocalAI [bot]
d30280ed23 ⬆️ Update ggerganov/whisper.cpp (#2122)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-24 21:55:30 +00:00
Ettore Di Giacinto
9dbd217c59 docs(integrations): add Wave terminal
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-24 19:56:51 +02:00
Ettore Di Giacinto
23eac98b3c docs: update hot topics
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-24 19:43:07 +02:00
Ettore Di Giacinto
4fffc47e77 deps(llama.cpp): update, use better model for function call tests (#2119)
deps(llama.cpp): update

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-24 18:44:04 +02:00
LocalAI [bot]
d65214a234 ⬆️ Update docs version mudler/LocalAI (#2113)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-24 11:11:41 +02:00
jtwolfe
2fb34b00b5 Incl ocv pkg for diffsusers utils (#2115)
* Update diffusers.yml

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update diffusers-rocm.yml

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

---------

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>
2024-04-24 09:17:49 +02:00
fakezeta
f718a391c0 fix missing TrustRemoteCode in OpenVINO model load (#2114) 2024-04-24 00:45:37 +00:00
Ettore Di Giacinto
ac56ac2b2d fix(gallery): show a fake image if no there is no icon (#2111)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-23 20:10:58 +02:00
Ettore Di Giacinto
34c3f563fd fix(gallery): fixup dreamshaper icon
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-23 20:05:59 +02:00
Ettore Di Giacinto
d2bea6f9e3 fix(gallery): fixup hermes q8 entry
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-23 20:01:56 +02:00
Ettore Di Giacinto
a09fe1b9ba fix(gallery): set margin for images
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-23 20:00:20 +02:00
Ettore Di Giacinto
55778b35ff fix(gallery): move metadata where it belongs
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-23 19:47:45 +02:00
Ettore Di Giacinto
8b169f1dac feat(gallery): add llama3, hermes, phi-3, and others (#2110)
Also adds embeddings and llava models

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-23 19:35:45 +02:00
Ettore Di Giacinto
d344daf129 feat(models-ui): minor visual enhancements (#2109)
Show image if present, URL, tags, and better display buttons

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-23 18:43:25 +02:00
cryptk
3411e072ca Fix cleanup sonarqube findings (#2106)
* fix: update dockerignore and gitignore to exclude sonarqube work dir

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: remove useless equality check

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: use sonarqube Dockerfile recommendations

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

---------

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-23 18:43:00 +02:00
fakezeta
8e36fe9b6f Transformers Backend: max_tokens adherence to OpenAI API (#2108)
max token adherence to OpenAI API

improve adherence to OpenAI API when max tokens is omitted or equal to 0 in the request
2024-04-23 18:42:17 +02:00
Ettore Di Giacinto
0d8bf91699 feat: Galleries UI (#2104)
* WIP: add models to webui

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Register routes

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: don't cache models

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* small fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: fixup multiple installs (strings.Clone)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-23 09:22:58 +02:00
LocalAI [bot]
bd507678be ⬆️ Update docs version mudler/LocalAI (#2105)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-22 22:04:57 +00:00
Taikono-Himazin
b6f0e80d54 Update text-generation.md (#2095)
Signed-off-by: Taikono-Himazin <kazu@po.harenet.ne.jp>
2024-04-22 16:37:13 +02:00
jtwolfe
729378ca98 AMD/ROCm Documentation update + formatting fix (#2100)
* Update aio-images.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update aio-images.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update aio-images.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update GPU-acceleration.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update GPU-acceleration.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update GPU-acceleration.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update GPU-acceleration.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update GPU-acceleration.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

* Update GPU-acceleration.md

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>

---------

Signed-off-by: jtwolfe <jamie.t.wolfe@gmail.com>
2024-04-22 15:47:51 +02:00
Ikko Eltociear Ashimine
220958a87c fix: typo in models.go (#2099) 2024-04-22 04:34:59 +00:00
Ettore Di Giacinto
f3f6535aad fix: rename fiber entrypoint from http/api to http/app (#2096)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Dave <dave@gray101.com>
2024-04-21 22:39:28 +02:00
Dave
228bc4903f fix: action-tmate detached (#2092)
connect-timeout-seconds works best with `detached: true`

Signed-off-by: Dave <dave@gray101.com>
2024-04-21 22:39:17 +02:00
LocalAI [bot]
38c9abed8b ⬆️ Update ggerganov/llama.cpp (#2089)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-21 16:35:30 +00:00
fakezeta
66b002458d Transformer Backend: Implementing use_tokenizer_template and stop_prompts options (#2090)
* fix regression #1971

fixes regression #1971 introduced by intel_extension_for_transformers==1.4

* UseTokenizerTemplate and StopPrompt

Implementation of use_tokenizer_template and stopwords options
2024-04-21 16:20:25 +00:00
Ettore Di Giacinto
39814cab32 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-21 16:46:13 +02:00
Ettore Di Giacinto
180cd4ccda fix(llama.cpp-ggml): fixup max_tokens for old backend (#2094)
fix(llama.cpp-ggml): set 0 as default for `max_tokens`

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-21 16:34:00 +02:00
Ettore Di Giacinto
284ad026b1 refactor(routes): split routes registration (#2077)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-21 01:19:57 +02:00
Ettore Di Giacinto
afa1bca1e3 fix(llama.cpp): set -1 as default for max tokens (#2087)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-20 20:20:10 +02:00
Taikono-Himazin
03adc1f60d Add tensor_parallel_size setting to vllm setting items (#2085)
Signed-off-by: Taikono-Himazin <kazu@po.harenet.ne.jp>
2024-04-20 14:37:02 +00:00
Ettore Di Giacinto
b319ed58b0 models(gallery): add gallery (#2078)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-20 15:22:54 +02:00
cryptk
8d30b39811 feat: fiber logs with zerlog and add trace level (#2082)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-20 10:43:37 +02:00
Dave
1038f7469c fix: action-tmate: use connect-timeout-sections and limit-access-to-actor (#2083)
fix for action-tmate: connect-timeout-sections and limit-access-to-actor

Signed-off-by: Dave Lee <dave@gray101.com>
2024-04-20 08:42:02 +00:00
cryptk
b9e7708643 feat: enable polling configs for systems with broken fsnotify (docker volumes on windows) (#2081)
* feat: enable polling configs for systems with broken fsnotify (docker volumes on windows)

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: update logging to make it clear that the config file is being polled

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

---------

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-19 19:31:15 -05:00
LocalAI [bot]
1e37101930 ⬆️ Update ggerganov/llama.cpp (#2080)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-20 00:05:16 +00:00
Ettore Di Giacinto
b2772509b4 models(llama3): add llama3 to embedded models (#2074)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-19 18:23:44 +02:00
Ettore Di Giacinto
27ec84827c refactor(template): isolate and add tests (#2069)
* refactor(template): isolate and add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: Dave <dave@gray101.com>
Co-authored-by: Dave <dave@gray101.com>
2024-04-19 02:40:18 +00:00
cryptk
852316c5a6 fix: move the GRPC cache generation workflow into it's own concurrency group (#2071)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-18 20:52:34 -04:00
LocalAI [bot]
e9448005a5 ⬆️ Update ggerganov/llama.cpp (#2051)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2024-04-18 21:30:55 +00:00
Ettore Di Giacinto
bbea62b907 feat(functions): support models with no grammar, add tests (#2068)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-18 22:43:12 +02:00
cryptk
13012cfa70 feat: better control of GRPC docker cache (#2070)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-18 16:19:36 -04:00
dependabot[bot]
8f2681f904 build(deps): bump aiohttp from 3.9.2 to 3.9.4 in /examples/langchain/langchainpy-localai-example in the pip group across 1 directory (#2067)
build(deps): bump aiohttp

Bumps the pip group with 1 update in the /examples/langchain/langchainpy-localai-example directory: [aiohttp](https://github.com/aio-libs/aiohttp).


Updates `aiohttp` from 3.9.2 to 3.9.4
- [Release notes](https://github.com/aio-libs/aiohttp/releases)
- [Changelog](https://github.com/aio-libs/aiohttp/blob/master/CHANGES.rst)
- [Commits](https://github.com/aio-libs/aiohttp/compare/v3.9.2...v3.9.4)

---
updated-dependencies:
- dependency-name: aiohttp
  dependency-type: direct:production
  dependency-group: pip
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-04-18 17:17:33 +00:00
Ettore Di Giacinto
f9c75d4878 tests: add template tests (#2063)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-18 10:57:24 +02:00
cryptk
502c1eedaa feat: refactor the dynamic json configs for api_keys and external_backends (#2055)
* feat: refactor the dynamic json configs for api_keys and external_backends

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

* fix: remove commented code

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>

---------

Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-18 03:21:55 +00:00
cryptk
e9f090257c fix: adjust some sources names to match the naming of their repositories (#2061)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-18 01:59:05 +00:00
224 changed files with 11681 additions and 4166 deletions

@@ -5,4 +5,7 @@ models
 examples/chatbot-ui/models
 examples/rwkv/models
 examples/**/models
-Dockerfile*
+Dockerfile*
+# SonarQube
+.scannerwork

.env

@@ -10,7 +10,7 @@
 #
 ## Define galleries.
 ## models will to install will be visible in `/models/available`
-# LOCALAI_GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
+# LOCALAI_GALLERIES=[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]
 ## CORS settings
 # LOCALAI_CORS=true
@@ -86,4 +86,4 @@
 # LOCALAI_WATCHDOG_BUSY=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered busy
-# LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
+# LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
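
For an existing deployment, the same gallery switch can also be made through the environment instead of editing .env; a minimal sketch using the exact value introduced above (the startup command is only illustrative, adjust it to however you normally launch LocalAI):

# Uncommented version of the new default from the .env change above
export LOCALAI_GALLERIES='[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]'
# Start LocalAI as usual; the gallery endpoints then read from the in-tree index
./local-ai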

@@ -2,6 +2,6 @@
 set -xe
 REPO=$1
-LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.name')
+LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.tag_name')
 cat <<< $(jq ".version = \"$LATEST_TAG\"" docs/data/version.json) > docs/data/version.json
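
The switch from `.name` to `.tag_name` in this docs version-bump script matters because a release title can carry emoji and free-form text, while the tag is a plain version string that fits docs/data/version.json; a quick way to compare the two fields (mudler/LocalAI used purely as an example repo):

# Release title: free-form, may include emoji and extra wording
curl -s "https://api.github.com/repos/mudler/LocalAI/releases/latest" | jq -r '.name'
# Git tag: a clean version string such as v2.x.y
curl -s "https://api.github.com/repos/mudler/LocalAI/releases/latest" | jq -r '.tag_name'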

.github/checksum_checker.sh vendored Normal file

@@ -0,0 +1,111 @@
#!/bin/bash
# This scripts needs yq and huggingface_hub to be installed
# to install hugingface_hub run pip install huggingface_hub

# Path to the input YAML file
input_yaml=$1

# Function to download file and check checksum using Python
function check_and_update_checksum() {
    model_name="$1"
    file_name="$2"
    uri="$3"
    old_checksum="$4"
    idx="$5"

    # Download the file and calculate new checksum using Python
    new_checksum=$(python3 -c "
import hashlib
from huggingface_hub import hf_hub_download
import requests
import sys
import os

uri = '$uri'
file_name = uri.split('/')[-1]

# Function to parse the URI and determine download method
# Function to parse the URI and determine download method
def parse_uri(uri):
    if uri.startswith('huggingface://'):
        repo_id = uri.split('://')[1]
        return 'huggingface', repo_id.rsplit('/', 1)[0]
    elif 'huggingface.co' in uri:
        parts = uri.split('/resolve/')
        if len(parts) > 1:
            repo_path = parts[0].split('https://huggingface.co/')[-1]
            return 'huggingface', repo_path
    return 'direct', uri

def calculate_sha256(file_path):
    sha256_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for byte_block in iter(lambda: f.read(4096), b''):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

download_type, repo_id_or_url = parse_uri(uri)

# Decide download method based on URI type
if download_type == 'huggingface':
    try:
        file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
    except Exception as e:
        print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
        sys.exit(2)
else:
    response = requests.get(repo_id_or_url)
    if response.status_code == 200:
        with open(file_name, 'wb') as f:
            f.write(response.content)
        file_path = file_name
    elif response.status_code == 404:
        print(f'File not found: {response.status_code}', file=sys.stderr)
        sys.exit(2)
    else:
        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
        sys.exit(1)

print(calculate_sha256(file_path))

# Clean up the downloaded file
os.remove(file_path)
")

    if [[ "$new_checksum" == "" ]]; then
        echo "Error calculating checksum for $file_name. Skipping..."
        return
    fi

    echo "Checksum for $file_name: $new_checksum"

    # Compare and update the YAML file if checksums do not match
    result=$?
    if [[ $result -eq 2 ]]; then
        echo "File not found, deleting entry for $file_name..."
        # yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\"))" "$input_yaml"
    elif [[ "$old_checksum" != "$new_checksum" ]]; then
        echo "Checksum mismatch for $file_name. Updating..."
        yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\").sha256)" "$input_yaml"
        yq eval -i "(.[$idx].files[] | select(.filename == \"$file_name\")).sha256 = \"$new_checksum\"" "$input_yaml"
    elif [[ $result -ne 0 ]]; then
        echo "Error downloading file $file_name. Skipping..."
    else
        echo "Checksum match for $file_name. No update needed."
    fi
}

# Read the YAML and process each file
len=$(yq eval '. | length' "$input_yaml")
for ((i=0; i<$len; i++))
do
    name=$(yq eval ".[$i].name" "$input_yaml")
    files_len=$(yq eval ".[$i].files | length" "$input_yaml")
    for ((j=0; j<$files_len; j++))
    do
        filename=$(yq eval ".[$i].files[$j].filename" "$input_yaml")
        uri=$(yq eval ".[$i].files[$j].uri" "$input_yaml")
        checksum=$(yq eval ".[$i].files[$j].sha256" "$input_yaml")
        echo "Checking model $name, file $filename. URI = $uri, Checksum = $checksum"
        check_and_update_checksum "$name" "$filename" "$uri" "$checksum" "$i"
    done
done
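
A minimal sketch of running this checker locally, assuming yq v4 and python3 are already on the PATH (the pip packages below mirror what the embedded Python snippet imports and what the CI job further down installs):

# Python-side dependencies used by the embedded python3 -c snippet
pip install huggingface_hub requests
# Check every file entry in the gallery; mismatching sha256 fields are rewritten in place
bash .github/checksum_checker.sh gallery/index.yaml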

.github/labeler.yml vendored

@@ -8,6 +8,11 @@ kind/documentation:
   - changed-files:
     - any-glob-to-any-file: '*.md'
+area/ai-model:
+  - any:
+    - changed-files:
+      - any-glob-to-any-file: 'gallery/*'
 examples:
   - any:
     - changed-files:
@@ -16,4 +21,4 @@ examples:
 ci:
   - any:
     - changed-files:
-      - any-glob-to-any-file: '.github/*'
+      - any-glob-to-any-file: '.github/*'

.github/workflows/checksum_checker.yaml vendored Normal file

@@ -0,0 +1,47 @@
name: Check if checksums are up-to-date
on:
  schedule:
    - cron: 0 20 * * *
  workflow_dispatch:

jobs:
  checksum_check:
    runs-on: arc-runner-set
    steps:
      - name: Force Install GIT latest
        run: |
          sudo apt-get update \
            && sudo apt-get install -y software-properties-common \
            && sudo apt-get update \
            && sudo add-apt-repository -y ppa:git-core/ppa \
            && sudo apt-get update \
            && sudo apt-get install -y git
      - uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y pip wget
          sudo pip install --upgrade pip
          pip install huggingface_hub
      - name: 'Setup yq'
        uses: dcarbone/install-yq-action@v1.1.1
        with:
          version: 'v4.43.1'
          download-compressed: true
          force: true
      - name: Checksum checker 🔧
        run: |
          export HF_HOME=/hf_cache
          sudo mkdir /hf_cache
          sudo chmod 777 /hf_cache
          bash .github/checksum_checker.sh gallery/index.yaml
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Checksum updates in gallery/index.yaml'
          title: 'models(gallery): :arrow_up: update checksum'
          branch: "update/checksum"
          body: Updating checksums in gallery/index.yaml
          signoff: true
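
Besides the nightly cron, the `workflow_dispatch:` trigger means the check can be started on demand; a sketch with the GitHub CLI, assuming `gh` is installed and authenticated against a repository that carries this workflow:

# Kick off the checksum check manually and watch it run
gh workflow run checksum_checker.yaml
gh run watch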

@@ -14,7 +14,7 @@ jobs:
     steps:
       - name: Dependabot metadata
         id: metadata
-        uses: dependabot/fetch-metadata@v2.0.0
+        uses: dependabot/fetch-metadata@v2.1.0
         with:
           github-token: "${{ secrets.GITHUB_TOKEN }}"
           skip-commit-verification: true

@@ -0,0 +1,94 @@
name: 'generate and publish GRPC docker caches'
on:
workflow_dispatch:
push:
branches:
- master
concurrency:
group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
cancel-in-progress: true
jobs:
generate_caches:
strategy:
matrix:
include:
- grpc-base-image: ubuntu:22.04
runs-on: 'ubuntu-latest'
platforms: 'linux/amd64'
runs-on: ${{matrix.runs-on}}
steps:
- name: Release space from worker
if: matrix.runs-on == 'ubuntu-latest'
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get remove -y microsoft-edge-stable || true
sudo apt-get remove -y firefox || true
sudo apt-get remove -y powershell || true
sudo apt-get remove -y r-base-core || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /opt/ghc || true
sudo rm -rf "/usr/local/share/boost" || true
sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
df -h
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Checkout
uses: actions/checkout@v4
- name: Cache GRPC
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
# The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
# This means that even the MAKEFLAGS have to be an EXACT match.
# If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
build-args: |
GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.63.0
context: .
file: ./Dockerfile
cache-to: type=gha,ignore-error=true
cache-from: type=gha
target: grpc
platforms: ${{ matrix.platforms }}
push: false
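The comments above state the key constraint: the cached grpc layer is only reused when every build-arg matches exactly. Outside of Actions the type=gha cache is unavailable, but the same stage can be built and cached locally along these lines (sketch):

# Build only the grpc stage with the exact build-args the CI cache job uses (illustrative).
docker buildx build \
  --target grpc \
  --build-arg GRPC_BASE_IMAGE=ubuntu:22.04 \
  --build-arg "GRPC_MAKEFLAGS=--jobs=4 --output-sync=target" \
  --build-arg GRPC_VERSION=v1.63.0 \
  --cache-to type=local,dest=/tmp/localai-grpc-cache \
  --cache-from type=local,src=/tmp/localai-grpc-cache \
  -f Dockerfile .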


@@ -22,6 +22,7 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -60,13 +61,15 @@ jobs:
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
tag-suffix: 'sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
@@ -85,6 +88,7 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -102,11 +106,12 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
tag-suffix: 'sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
@@ -122,4 +127,4 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"


@@ -26,6 +26,7 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
@@ -128,7 +129,8 @@ jobs:
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
@@ -139,13 +141,15 @@ jobs:
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
@@ -157,7 +161,8 @@ jobs:
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
@@ -170,7 +175,8 @@ jobs:
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-core'
ffmpeg: 'false'
image-type: 'core'
@@ -179,7 +185,8 @@ jobs:
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
@@ -188,7 +195,8 @@ jobs:
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
@@ -197,7 +205,8 @@ jobs:
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
@@ -209,7 +218,8 @@ jobs:
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
@@ -218,7 +228,8 @@ jobs:
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
@@ -236,6 +247,7 @@ jobs:
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
@@ -258,7 +270,7 @@ jobs:
aio: "-aio-cpu"
latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=5 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -269,7 +281,7 @@ jobs:
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=5 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -280,7 +292,7 @@ jobs:
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=5 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -291,7 +303,7 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -302,4 +314,4 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"


@@ -6,6 +6,10 @@ on:
inputs:
base-image:
description: 'Base image'
required: true
type: string
grpc-base-image:
description: 'GRPC Base image, must be a compatible image with base-image'
required: false
default: ''
type: string
@@ -57,7 +61,7 @@ on:
makeflags:
description: 'Make Flags'
required: false
default: '--jobs=3 --output-sync=target'
default: '--jobs=4 --output-sync=target'
type: string
aio:
description: 'AIO Image Name'
@@ -197,29 +201,14 @@ jobs:
username: ${{ secrets.quayUsername }}
password: ${{ secrets.quayPassword }}
- name: Cache GRPC
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
MAKEFLAGS=${{ inputs.makeflags }}
GRPC_VERSION=v1.58.0
context: .
file: ./Dockerfile
cache-from: type=gha
cache-to: type=gha,ignore-error=true
target: grpc
platforms: ${{ inputs.platforms }}
push: false
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Build and push
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
# The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
# This means that even the MAKEFLAGS have to be an EXACT match.
# If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
# This is why some build args like GRPC_VERSION and MAKEFLAGS are hardcoded
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
@@ -227,6 +216,9 @@ jobs:
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.63.0
MAKEFLAGS=${{ inputs.makeflags }}
context: .
file: ./Dockerfile
@@ -236,14 +228,6 @@ jobs:
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Inspect image
if: github.event_name != 'pull_request'
run: |
docker pull localai/localai:${{ steps.meta.outputs.version }}
docker image inspect localai/localai:${{ steps.meta.outputs.version }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
docker image inspect quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
- name: Build and push AIO image
if: inputs.aio != ''
uses: docker/build-push-action@v5


@@ -5,7 +5,7 @@ on:
- pull_request
env:
GRPC_VERSION: v1.58.0
GRPC_VERSION: v1.63.0
permissions:
contents: write
@@ -19,12 +19,8 @@ jobs:
strategy:
matrix:
include:
- build: 'avx2'
- build: ''
defines: ''
- build: 'avx'
defines: '-DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_AVX512=ON'
- build: 'cuda12'
defines: ''
- build: 'cuda11'
@@ -74,7 +70,6 @@ jobs:
- name: Build
id: build
env:
CMAKE_ARGS: "${{ matrix.defines }}"
BUILD_ID: "${{ matrix.build }}"
run: |
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
@@ -124,63 +119,7 @@ jobs:
name: stablediffusion
path: release/
build-macOS:
strategy:
matrix:
include:
- build: 'avx2'
defines: ''
- build: 'avx'
defines: '-DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_AVX512=ON'
runs-on: macOS-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
brew install protobuf grpc
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
- name: Build
id: build
env:
CMAKE_ARGS: "${{ matrix.defines }}"
BUILD_ID: "${{ matrix.build }}"
run: |
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
export PATH=$PATH:$GOPATH/bin
make dist
- uses: actions/upload-artifact@v4
with:
name: LocalAI-MacOS-${{ matrix.build }}
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
build-macOS-arm64:
strategy:
matrix:
include:
- build: 'avx2'
defines: ''
- build: 'avx'
defines: '-DLLAMA_AVX2=OFF'
- build: 'avx512'
defines: '-DLLAMA_AVX512=ON'
runs-on: macos-14
steps:
- name: Clone
@@ -198,9 +137,6 @@ jobs:
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
- name: Build
id: build
env:
CMAKE_ARGS: "${{ matrix.defines }}"
BUILD_ID: "${{ matrix.build }}"
run: |
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
@@ -208,7 +144,7 @@ jobs:
make dist
- uses: actions/upload-artifact@v4
with:
name: LocalAI-MacOS-arm64-${{ matrix.build }}
name: LocalAI-MacOS-arm64
path: release/
- name: Release
uses: softprops/action-gh-release@v2


@@ -34,7 +34,7 @@ jobs:
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools
pip install --user grpcio-tools==1.63.0
sudo rm -rfv /usr/bin/conda || true
@@ -64,7 +64,7 @@ jobs:
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools
pip install --user grpcio-tools==1.63.0
sudo rm -rfv /usr/bin/conda || true
@@ -74,6 +74,37 @@ jobs:
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
tests-rerankers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.63.0
sudo rm -rfv /usr/bin/conda || true
- name: Test rerankers
run: |
export PATH=$PATH:/opt/conda/bin
make --jobs=5 --output-sync=target -C backend/python/rerankers
make --jobs=5 --output-sync=target -C backend/python/rerankers test
tests-diffusers:
runs-on: ubuntu-latest
steps:
@@ -94,7 +125,7 @@ jobs:
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools
pip install --user grpcio-tools==1.63.0
sudo rm -rfv /usr/bin/conda || true
@@ -124,7 +155,7 @@ jobs:
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools
pip install --user grpcio-tools==1.63.0
sudo rm -rfv /usr/bin/conda || true
@@ -154,7 +185,7 @@ jobs:
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools
pip install --user grpcio-tools==1.63.0
sudo rm -rfv /usr/bin/conda || true
@@ -186,7 +217,7 @@ jobs:
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# pip install --user grpcio-tools
# pip install --user grpcio-tools==1.63.0
# sudo rm -rfv /usr/bin/conda || true
@@ -258,7 +289,7 @@ jobs:
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# pip install --user grpcio-tools
# pip install --user grpcio-tools==1.63.0
# sudo rm -rfv /usr/bin/conda || true
@@ -291,7 +322,7 @@ jobs:
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# pip install --user grpcio-tools
# pip install --user grpcio-tools==1.63.0
# sudo rm -rfv /usr/bin/conda || true
# - name: Test vllm
# run: |
@@ -318,7 +349,7 @@ jobs:
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools
pip install --user grpcio-tools==1.63.0
sudo rm -rfv /usr/bin/conda || true
- name: Test vall-e-x
run: |
@@ -345,7 +376,7 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
pip install --user grpcio-tools
pip install --user grpcio-tools==1.63.0
sudo rm -rfv /usr/bin/conda || true
- name: Test coqui


@@ -10,7 +10,7 @@ on:
- '*'
env:
GRPC_VERSION: v1.58.0
GRPC_VERSION: v1.63.0
concurrency:
group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -123,7 +123,9 @@ jobs:
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
tests-aio-container:
runs-on: ubuntu-latest
@@ -176,7 +178,9 @@ jobs:
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
tests-apple:
runs-on: macOS-14
@@ -199,7 +203,7 @@ jobs:
- name: Dependencies
run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
pip install --user grpcio-tools
pip install --user grpcio-tools==1.63.0
- name: Test
run: |
export C_INCLUDE_PATH=/usr/local/include
@@ -211,4 +215,6 @@ jobs:
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
connect-timeout-seconds: 180
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true

.github/workflows/update_swagger.yaml

@@ -0,0 +1,31 @@
name: Update swagger
on:
schedule:
- cron: 0 20 * * *
workflow_dispatch:
jobs:
swagger:
strategy:
fail-fast: false
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: 'stable'
- run: |
go install github.com/swaggo/swag/cmd/swag@latest
- name: Bump swagger 🔧
run: |
make swagger
- name: Create Pull Request
uses: peter-evans/create-pull-request@v6
with:
token: ${{ secrets.UPDATE_BOT_TOKEN }}
push-to-fork: ci-forks/LocalAI
commit-message: 'feat(swagger): update swagger'
title: 'feat(swagger): update swagger'
branch: "update/swagger"
body: Update swagger
signoff: true
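The same refresh can be done locally before sending a change, which is essentially what this scheduled job automates (sketch):

# Regenerate the swagger docs the way the job above does (illustrative).
go install github.com/swaggo/swag/cmd/swag@latest
make swagger
git diff swagger/   # inspect the regenerated spec before committing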

.github/workflows/yaml-check.yml

@@ -0,0 +1,18 @@
name: 'Yamllint GitHub Actions'
on:
- pull_request
jobs:
yamllint:
name: 'Yamllint'
runs-on: ubuntu-latest
steps:
- name: 'Checkout'
uses: actions/checkout@master
- name: 'Yamllint'
uses: karancode/yamllint-github-action@master
with:
yamllint_file_or_dir: 'gallery'
yamllint_strict: false
yamllint_comment: true
env:
GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.gitignore

@@ -44,3 +44,6 @@ prepare
*.pb.go
*pb2.py
*pb2_grpc.py
# SonarQube
.scannerwork

.yamllint

@@ -0,0 +1,4 @@
extends: default
rules:
line-length: disable
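With this config in the repository root, yamllint's line-length rule is disabled; the gallery lint from the workflow above can be reproduced locally along these lines (sketch; non-strict like the CI job, so findings are reported without failing the run):

# Lint the gallery YAML the same way the CI job does (illustrative).
pip install yamllint
yamllint -c .yamllint gallery/ || true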


@@ -1,41 +1,43 @@
ARG IMAGE_TYPE=extras
ARG BASE_IMAGE=ubuntu:22.04
ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
# extras or core
FROM ${BASE_IMAGE} as requirements-core
# The requirements-core target is common to all images. Nothing should be added to requirements-core unless every single build will use it.
FROM ${BASE_IMAGE} AS requirements-core
USER root
ARG GO_VERSION=1.21.7
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=11
ARG CUDA_MINOR_VERSION=7
ARG TARGETARCH
ARG TARGETVARIANT
ENV BUILD_TYPE=${BUILD_TYPE}
ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
ARG GO_TAGS="stablediffusion tinydream tts"
RUN apt-get update && \
apt-get install -y ca-certificates curl python3-pip unzip && apt-get clean
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
git \
python3-pip \
python-is-python3 \
unzip && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
pip install --upgrade pip
# Install Go
RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
ENV PATH $PATH:/usr/local/go/bin
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH $PATH:/root/go/bin:/usr/local/go/bin
# Install grpc compilers
ENV PATH $PATH:/root/go/bin
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
# Install protobuf (the version in 22.04 is too old)
RUN curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools
@@ -46,16 +48,6 @@ RUN update-ca-certificates
RUN echo "Target Architecture: $TARGETARCH"
RUN echo "Target Variant: $TARGETVARIANT"
# CuBLAS requirements
RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
apt-get install -y software-properties-common && \
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && apt-get clean \
; fi
# Cuda
ENV PATH /usr/local/cuda/bin:${PATH}
@@ -63,10 +55,12 @@ ENV PATH /usr/local/cuda/bin:${PATH}
ENV PATH /opt/rocm/bin:${PATH}
# OpenBLAS requirements and stable diffusion
RUN apt-get install -y \
libopenblas-dev \
libopencv-dev \
&& apt-get clean
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libopenblas-dev \
libopencv-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set up OpenCV
RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
@@ -79,57 +73,126 @@ RUN test -n "$TARGETARCH" \
###################################
###################################
FROM requirements-core as requirements-extras
# The requirements-extras target is for any builds with IMAGE_TYPE=extras. Nothing should be added to this target unless every IMAGE_TYPE=extras build will use it
FROM requirements-core AS requirements-extras
RUN apt install -y gpg && \
RUN apt-get update && \
apt-get install -y --no-install-recommends gpg && \
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
apt-get update && \
apt-get install -y conda && apt-get clean
apt-get install -y --no-install-recommends \
conda && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
ENV PATH="/root/.cargo/bin:${PATH}"
RUN apt-get install -y python3-pip && apt-get clean
RUN pip install --upgrade pip
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
RUN apt-get install -y espeak-ng espeak && apt-get clean
RUN apt-get update && \
apt-get install -y --no-install-recommends \
espeak-ng \
espeak && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN if [ ! -e /usr/bin/python ]; then \
ln -s /usr/bin/python3 /usr/bin/python \
###################################
###################################
# The requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here.
# This target will be built on top of requirements-core or requirements-extras as determined by the IMAGE_TYPE build-arg
FROM requirements-${IMAGE_TYPE} AS requirements-drivers
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=11
ARG CUDA_MINOR_VERSION=7
ENV BUILD_TYPE=${BUILD_TYPE}
# CuBLAS requirements
RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common && \
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \
; fi
###################################
###################################
FROM ${BASE_IMAGE} as grpc
# The grpc target does one thing: it builds and installs GRPC. This is in its own layer so that it can be effectively cached by CI.
# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
FROM ${GRPC_BASE_IMAGE} AS grpc
ARG MAKEFLAGS
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
ARG GRPC_VERSION=v1.58.0
ENV MAKEFLAGS=${MAKEFLAGS}
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
WORKDIR /build
RUN apt-get update && \
apt-get install -y build-essential cmake git && \
apt-get install -y --no-install-recommends \
ca-certificates \
build-essential \
cmake \
git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc
RUN cd grpc && \
mkdir -p cmake/build && \
cd cmake/build && \
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
make
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
# and running make install in the target container
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
mkdir -p /build/grpc/cmake/build && \
cd /build/grpc/cmake/build && \
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
make && \
make install && \
rm -rf /build
###################################
###################################
FROM requirements-${IMAGE_TYPE} as builder
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
# Adjustments to the build process should likely be made here.
FROM requirements-drivers AS builder
ARG GO_TAGS="stablediffusion tts"
ARG GRPC_BACKENDS
@@ -148,39 +211,36 @@ COPY . .
COPY .git .
RUN echo "GO_TAGS: $GO_TAGS"
RUN apt-get update && \
apt-get install -y build-essential cmake git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN make prepare
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \
apt-get install -y libclblast-dev && \
apt-get clean \
; fi
# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below,
# but that will also bring in a newer version of absl, which stablediffusion cannot compile with. This version of protoc is only
# here so that we can generate the grpc code for the stablediffusion build
RUN curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
# stablediffusion does not tolerate a newer version of abseil, build it first
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
COPY --from=grpc /build/grpc ./grpc/
RUN cd /build/grpc/cmake/build && make install
# Install the pre-built GRPC
COPY --from=grpc /opt/grpc /usr/local
# Rebuild with defaults backends
WORKDIR /build
RUN make build
RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
; fi
###################################
###################################
FROM requirements-${IMAGE_TYPE}
# This is the final target. The result of this target will be the image uploaded to the registry.
# If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
FROM requirements-drivers
ARG FFMPEG
ARG BUILD_TYPE
@@ -201,21 +261,13 @@ ENV PIP_CACHE_PURGE=true
# Add FFmpeg
RUN if [ "${FFMPEG}" = "true" ]; then \
apt-get install -y ffmpeg && apt-get clean \
apt-get update && \
apt-get install -y --no-install-recommends \
ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
# Add OpenCL
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \
apt-get install -y libclblast1 && \
apt-get clean \
; fi
RUN apt-get update && \
apt-get install -y cmake git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
WORKDIR /build
# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
@@ -225,9 +277,9 @@ WORKDIR /build
COPY . .
COPY --from=builder /build/sources ./sources/
COPY --from=grpc /build/grpc ./grpc/
COPY --from=grpc /opt/grpc /usr/local
RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf /build/grpc
RUN make prepare-sources
# Copy the binary
COPY --from=builder /build/local-ai ./
@@ -257,6 +309,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/sentencetransformers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/rerankers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/transformers \
; fi
@@ -287,7 +342,7 @@ RUN mkdir -p /build/models
# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
VOLUME /build/models
EXPOSE 8080
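For a local build against the reworked stages, something along these lines should work for a core image (illustrative; other build-args such as BUILD_TYPE and FFMPEG keep their defaults, and GRPC_BASE_IMAGE falls back to BASE_IMAGE when it is not set):

# Local core image build exercising the new grpc stage (sketch; the tag is arbitrary).
docker build \
  --build-arg IMAGE_TYPE=core \
  --build-arg BASE_IMAGE=ubuntu:22.04 \
  --build-arg GRPC_BASE_IMAGE=ubuntu:22.04 \
  --build-arg GO_TAGS=none \
  -t local-ai:core-dev .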

Makefile

@@ -5,7 +5,7 @@ BINARY_NAME=local-ai
# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=7593639ce335e8d7f89aa9a54d616951f273af60
CPPLLAMA_VERSION?=c12452c7aec8a02264afc00196a13caa591a13ac
# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
# whisper.cpp version
WHISPER_CPP_VERSION?=b0c3cbf2e851cf232e432b590dcc514a689ec028
WHISPER_CPP_VERSION?=73d13ad19a8c9c4da4f405088a85169b1a171e66
# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
@@ -25,10 +25,10 @@ BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
# stablediffusion version
STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
# tinydream version
TINYDREAM_VERSION?=22a12a4bc0ac5455856f28f3b771331a551a4293
TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
@@ -99,7 +99,7 @@ endif
ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1
export WHISPER_CUBLAS=1
export WHISPER_CUDA=1
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
endif
@@ -152,9 +152,11 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
OPTIONAL_GRPC+=backend-assets/grpc/piper
endif
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-noavx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
@@ -179,20 +181,20 @@ endif
all: help
## BERT embeddings
sources/go-bert:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
sources/go-bert.cpp:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert.cpp
cd sources/go-bert.cpp && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
sources/go-bert/libgobert.a: sources/go-bert
$(MAKE) -C sources/go-bert libgobert.a
sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
$(MAKE) -C sources/go-bert.cpp libgobert.a
## go-llama-ggml
sources/go-llama-ggml:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
## go-llama.cpp
sources/go-llama.cpp:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama.cpp
cd sources/go-llama.cpp && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
## go-piper
sources/go-piper:
@@ -211,12 +213,12 @@ sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
## RWKV
sources/go-rwkv:
git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
sources/go-rwkv.cpp:
git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv.cpp
cd sources/go-rwkv.cpp && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
sources/go-rwkv/librwkv.a: sources/go-rwkv
cd sources/go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
cd sources/go-rwkv.cpp && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
## stable diffusion
sources/go-stable-diffusion:
@@ -236,23 +238,24 @@ sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
## whisper
sources/whisper.cpp:
git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
git clone https://github.com/ggerganov/whisper.cpp sources/whisper.cpp
cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && make libwhisper.a
cd sources/whisper.cpp && $(MAKE) libwhisper.a
get-sources: sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream
replace:
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
@@ -271,12 +274,12 @@ prepare-sources: get-sources replace
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama-ggml clean
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C sources/go-rwkv clean
$(MAKE) -C sources/go-rwkv.cpp clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-stable-diffusion clean
$(MAKE) -C sources/go-bert clean
$(MAKE) -C sources/go-bert.cpp clean
$(MAKE) -C sources/go-piper clean
$(MAKE) -C sources/go-tiny-dream clean
$(MAKE) build
@@ -292,6 +295,7 @@ clean: ## Remove build related file
rm -rf backend-assets/*
$(MAKE) -C backend/cpp/grpc clean
$(MAKE) -C backend/cpp/llama clean
rm -rf backend/cpp/llama-* || true
$(MAKE) dropreplace
$(MAKE) protogen-clean
rmdir pkg/grpc/proto || true
@@ -301,9 +305,6 @@ clean-tests:
rm -rf test-dir
rm -rf core/http/backend-assets
halt-backends: ## Used to clean up stray backends sometimes left running when debugging manually
ps | grep 'backend-assets/grpc/' | awk '{print $$1}' | xargs -I {} kill -9 {}
## Build:
build: prepare backend-assets grpcs ## Build the project
$(info ${GREEN}I local-ai build info:${RESET})
@@ -313,14 +314,19 @@ build: prepare backend-assets grpcs ## Build the project
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS=backend-assets/grpc/llama-cpp GO_TAGS=none $(MAKE) build
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build
build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
dist: build
mkdir -p release
# if BUILD_ID is empty, then we don't append it to the binary name
ifeq ($(BUILD_ID),)
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(OS)-$(ARCH)
else
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
endif
osx-signed: build
codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"
@@ -368,13 +374,13 @@ run-e2e-image:
run-e2e-aio:
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
test-e2e:
@echo 'Running e2e tests'
BUILD_TYPE=$(BUILD_TYPE) \
LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
teardown-e2e:
rm -rf $(TEST_DIR) || true
@@ -382,15 +388,15 @@ teardown-e2e:
test-gpt4all: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
test-tts: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
@@ -439,10 +445,10 @@ protogen-go-clean:
$(RM) bin/*
.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
.PHONY: autogptq-protogen
autogptq-protogen:
@@ -508,6 +514,14 @@ petals-protogen:
petals-protogen-clean:
$(MAKE) -C backend/python/petals protogen-clean
.PHONY: rerankers-protogen
rerankers-protogen:
$(MAKE) -C backend/python/rerankers protogen
.PHONY: rerankers-protogen-clean
rerankers-protogen-clean:
$(MAKE) -C backend/python/rerankers protogen-clean
.PHONY: sentencetransformers-protogen
sentencetransformers-protogen:
$(MAKE) -C backend/python/sentencetransformers protogen
@@ -566,6 +580,7 @@ prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/vllm
$(MAKE) -C backend/python/mamba
$(MAKE) -C backend/python/sentencetransformers
$(MAKE) -C backend/python/rerankers
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/transformers-musicgen
$(MAKE) -C backend/python/parler-tts
@@ -601,16 +616,16 @@ backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/
backend-assets/grpc: protogen-go replace
mkdir -p backend-assets/grpc
backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
backend-assets/grpc/langchain-huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
backend-assets/grpc/huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
backend/cpp/llama/llama.cpp:
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
@@ -622,7 +637,7 @@ ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
backend/cpp/llama/grpc-server:
build-llama-cpp-grpc-server:
# Conditionally build grpc for the llama backend to use if needed
ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
$(MAKE) -C backend/cpp/grpc build
@@ -631,33 +646,47 @@ ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
LLAMA_VERSION=$(CPPLLAMA_VERSION) \
$(MAKE) -C backend/cpp/llama grpc-server
$(MAKE) -C backend/cpp/${VARIANT} grpc-server
else
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
endif
backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
backend-assets/grpc/llama-cpp: backend-assets/grpc
$(info ${GREEN}I llama-cpp build info:standard${RESET})
cp -rf backend/cpp/llama backend/cpp/llama-default
$(MAKE) -C backend/cpp/llama-default purge
$(MAKE) VARIANT="llama-default" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-default/grpc-server backend-assets/grpc/llama-cpp
# TODO: every binary should have its own folder instead, so they can have different metal implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
cp backend/cpp/llama-default/llama.cpp/build/bin/default.metallib backend-assets/grpc/
endif
backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libbinding.a backend-assets/grpc
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
backend-assets/grpc/llama-cpp-noavx: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-noavx
$(MAKE) -C backend/cpp/llama-noavx purge
$(info ${GREEN}I llama-cpp build info:noavx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF" $(MAKE) VARIANT="llama-noavx" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-noavx/grpc-server backend-assets/grpc/llama-cpp-noavx
backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-fallback
$(MAKE) -C backend/cpp/llama-fallback purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
# EXPERIMENTAL:
ifeq ($(BUILD_TYPE),metal)
cp $(CURDIR)/sources/go-llama-ggml/llama.cpp/ggml-metal.metal backend-assets/grpc/
endif
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
backend-assets/grpc/rwkv: sources/go-rwkv sources/go-rwkv/librwkv.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
@@ -704,7 +733,7 @@ docker-aio-all:
docker-image-intel:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -712,7 +741,7 @@ docker-image-intel:
docker-image-intel-xpu:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -720,4 +749,4 @@ docker-image-intel-xpu:
.PHONY: swagger
swagger:
swag init -g core/http/api.go --output swagger
swag init -g core/http/app.go --output swagger
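With the per-variant copies of backend/cpp/llama introduced above, the individual llama.cpp gRPC servers can also be built on their own, for example (sketch, assuming the usual build dependencies are installed):

# Build a single llama.cpp backend variant, as the split targets above now allow (illustrative).
make backend-assets/grpc/llama-cpp            # default variant
make backend-assets/grpc/llama-cpp-noavx      # AVX2/AVX512 disabled
make backend-assets/grpc/llama-cpp-fallback   # F16C/FMA/AVX2/AVX512 all disabled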


@@ -44,20 +44,24 @@
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic...) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic...) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
## 🔥🔥 Hot topics / Roadmap
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
- Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- Reranker API: https://github.com/mudler/LocalAI/pull/2121
- Gallery WebUI: https://github.com/mudler/LocalAI/pull/2104
- llama3: https://github.com/mudler/LocalAI/discussions/2076
- Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
- Landing page: https://github.com/mudler/LocalAI/pull/1922
- Openvino support: https://github.com/mudler/LocalAI/pull/1892
- Vector store: https://github.com/mudler/LocalAI/pull/1795
- All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726 / Tools API support: https://github.com/mudler/LocalAI/pull/1715
Hot topics (looking for contributors):
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
@@ -88,7 +92,8 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
- 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
- 🆕 [Vision API](https://localai.io/features/gpt-vision/)
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 🆕 [Reranker API](https://localai.io/features/reranker/)
## 💻 Usage
@@ -109,6 +114,7 @@ Model galleries
Other:
- Helm chart https://github.com/go-skynet/helm-charts
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Terminal utility https://github.com/djcopley/ShellOracle
- Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation
- Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
@@ -127,7 +133,7 @@ Other:
## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)
- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/ai/answers/tiZMDoZzZV6TLxgDXNBnFE/deploying-helm-charts-on-aws-eks)
- [Low-code LLM apps with LocalAI, Flowise and Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
- [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
- [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
- [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)

27
aio/cpu/rerank.yaml Normal file
View File

@@ -0,0 +1,27 @@
name: jina-reranker-v1-base-en
backend: rerankers
parameters:
model: cross-encoder
usage: |
You can test this model with curl like this:
curl http://localhost:8080/v1/rerank \
-H "Content-Type: application/json" \
-d '{
"model": "jina-reranker-v1-base-en",
"query": "Organic skincare products for sensitive skin",
"documents": [
"Eco-friendly kitchenware for modern homes",
"Biodegradable cleaning supplies for eco-conscious consumers",
"Organic cotton baby clothes for sensitive skin",
"Natural organic skincare range for sensitive skin",
"Tech gadgets for smart homes: 2024 edition",
"Sustainable gardening tools and compost solutions",
"Sensitive skin-friendly facial cleansers and toners",
"Organic food wraps and storage solutions",
"All-natural pet food for dogs with allergies",
"Yoga mats made from recycled materials"
],
"top_n": 3
}'

View File

@@ -1,20 +1,27 @@
name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q2_K.gguf
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
@@ -29,8 +36,7 @@ template:
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>

View File

@@ -129,7 +129,7 @@ detect_gpu
detect_gpu_size
PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
check_vars

27
aio/gpu-8g/rerank.yaml Normal file
View File

@@ -0,0 +1,27 @@
name: jina-reranker-v1-base-en
backend: rerankers
parameters:
model: cross-encoder
usage: |
You can test this model with curl like this:
curl http://localhost:8080/v1/rerank \
-H "Content-Type: application/json" \
-d '{
"model": "jina-reranker-v1-base-en",
"query": "Organic skincare products for sensitive skin",
"documents": [
"Eco-friendly kitchenware for modern homes",
"Biodegradable cleaning supplies for eco-conscious consumers",
"Organic cotton baby clothes for sensitive skin",
"Natural organic skincare range for sensitive skin",
"Tech gadgets for smart homes: 2024 edition",
"Sustainable gardening tools and compost solutions",
"Sensitive skin-friendly facial cleansers and toners",
"Organic food wraps and storage solutions",
"All-natural pet food for dogs with allergies",
"Yoga mats made from recycled materials"
],
"top_n": 3
}'

View File

@@ -1,20 +1,27 @@
name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
@@ -29,8 +36,7 @@ template:
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>

27
aio/intel/rerank.yaml Normal file
View File

@@ -0,0 +1,27 @@
name: jina-reranker-v1-base-en
backend: rerankers
parameters:
model: cross-encoder
usage: |
You can test this model with curl like this:
curl http://localhost:8080/v1/rerank \
-H "Content-Type: application/json" \
-d '{
"model": "jina-reranker-v1-base-en",
"query": "Organic skincare products for sensitive skin",
"documents": [
"Eco-friendly kitchenware for modern homes",
"Biodegradable cleaning supplies for eco-conscious consumers",
"Organic cotton baby clothes for sensitive skin",
"Natural organic skincare range for sensitive skin",
"Tech gadgets for smart homes: 2024 edition",
"Sustainable gardening tools and compost solutions",
"Sensitive skin-friendly facial cleansers and toners",
"Organic food wraps and storage solutions",
"All-natural pet food for dogs with allergies",
"Yoga mats made from recycled materials"
],
"top_n": 3
}'

View File

@@ -2,20 +2,27 @@ name: gpt-4
mmap: false
f16: false
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
@@ -30,8 +37,7 @@ template:
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>

View File

@@ -23,6 +23,30 @@ service Backend {
rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
rpc Rerank(RerankRequest) returns (RerankResult) {}
}
message RerankRequest {
string query = 1;
repeated string documents = 2;
int32 top_n = 3;
}
message RerankResult {
Usage usage = 1;
repeated DocumentResult results = 2;
}
message Usage {
int32 total_tokens = 1;
int32 prompt_tokens = 2;
}
message DocumentResult {
int32 index = 1;
string text = 2;
float relevance_score = 3;
}
message StoresKey {
@@ -177,6 +201,7 @@ message ModelOptions {
bool EnforceEager = 52;
int32 SwapSpace = 53;
int32 MaxModelLen = 54;
int32 TensorParallelSize = 55;
string MMProj = 41;
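For orientation, the new `Rerank` RPC and messages above can be exercised with the Python stubs generated from this proto (the rerankers backend Makefile runs `grpc_tools.protoc` for exactly this). Below is a minimal client sketch, not part of this changeset, assuming a rerankers backend is already listening on `localhost:50051` and the default `cross-encoder` model is available:

```python
import grpc

import backend_pb2
import backend_pb2_grpc

# Connect to a running rerankers backend (started e.g. via backend/python/rerankers/run.sh).
with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)

    # Load the default cross-encoder model, mirroring the AIO rerank.yaml profiles.
    load_reply = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
    assert load_reply.success, load_reply.message

    # Build a RerankRequest as defined above and print the ranked documents.
    request = backend_pb2.RerankRequest(
        query="Organic skincare products for sensitive skin",
        documents=[
            "Eco-friendly kitchenware for modern homes",
            "Natural organic skincare range for sensitive skin",
        ],
        top_n=2,
    )
    reply = stub.Rerank(request)
    for result in reply.results:
        print(result.index, result.relevance_score, result.text)
```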

View File

@@ -43,31 +43,23 @@ llama.cpp:
llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/examples/grpc-server
cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
cp -rfv $(abspath ./)/utils.hpp llama.cpp/examples/grpc-server/
echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
bash prepare.sh
rebuild:
cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
bash prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
clean:
rm -rf llama.cpp
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/examples/grpc-server
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/examples/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release"

View File

@@ -0,0 +1,20 @@
#!/bin/bash
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv json.hpp llama.cpp/examples/grpc-server/
cp -rfv utils.hpp llama.cpp/examples/grpc-server/
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
echo "grpc-server already added"
else
echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
fi
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp

View File

@@ -4,6 +4,7 @@ package main
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"os"
"github.com/go-skynet/LocalAI/pkg/grpc/base"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
@@ -18,9 +19,14 @@ type LLM struct {
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
llm.langchain, _ = langchain.NewHuggingFace(opts.Model)
var err error
hfToken := os.Getenv("HUGGINGFACEHUB_API_TOKEN")
if hfToken == "" {
return fmt.Errorf("no huggingface token provided")
}
llm.langchain, err = langchain.NewHuggingFace(opts.Model, hfToken)
llm.model = opts.Model
return nil
return err
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {

View File

@@ -11,8 +11,8 @@ import (
"github.com/go-skynet/LocalAI/core/schema"
)
func runCommand(command []string) (string, error) {
cmd := exec.Command(command[0], command[1:]...)
func ffmpegCommand(args []string) (string, error) {
cmd := exec.Command("ffmpeg", args...) // Constrain this to ffmpeg to permit security scanner to see that the command is safe.
cmd.Env = os.Environ()
out, err := cmd.CombinedOutput()
return string(out), err
@@ -21,8 +21,8 @@ func runCommand(command []string) (string, error) {
// AudioToWav converts audio to wav for transcribe.
// TODO: use https://github.com/mccoyst/ogg?
func audioToWav(src, dst string) error {
command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
out, err := runCommand(command)
commandArgs := []string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
out, err := ffmpegCommand(commandArgs)
if err != nil {
return fmt.Errorf("error: %w out: %s", err, out)
}

View File

@@ -41,7 +41,7 @@ dependencies:
- filelock==3.12.4
- frozenlist==1.4.0
- fsspec==2023.6.0
- grpcio==1.59.0
- grpcio==1.63.0
- huggingface-hub==0.16.4
- idna==3.4
- jinja2==3.1.2

View File

@@ -26,7 +26,7 @@ if [ -d "/opt/intel" ]; then
# Intel GPU: If the directory exists, we assume we are using the intel image
# (no conda env)
# https://github.com/intel/intel-extension-for-pytorch/issues/538
pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino]
pip install torch==2.1.0.post0 torchvision==0.16.0.post0 torchaudio==2.1.0.post0 intel-extension-for-pytorch==2.1.20+xpu oneccl_bind_pt==2.1.200+xpu intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
fi
# If we didn't skip conda, activate the environment

View File

@@ -47,7 +47,7 @@ dependencies:
- frozenlist==1.4.0
- fsspec==2023.6.0
- funcy==2.0
- grpcio==1.59.0
- grpcio==1.63.0
- huggingface-hub
- idna==3.4
- jinja2==3.1.2
@@ -120,4 +120,6 @@ dependencies:
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
- rerankers[transformers]
- pydantic
prefix: /opt/conda/envs/transformers

View File

@@ -48,7 +48,7 @@ dependencies:
- frozenlist==1.4.0
- fsspec==2023.6.0
- funcy==2.0
- grpcio==1.59.0
- grpcio==1.63.0
- huggingface-hub
- idna==3.4
- jinja2==3.1.2
@@ -108,4 +108,6 @@ dependencies:
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
- rerankers[transformers]
- pydantic
prefix: /opt/conda/envs/transformers

View File

@@ -47,7 +47,7 @@ dependencies:
- frozenlist==1.4.0
- fsspec==2023.6.0
- funcy==2.0
- grpcio==1.59.0
- grpcio==1.63.0
- huggingface-hub
- humanfriendly==10.0
- idna==3.4
@@ -60,9 +60,10 @@ dependencies:
- networkx
- numpy==1.26.0
- onnx==1.15.0
- openvino==2024.0.0
- openvino-telemetry==2023.2.1
- optimum[openvino]==1.17.1
- openvino==2024.1.0
- openvino-telemetry==2024.1.0
- optimum[openvino]==1.19.1
- optimum-intel==1.16.1
- packaging==23.2
- pandas
- peft==0.5.0
@@ -111,5 +112,7 @@ dependencies:
- vllm>=0.4.0
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
- xformers==0.0.23.post1
- rerankers[transformers]
- pydantic
prefix: /opt/conda/envs/transformers

View File

@@ -34,7 +34,7 @@ dependencies:
- diffusers==0.24.0
- filelock==3.12.4
- fsspec==2023.9.2
- grpcio==1.59.0
- grpcio==1.63.0
- huggingface-hub>=0.19.4
- idna==3.4
- importlib-metadata==6.8.0
@@ -61,4 +61,5 @@ dependencies:
- urllib3==2.0.6
- zipp==3.17.0
- torch
- opencv-python
prefix: /opt/conda/envs/diffusers

View File

@@ -32,7 +32,7 @@ dependencies:
- diffusers==0.24.0
- filelock==3.12.4
- fsspec==2023.9.2
- grpcio==1.59.0
- grpcio==1.63.0
- huggingface-hub>=0.19.4
- idna==3.4
- importlib-metadata==6.8.0
@@ -71,4 +71,5 @@ dependencies:
- typing-extensions==4.8.0
- urllib3==2.0.6
- zipp==3.17.0
- opencv-python
prefix: /opt/conda/envs/diffusers

View File

@@ -31,8 +31,8 @@ if [ -d "/opt/intel" ]; then
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
pip install google-api-python-client \
grpcio \
grpcio-tools \
grpcio==1.63.0 \
grpcio-tools==1.63.0 \
diffusers==0.24.0 \
transformers>=4.25.1 \
accelerate \

View File

@@ -27,7 +27,7 @@ dependencies:
- pip:
- filelock==3.12.4
- fsspec==2023.9.2
- grpcio==1.59.0
- grpcio==1.63.0
- jinja2==3.1.2
- markupsafe==2.1.3
- mpmath==1.3.0

View File

@@ -27,7 +27,7 @@ dependencies:
- pip:
- filelock==3.12.4
- fsspec==2023.9.2
- grpcio==1.59.0
- grpcio==1.63.0
- markupsafe==2.1.3
- mpmath==1.3.0
- networkx==3.1

View File

@@ -26,7 +26,7 @@ dependencies:
- zlib=1.2.13=h5eee18b_0
- pip:
- accelerate>=0.11.0
- grpcio==1.59.0
- grpcio==1.63.0
- numpy==1.26.0
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105

View File

@@ -27,7 +27,7 @@ dependencies:
- pip:
- accelerate>=0.11.0
- numpy==1.26.0
- grpcio==1.59.0
- grpcio==1.63.0
- torch==2.1.0
- transformers>=4.34.0
- descript-audio-codec

View File

@@ -0,0 +1,27 @@
.PHONY: rerankers
rerankers: protogen
$(MAKE) -C ../common-env/transformers
.PHONY: run
run: protogen
@echo "Running rerankers..."
bash run.sh
@echo "rerankers run."
# It does not work well from the command line; it only works with an IDE like VSCode.
.PHONY: test
test: protogen
@echo "Testing rerankers..."
bash test.sh
@echo "rerankers tested."
.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py
.PHONY: protogen-clean
protogen-clean:
$(RM) backend_pb2_grpc.py backend_pb2.py
backend_pb2_grpc.py backend_pb2.py:
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto

View File

@@ -0,0 +1,5 @@
# Creating a separate environment for the reranker project
```
make rerankers
```

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Extra gRPC server for Rerankers models.
"""
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import backend_pb2
import backend_pb2_grpc
import grpc
from rerankers import Reranker
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS is specified in the environment, use it; otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
A gRPC servicer for the backend service.
This class implements the gRPC methods for the backend service, including Health, LoadModel, and Rerank.
"""
def Health(self, request, context):
"""
A gRPC method that returns the health status of the backend service.
Args:
request: A HealthRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Reply object that contains the health status of the backend service.
"""
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
"""
A gRPC method that loads a model into memory.
Args:
request: A LoadModelRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Result object that contains the result of the LoadModel operation.
"""
model_name = request.Model
try:
kwargs = {}
if request.Type != "":
kwargs['model_type'] = request.Type
if request.PipelineType != "": # Reuse the PipelineType field for language
kwargs['lang'] = request.PipelineType
self.model_name = model_name
self.model = Reranker(model_name, **kwargs)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service
# Replace this with your desired response
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Rerank(self, request, context):
documents = []
for idx, doc in enumerate(request.documents):
documents.append(doc)
ranked_results=self.model.rank(query=request.query, docs=documents, doc_ids=list(range(len(request.documents))))
# Prepare results to return
results = [
backend_pb2.DocumentResult(
index=res.doc_id,
text=res.text,
relevance_score=res.score
) for res in ranked_results.results
]
# Calculate the usage and total tokens
# TODO: Implement the usage calculation with reranker
total_tokens = sum(len(doc.split()) for doc in request.documents) + len(request.query.split())
prompt_tokens = len(request.query.split())
usage = backend_pb2.Usage(total_tokens=total_tokens, prompt_tokens=prompt_tokens)
return backend_pb2.RerankResult(usage=usage, results=results)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)
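For reference, the `rerankers` library call wrapped by `Rerank()` above can also be exercised on its own; a minimal sketch, assuming the `rerankers[transformers]` extra from the conda environments in this changeset is installed and the same default `cross-encoder` model is used:

```python
from rerankers import Reranker

# Same default model that the AIO rerank.yaml profiles configure.
ranker = Reranker("cross-encoder")

docs = ["I hate you", "I really like you"]
# doc_ids are passed so each result carries the original index, as the servicer does above.
ranked = ranker.rank(query="I love you", docs=docs, doc_ids=list(range(len(docs))))

# Each result exposes the fields copied into DocumentResult: doc_id, text, score.
for res in ranked.results:
    print(res.doc_id, res.score, res.text)
```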

14
backend/python/rerankers/run.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
##
## A bash script wrapper that runs the reranker server with conda
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python $DIR/reranker.py $@

View File

@@ -0,0 +1,11 @@
#!/bin/bash
##
## A bash script wrapper that runs the reranker tests with conda
# Activate conda environment
source activate transformers
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python -m unittest $DIR/test_reranker.py

View File

@@ -0,0 +1,90 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "reranker.py", "--addr", "localhost:50051"])
time.sleep(10)
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.kill()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_rerank(self):
"""
This method tests if the documents are reranked successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
request = backend_pb2.RerankRequest(
query="I love you",
documents=["I hate you", "I really like you"],
top_n=2
)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
self.assertTrue(response.success)
rerank_response = stub.Rerank(request)
print(rerank_response.results[0])
self.assertIsNotNone(rerank_response.results)
self.assertEqual(len(rerank_response.results), 2)
self.assertEqual(rerank_response.results[0].text, "I really like you")
self.assertEqual(rerank_response.results[1].text, "I hate you")
except Exception as err:
print(err)
self.fail("Reranker service failed")
finally:
self.tearDown()

View File

@@ -89,8 +89,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
quantization = None
if self.CUDA:
if request.Device:
device_map=request.Device
if request.MainGPU:
device_map=request.MainGPU
else:
device_map="cuda:0"
if request.Quantization == "bnb_4bit":
@@ -143,12 +143,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
from optimum.intel.openvino import OVModelForCausalLM
from openvino.runtime import Core
if "GPU" in Core().available_devices:
device_map="GPU"
if request.MainGPU:
device_map=request.MainGPU
else:
device_map="CPU"
device_map="AUTO"
devices = Core().available_devices
if "GPU" in " ".join(devices):
device_map="AUTO:GPU"
# With a fine-tuned model, inference may lose accuracy and drop in performance on GPU if Winograd convolutions are selected.
# https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html
if "CPU" in device_map or "NPU" in device_map:
if "-CPU" not in device_map and "-NPU" not in device_map:
ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
else:
ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
self.model = OVModelForCausalLM.from_pretrained(model_name,
compile=True,
compile=True,
trust_remote_code=request.TrustRemoteCode,
ov_config=ovconfig,
device=device_map)
self.OV = True
elif request.Type == "OVModelForFeatureExtraction":
from optimum.intel.openvino import OVModelForFeatureExtraction
from openvino.runtime import Core
if request.MainGPU:
device_map=request.MainGPU
else:
device_map="AUTO"
devices = Core().available_devices
if "GPU" in " ".join(devices):
device_map="AUTO:GPU"
# With a fine-tuned model, inference may lose accuracy and drop in performance on GPU if Winograd convolutions are selected.
# https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html
if "CPU" in device_map or "NPU" in device_map:
if "-CPU" not in device_map and "-NPU" not in device_map:
ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
else:
ovconfig={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT","GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
self.model = OVModelForFeatureExtraction.from_pretrained(model_name,
compile=True,
trust_remote_code=request.TrustRemoteCode,
ov_config=ovconfig,
export=True,
device=device_map)
self.OV = True
else:
@@ -158,6 +195,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
quantization_config=quantization,
device_map=device_map,
torch_dtype=compute)
if request.ContextSize > 0:
self.max_tokens = request.ContextSize
else:
self.max_tokens = self.model.config.max_position_embeddings
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
self.XPU = False
@@ -204,20 +246,35 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
print("Embeddings:", sentence_embeddings, file=sys.stderr)
# print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
# print("Embeddings:", sentence_embeddings, file=sys.stderr)
return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
async def _predict(self, request, context, streaming=False):
set_seed(request.Seed)
if request.TopP == 0:
request.TopP = 0.9
if request.TopK == 0:
request.TopK = 40
prompt = request.Prompt
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
eos_token_id = self.tokenizer.eos_token_id
if request.StopPrompts:
eos_token_id = []
for word in request.StopPrompts:
eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
inputs = self.tokenizer(prompt, return_tensors="pt")
max_tokens = 200
if request.Tokens > 0:
max_tokens = request.Tokens
else:
max_tokens = self.max_tokens - inputs["input_ids"].size()[inputs["input_ids"].dim()-1]
inputs = self.tokenizer(request.Prompt, return_tensors="pt")
if self.CUDA:
inputs = inputs.to("cuda")
if XPU and self.OV == False:
@@ -235,7 +292,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
top_k=request.TopK,
do_sample=True,
attention_mask=inputs["attention_mask"],
eos_token_id=self.tokenizer.eos_token_id,
eos_token_id=eos_token_id,
pad_token_id=self.tokenizer.eos_token_id,
streamer=streamer)
thread=Thread(target=self.model.generate, kwargs=config)
@@ -264,7 +321,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
top_k=request.TopK,
do_sample=True,
attention_mask=inputs["attention_mask"],
eos_token_id=self.tokenizer.eos_token_id,
eos_token_id=eos_token_id,
pad_token_id=self.tokenizer.eos_token_id)
generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
@@ -334,4 +391,4 @@ if __name__ == "__main__":
)
args = parser.parse_args()
asyncio.run(serve(args.addr))
asyncio.run(serve(args.addr))

View File

@@ -42,7 +42,7 @@ dependencies:
- future==0.18.3
- gradio==3.47.1
- gradio-client==0.6.0
- grpcio==1.59.0
- grpcio==1.63.0
- h11==0.14.0
- httpcore==0.18.0
- httpx==0.25.0

View File

@@ -95,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
engine_args.trust_remote_code = request.TrustRemoteCode
if request.EnforceEager:
engine_args.enforce_eager = request.EnforceEager
if request.TensorParallelSize:
engine_args.tensor_parallel_size = request.TensorParallelSize
if request.SwapSpace != 0:
engine_args.swap_space = request.SwapSpace
if request.MaxModelLen != 0:

View File

@@ -1,13 +1,11 @@
package core
import (
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/pkg/model"
)
// TODO: Can I come up with a better name or location for this?
// The purpose of this structure is to hold pointers to all initialized services, to make plumbing easy
// Perhaps a proper DI system is worth it in the future, but for now keep things simple.
type Application struct {
@@ -21,18 +19,18 @@ type Application struct {
ModelLoader *model.ModelLoader
// Backend Services
EmbeddingsBackendService *backend.EmbeddingsBackendService
ImageGenerationBackendService *backend.ImageGenerationBackendService
LLMBackendService *backend.LLMBackendService
TranscriptionBackendService *backend.TranscriptionBackendService
TextToSpeechBackendService *backend.TextToSpeechBackendService
// EmbeddingsBackendService *backend.EmbeddingsBackendService
// ImageGenerationBackendService *backend.ImageGenerationBackendService
// LLMBackendService *backend.LLMBackendService
// TranscriptionBackendService *backend.TranscriptionBackendService
// TextToSpeechBackendService *backend.TextToSpeechBackendService
// LocalAI System Services
BackendMonitorService *services.BackendMonitorService
GalleryService *services.GalleryService
ListModelsService *services.ListModelsService
LocalAIMetricsService *services.LocalAIMetricsService
OpenAIService *services.OpenAIService
// OpenAIService *services.OpenAIService
}
// TODO [NEXT PR?]: Break up ApplicationConfig.

View File

@@ -2,100 +2,14 @@ package backend
import (
"fmt"
"time"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/google/uuid"
"github.com/go-skynet/LocalAI/pkg/concurrency"
"github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/model"
model "github.com/go-skynet/LocalAI/pkg/model"
)
type EmbeddingsBackendService struct {
ml *model.ModelLoader
bcl *config.BackendConfigLoader
appConfig *config.ApplicationConfig
}
func NewEmbeddingsBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *EmbeddingsBackendService {
return &EmbeddingsBackendService{
ml: ml,
bcl: bcl,
appConfig: appConfig,
}
}
func (ebs *EmbeddingsBackendService) Embeddings(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.OpenAIResponse] {
resultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
go func(request *schema.OpenAIRequest) {
if request.Model == "" {
request.Model = model.StableDiffusionBackend
}
bc, request, err := ebs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, ebs.appConfig)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
items := []schema.Item{}
for i, s := range bc.InputToken {
// get the model function to call for the result
embedFn, err := modelEmbedding("", s, ebs.ml, bc, ebs.appConfig)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
embeddings, err := embedFn()
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
}
for i, s := range bc.InputStrings {
// get the model function to call for the result
embedFn, err := modelEmbedding(s, []int{}, ebs.ml, bc, ebs.appConfig)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
embeddings, err := embedFn()
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
}
id := uuid.New().String()
created := int(time.Now().Unix())
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
Data: items,
Object: "list",
}
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: resp}
close(resultChannel)
}(request)
return resultChannel
}
func modelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
modelFile := backendConfig.Model
grpcOpts := gRPCModelOpts(backendConfig)

View File

@@ -1,252 +1,18 @@
package backend
import (
"bufio"
"encoding/base64"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/google/uuid"
"github.com/rs/zerolog/log"
"github.com/go-skynet/LocalAI/pkg/concurrency"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/LocalAI/pkg/model"
model "github.com/go-skynet/LocalAI/pkg/model"
)
type ImageGenerationBackendService struct {
ml *model.ModelLoader
bcl *config.BackendConfigLoader
appConfig *config.ApplicationConfig
BaseUrlForGeneratedImages string
}
func NewImageGenerationBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *ImageGenerationBackendService {
return &ImageGenerationBackendService{
ml: ml,
bcl: bcl,
appConfig: appConfig,
}
}
func (igbs *ImageGenerationBackendService) GenerateImage(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.OpenAIResponse] {
resultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
go func(request *schema.OpenAIRequest) {
bc, request, err := igbs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, igbs.appConfig)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
src := ""
if request.File != "" {
var fileData []byte
// check if input.File is an URL, if so download it and save it
// to a temporary file
if strings.HasPrefix(request.File, "http://") || strings.HasPrefix(request.File, "https://") {
out, err := downloadFile(request.File)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("failed downloading file:%w", err)}
close(resultChannel)
return
}
defer os.RemoveAll(out)
fileData, err = os.ReadFile(out)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("failed reading file:%w", err)}
close(resultChannel)
return
}
} else {
// base 64 decode the file and write it somewhere
// that we will cleanup
fileData, err = base64.StdEncoding.DecodeString(request.File)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
}
// Create a temporary file
outputFile, err := os.CreateTemp(igbs.appConfig.ImageDir, "b64")
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
// write the base64 result
writer := bufio.NewWriter(outputFile)
_, err = writer.Write(fileData)
if err != nil {
outputFile.Close()
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
outputFile.Close()
src = outputFile.Name()
defer os.RemoveAll(src)
}
log.Debug().Msgf("Parameter Config: %+v", bc)
switch bc.Backend {
case "stablediffusion":
bc.Backend = model.StableDiffusionBackend
case "tinydream":
bc.Backend = model.TinyDreamBackend
case "":
bc.Backend = model.StableDiffusionBackend
if bc.Model == "" {
bc.Model = "stablediffusion_assets" // TODO: check?
}
}
sizeParts := strings.Split(request.Size, "x")
if len(sizeParts) != 2 {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
close(resultChannel)
return
}
width, err := strconv.Atoi(sizeParts[0])
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
close(resultChannel)
return
}
height, err := strconv.Atoi(sizeParts[1])
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
close(resultChannel)
return
}
b64JSON := false
if request.ResponseFormat.Type == "b64_json" {
b64JSON = true
}
// src and clip_skip
var result []schema.Item
for _, i := range bc.PromptStrings {
n := request.N
if request.N == 0 {
n = 1
}
for j := 0; j < n; j++ {
prompts := strings.Split(i, "|")
positive_prompt := prompts[0]
negative_prompt := ""
if len(prompts) > 1 {
negative_prompt = prompts[1]
}
mode := 0
step := bc.Step
if step == 0 {
step = 15
}
if request.Mode != 0 {
mode = request.Mode
}
if request.Step != 0 {
step = request.Step
}
tempDir := ""
if !b64JSON {
tempDir = igbs.appConfig.ImageDir
}
// Create a temporary file
outputFile, err := os.CreateTemp(tempDir, "b64")
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
outputFile.Close()
output := outputFile.Name() + ".png"
// Rename the temporary file
err = os.Rename(outputFile.Name(), output)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
if request.Seed == nil {
zVal := 0 // Idiomatic way to do this? Actually needed?
request.Seed = &zVal
}
fn, err := imageGeneration(height, width, mode, step, *request.Seed, positive_prompt, negative_prompt, src, output, igbs.ml, bc, igbs.appConfig)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
if err := fn(); err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
item := &schema.Item{}
if b64JSON {
defer os.RemoveAll(output)
data, err := os.ReadFile(output)
if err != nil {
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
close(resultChannel)
return
}
item.B64JSON = base64.StdEncoding.EncodeToString(data)
} else {
base := filepath.Base(output)
item.URL = igbs.BaseUrlForGeneratedImages + base
}
result = append(result, *item)
}
}
id := uuid.New().String()
created := int(time.Now().Unix())
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
Data: result,
}
resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: resp}
close(resultChannel)
}(request)
return resultChannel
}
func imageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
threads := backendConfig.Threads
if *threads == 0 && appConfig.Threads != 0 {
threads = &appConfig.Threads
}
gRPCOpts := gRPCModelOpts(backendConfig)
opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(backendConfig.Backend),
model.WithAssetDir(appConfig.AssetsDestination),
@@ -284,24 +50,3 @@ func imageGeneration(height, width, mode, step, seed int, positive_prompt, negat
return fn, nil
}
// TODO: Replace this function with pkg/downloader - no reason to have a (crappier) bespoke download file fn here, but get things working before that change.
func downloadFile(url string) (string, error) {
// Get the data
resp, err := http.Get(url)
if err != nil {
return "", err
}
defer resp.Body.Close()
// Create the file
out, err := os.CreateTemp("", "image")
if err != nil {
return "", err
}
defer out.Close()
// Write the body to file
_, err = io.Copy(out, resp.Body)
return out.Name(), err
}

View File

@@ -11,22 +11,17 @@ import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/rs/zerolog/log"
"github.com/go-skynet/LocalAI/pkg/concurrency"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/LocalAI/pkg/model"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
)
type LLMRequest struct {
Id int // TODO Remove if not used.
Text string
Images []string
RawMessages []schema.Message
// TODO: Other Modalities?
type LLMResponse struct {
Response string // should this be []byte?
Usage TokenUsage
}
type TokenUsage struct {
@@ -34,94 +29,57 @@ type TokenUsage struct {
Completion int
}
type LLMResponse struct {
Request *LLMRequest
Response string // should this be []byte?
Usage TokenUsage
}
// TODO: Does this belong here or in core/services/openai.go?
type LLMResponseBundle struct {
Request *schema.OpenAIRequest
Response []schema.Choice
Usage TokenUsage
}
type LLMBackendService struct {
bcl *config.BackendConfigLoader
ml *model.ModelLoader
appConfig *config.ApplicationConfig
ftMutex sync.Mutex
cutstrings map[string]*regexp.Regexp
}
func NewLLMBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *LLMBackendService {
return &LLMBackendService{
bcl: bcl,
ml: ml,
appConfig: appConfig,
ftMutex: sync.Mutex{},
cutstrings: make(map[string]*regexp.Regexp),
func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
modelFile := c.Model
threads := c.Threads
if *threads == 0 && o.Threads != 0 {
threads = &o.Threads
}
}
// TODO: Should ctx param be removed and replaced with hardcoded req.Context?
func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest, bc *config.BackendConfig, enableTokenChannel bool) (
resultChannel <-chan concurrency.ErrorOr[*LLMResponse], tokenChannel <-chan concurrency.ErrorOr[*LLMResponse], err error) {
threads := bc.Threads
if (threads == nil || *threads == 0) && llmbs.appConfig.Threads != 0 {
threads = &llmbs.appConfig.Threads
}
grpcOpts := gRPCModelOpts(bc)
grpcOpts := gRPCModelOpts(c)
var inferenceModel grpc.Backend
var err error
opts := modelOpts(bc, llmbs.appConfig, []model.Option{
opts := modelOpts(c, o, []model.Option{
model.WithLoadGRPCLoadModelOpts(grpcOpts),
model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
model.WithAssetDir(llmbs.appConfig.AssetsDestination),
model.WithModel(bc.Model),
model.WithContext(llmbs.appConfig.Context),
model.WithAssetDir(o.AssetsDestination),
model.WithModel(modelFile),
model.WithContext(o.Context),
})
if bc.Backend != "" {
opts = append(opts, model.WithBackendString(bc.Backend))
if c.Backend != "" {
opts = append(opts, model.WithBackendString(c.Backend))
}
// Check if bc.Model exists, if it doesn't try to load it from the gallery
if llmbs.appConfig.AutoloadGalleries { // experimental
if _, err := os.Stat(bc.Model); os.IsNotExist(err) {
// Check if the modelFile exists, if it doesn't try to load it from the gallery
if o.AutoloadGalleries { // experimental
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
utils.ResetDownloadTimers()
// if we failed to load the model, we try to download it
err := gallery.InstallModelFromGalleryByName(llmbs.appConfig.Galleries, bc.Model, llmbs.appConfig.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
if err != nil {
return nil, nil, err
return nil, err
}
}
}
if bc.Backend == "" {
log.Debug().Msgf("backend not known for %q, falling back to greedy loader to find it", bc.Model)
inferenceModel, err = llmbs.ml.GreedyLoader(opts...)
if c.Backend == "" {
inferenceModel, err = loader.GreedyLoader(opts...)
} else {
inferenceModel, err = llmbs.ml.BackendLoader(opts...)
inferenceModel, err = loader.BackendLoader(opts...)
}
if err != nil {
log.Error().Err(err).Msg("[llmbs.Inference] failed to load a backend")
return
return nil, err
}
grpcPredOpts := gRPCPredictOpts(bc, llmbs.appConfig.ModelPath)
grpcPredOpts.Prompt = req.Text
grpcPredOpts.Images = req.Images
if bc.TemplateConfig.UseTokenizerTemplate && req.Text == "" {
grpcPredOpts.UseTokenizerTemplate = true
protoMessages := make([]*proto.Message, len(req.RawMessages), len(req.RawMessages))
for i, message := range req.RawMessages {
var protoMessages []*proto.Message
// if we are using the tokenizer template, we need to convert the messages to proto messages
// unless the prompt has already been tokenized (non-chat endpoints + functions)
if c.TemplateConfig.UseTokenizerTemplate && s == "" {
protoMessages = make([]*proto.Message, len(messages), len(messages))
for i, message := range messages {
protoMessages[i] = &proto.Message{
Role: message.Role,
}
@@ -129,32 +87,47 @@ func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest,
case string:
protoMessages[i].Content = ct
default:
err = fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
return
return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
}
}
}
tokenUsage := TokenUsage{}
// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
fn := func() (LLMResponse, error) {
opts := gRPCPredictOpts(c, loader.ModelPath)
opts.Prompt = s
opts.Messages = protoMessages
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
opts.Images = images
promptInfo, pErr := inferenceModel.TokenizeString(ctx, grpcPredOpts)
if pErr == nil && promptInfo.Length > 0 {
tokenUsage.Prompt = int(promptInfo.Length)
}
tokenUsage := TokenUsage{}
rawResultChannel := make(chan concurrency.ErrorOr[*LLMResponse])
// TODO this next line is the biggest argument for taking named return values _back_ out!!!
var rawTokenChannel chan concurrency.ErrorOr[*LLMResponse]
// check the per-model feature flag for usage, since tokenCallback may have a cost.
// Defaults to off as for now it is still experimental
if c.FeatureFlag.Enabled("usage") {
userTokenCallback := tokenCallback
if userTokenCallback == nil {
userTokenCallback = func(token string, usage TokenUsage) bool {
return true
}
}
if enableTokenChannel {
rawTokenChannel = make(chan concurrency.ErrorOr[*LLMResponse])
promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
if pErr == nil && promptInfo.Length > 0 {
tokenUsage.Prompt = int(promptInfo.Length)
}
// TODO Needs better name
ss := ""
tokenCallback = func(token string, usage TokenUsage) bool {
tokenUsage.Completion++
return userTokenCallback(token, tokenUsage)
}
}
if tokenCallback != nil {
ss := ""
go func() {
var partialRune []byte
err := inferenceModel.PredictStream(ctx, grpcPredOpts, func(chars []byte) {
err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
partialRune = append(partialRune, chars...)
for len(partialRune) > 0 {
@@ -164,126 +137,54 @@ func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest,
break
}
tokenUsage.Completion++
rawTokenChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
Response: string(r),
Usage: tokenUsage,
}}
tokenCallback(string(r), tokenUsage)
ss += string(r)
partialRune = partialRune[size:]
}
})
close(rawTokenChannel)
return LLMResponse{
Response: ss,
Usage: tokenUsage,
}, err
} else {
// TODO: Is the chicken bit the only way to get here? is that acceptable?
reply, err := inferenceModel.Predict(ctx, opts)
if err != nil {
rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
} else {
rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
Response: ss,
Usage: tokenUsage,
}}
return LLMResponse{}, err
}
close(rawResultChannel)
}()
} else {
go func() {
reply, err := inferenceModel.Predict(ctx, grpcPredOpts)
if tokenUsage.Prompt == 0 {
tokenUsage.Prompt = int(reply.PromptTokens)
}
if tokenUsage.Completion == 0 {
tokenUsage.Completion = int(reply.Tokens)
}
if err != nil {
rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
close(rawResultChannel)
} else {
rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
Response: string(reply.Message),
Usage: tokenUsage,
}}
close(rawResultChannel)
}
}()
return LLMResponse{
Response: string(reply.Message),
Usage: tokenUsage,
}, err
}
}
resultChannel = rawResultChannel
tokenChannel = rawTokenChannel
return
return fn, nil
}
// TODO: Should predInput be a separate param still, or should this fn handle extracting it from request??
func (llmbs *LLMBackendService) GenerateText(predInput string, request *schema.OpenAIRequest, bc *config.BackendConfig,
mappingFn func(*LLMResponse) schema.Choice, enableCompletionChannels bool, enableTokenChannels bool) (
// Returns:
resultChannel <-chan concurrency.ErrorOr[*LLMResponseBundle], completionChannels []<-chan concurrency.ErrorOr[*LLMResponse], tokenChannels []<-chan concurrency.ErrorOr[*LLMResponse], err error) {
var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
var mu sync.Mutex = sync.Mutex{}
rawChannel := make(chan concurrency.ErrorOr[*LLMResponseBundle])
resultChannel = rawChannel
if request.N == 0 { // number of completions to return
request.N = 1
}
images := []string{}
for _, m := range request.Messages {
images = append(images, m.StringImages...)
}
for i := 0; i < request.N; i++ {
individualResultChannel, tokenChannel, infErr := llmbs.Inference(request.Context, &LLMRequest{
Text: predInput,
Images: images,
RawMessages: request.Messages,
}, bc, enableTokenChannels)
if infErr != nil {
err = infErr // Avoids complaints about redeclaring err but looks dumb
return
}
completionChannels = append(completionChannels, individualResultChannel)
tokenChannels = append(tokenChannels, tokenChannel)
}
go func() {
initialBundle := LLMResponseBundle{
Request: request,
Response: []schema.Choice{},
Usage: TokenUsage{},
}
wg := concurrency.SliceOfChannelsReducer(completionChannels, rawChannel, func(iv concurrency.ErrorOr[*LLMResponse], ov concurrency.ErrorOr[*LLMResponseBundle]) concurrency.ErrorOr[*LLMResponseBundle] {
if iv.Error != nil {
ov.Error = iv.Error
// TODO: Decide if we should wipe partials or not?
return ov
}
ov.Value.Usage.Prompt += iv.Value.Usage.Prompt
ov.Value.Usage.Completion += iv.Value.Usage.Completion
ov.Value.Response = append(ov.Value.Response, mappingFn(iv.Value))
return ov
}, concurrency.ErrorOr[*LLMResponseBundle]{Value: &initialBundle}, true)
wg.Wait()
}()
return
}
func (llmbs *LLMBackendService) Finetune(config config.BackendConfig, input, prediction string) string {
func Finetune(config config.BackendConfig, input, prediction string) string {
if config.Echo {
prediction = input + prediction
}
for _, c := range config.Cutstrings {
llmbs.ftMutex.Lock()
reg, ok := llmbs.cutstrings[c]
mu.Lock()
reg, ok := cutstrings[c]
if !ok {
llmbs.cutstrings[c] = regexp.MustCompile(c)
reg = llmbs.cutstrings[c]
cutstrings[c] = regexp.MustCompile(c)
reg = cutstrings[c]
}
llmbs.ftMutex.Unlock()
mu.Unlock()
prediction = reg.ReplaceAllString(prediction, "")
}

View File

@@ -7,10 +7,11 @@ import (
"github.com/go-skynet/LocalAI/core/config"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/rs/zerolog/log"
)
func modelOpts(bc *config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
if so.SingleBackend {
opts = append(opts, model.WithSingleActiveBackend())
}
@@ -19,12 +20,12 @@ func modelOpts(bc *config.BackendConfig, so *config.ApplicationConfig, opts []mo
opts = append(opts, model.EnableParallelRequests)
}
if bc.GRPC.Attempts != 0 {
opts = append(opts, model.WithGRPCAttempts(bc.GRPC.Attempts))
if c.GRPC.Attempts != 0 {
opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
}
if bc.GRPC.AttemptsSleepTime != 0 {
opts = append(opts, model.WithGRPCAttemptsDelay(bc.GRPC.AttemptsSleepTime))
if c.GRPC.AttemptsSleepTime != 0 {
opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
}
for k, v := range so.ExternalGRPCBackends {
@@ -34,7 +35,7 @@ func modelOpts(bc *config.BackendConfig, so *config.ApplicationConfig, opts []mo
return opts
}
func getSeed(c *config.BackendConfig) int32 {
func getSeed(c config.BackendConfig) int32 {
seed := int32(*c.Seed)
if seed == config.RAND_SEED {
seed = rand.Int31()
@@ -43,7 +44,7 @@ func getSeed(c *config.BackendConfig) int32 {
return seed
}
func gRPCModelOpts(c *config.BackendConfig) *pb.ModelOptions {
func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
b := 512
if c.Batch != 0 {
b = c.Batch
@@ -74,6 +75,7 @@ func gRPCModelOpts(c *config.BackendConfig) *pb.ModelOptions {
EnforceEager: c.EnforceEager,
SwapSpace: int32(c.SwapSpace),
MaxModelLen: int32(c.MaxModelLen),
TensorParallelSize: int32(c.TensorParallelSize),
MMProj: c.MMProj,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
@@ -104,47 +106,51 @@ func gRPCModelOpts(c *config.BackendConfig) *pb.ModelOptions {
}
}
func gRPCPredictOpts(bc *config.BackendConfig, modelPath string) *pb.PredictOptions {
func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOptions {
promptCachePath := ""
if bc.PromptCachePath != "" {
p := filepath.Join(modelPath, bc.PromptCachePath)
os.MkdirAll(filepath.Dir(p), 0755)
promptCachePath = p
if c.PromptCachePath != "" {
p := filepath.Join(modelPath, c.PromptCachePath)
err := os.MkdirAll(filepath.Dir(p), 0750)
if err == nil {
promptCachePath = p
} else {
log.Error().Err(err).Str("promptCachePath", promptCachePath).Msg("error creating prompt cache folder")
}
}
return &pb.PredictOptions{
Temperature: float32(*bc.Temperature),
TopP: float32(*bc.TopP),
NDraft: bc.NDraft,
TopK: int32(*bc.TopK),
Tokens: int32(*bc.Maxtokens),
Threads: int32(*bc.Threads),
PromptCacheAll: bc.PromptCacheAll,
PromptCacheRO: bc.PromptCacheRO,
Temperature: float32(*c.Temperature),
TopP: float32(*c.TopP),
NDraft: c.NDraft,
TopK: int32(*c.TopK),
Tokens: int32(*c.Maxtokens),
Threads: int32(*c.Threads),
PromptCacheAll: c.PromptCacheAll,
PromptCacheRO: c.PromptCacheRO,
PromptCachePath: promptCachePath,
F16KV: *bc.F16,
DebugMode: *bc.Debug,
Grammar: bc.Grammar,
NegativePromptScale: bc.NegativePromptScale,
RopeFreqBase: bc.RopeFreqBase,
RopeFreqScale: bc.RopeFreqScale,
NegativePrompt: bc.NegativePrompt,
Mirostat: int32(*bc.LLMConfig.Mirostat),
MirostatETA: float32(*bc.LLMConfig.MirostatETA),
MirostatTAU: float32(*bc.LLMConfig.MirostatTAU),
Debug: *bc.Debug,
StopPrompts: bc.StopWords,
Repeat: int32(bc.RepeatPenalty),
NKeep: int32(bc.Keep),
Batch: int32(bc.Batch),
IgnoreEOS: bc.IgnoreEOS,
Seed: getSeed(bc),
FrequencyPenalty: float32(bc.FrequencyPenalty),
MLock: *bc.MMlock,
MMap: *bc.MMap,
MainGPU: bc.MainGPU,
TensorSplit: bc.TensorSplit,
TailFreeSamplingZ: float32(*bc.TFZ),
TypicalP: float32(*bc.TypicalP),
F16KV: *c.F16,
DebugMode: *c.Debug,
Grammar: c.Grammar,
NegativePromptScale: c.NegativePromptScale,
RopeFreqBase: c.RopeFreqBase,
RopeFreqScale: c.RopeFreqScale,
NegativePrompt: c.NegativePrompt,
Mirostat: int32(*c.LLMConfig.Mirostat),
MirostatETA: float32(*c.LLMConfig.MirostatETA),
MirostatTAU: float32(*c.LLMConfig.MirostatTAU),
Debug: *c.Debug,
StopPrompts: c.StopWords,
Repeat: int32(c.RepeatPenalty),
NKeep: int32(c.Keep),
Batch: int32(c.Batch),
IgnoreEOS: c.IgnoreEOS,
Seed: getSeed(c),
FrequencyPenalty: float32(c.FrequencyPenalty),
MLock: *c.MMlock,
MMap: *c.MMap,
MainGPU: c.MainGPU,
TensorSplit: c.TensorSplit,
TailFreeSamplingZ: float32(*c.TFZ),
TypicalP: float32(*c.TypicalP),
}
}
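
The prompt-cache change above creates the cache directory with 0750 permissions and logs a failed MkdirAll instead of ignoring it. A small standalone sketch of that pattern, with a placeholder path:

package main

import (
    "os"
    "path/filepath"

    "github.com/rs/zerolog/log"
)

func main() {
    // Placeholder path; in options.go this is filepath.Join(modelPath, c.PromptCachePath).
    p := filepath.Join("/tmp/models", "prompt-caches/my-model.cache")
    if err := os.MkdirAll(filepath.Dir(p), 0750); err != nil {
        log.Error().Err(err).Str("promptCachePath", p).Msg("error creating prompt cache folder")
    }
}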

core/backend/rerank.go (new file, 39 lines)
View File

@@ -0,0 +1,39 @@
package backend
import (
"context"
"fmt"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
model "github.com/go-skynet/LocalAI/pkg/model"
)
func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
bb := backend
if bb == "" {
return nil, fmt.Errorf("backend is required")
}
grpcOpts := gRPCModelOpts(backendConfig)
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(bb),
model.WithModel(modelFile),
model.WithContext(appConfig.Context),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
})
rerankModel, err := loader.BackendLoader(opts...)
if err != nil {
return nil, err
}
if rerankModel == nil {
return nil, fmt.Errorf("could not load rerank model")
}
res, err := rerankModel.Rerank(context.Background(), request)
return res, err
}
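
For context, a hedged sketch of how the new Rerank entry point can be called from elsewhere in the codebase; the backend name, model file and request contents below are illustrative assumptions, not taken from this diff:

package example

import (
    "github.com/go-skynet/LocalAI/core/backend"
    "github.com/go-skynet/LocalAI/core/config"
    "github.com/go-skynet/LocalAI/pkg/grpc/proto"
    model "github.com/go-skynet/LocalAI/pkg/model"
)

func rerankExample(ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*proto.RerankResult, error) {
    bc := config.BackendConfig{}
    bc.SetDefaults()

    // Populate according to the RerankRequest proto; its fields are not shown in this
    // diff, so the request is left empty here.
    req := &proto.RerankRequest{}

    // "rerankers" and the model filename are illustrative values.
    return backend.Rerank("rerankers", "my-reranker-model", req, ml, appConfig, bc)
}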

View File

@@ -7,48 +7,11 @@ import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/concurrency"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/LocalAI/pkg/model"
model "github.com/go-skynet/LocalAI/pkg/model"
)
type TranscriptionBackendService struct {
ml *model.ModelLoader
bcl *config.BackendConfigLoader
appConfig *config.ApplicationConfig
}
func NewTranscriptionBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *TranscriptionBackendService {
return &TranscriptionBackendService{
ml: ml,
bcl: bcl,
appConfig: appConfig,
}
}
func (tbs *TranscriptionBackendService) Transcribe(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.TranscriptionResult] {
responseChannel := make(chan concurrency.ErrorOr[*schema.TranscriptionResult])
go func(request *schema.OpenAIRequest) {
bc, request, err := tbs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, tbs.appConfig)
if err != nil {
responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Error: fmt.Errorf("failed reading parameters from request:%w", err)}
close(responseChannel)
return
}
tr, err := modelTranscription(request.File, request.Language, tbs.ml, bc, tbs.appConfig)
if err != nil {
responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Error: err}
close(responseChannel)
return
}
responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Value: tr}
close(responseChannel)
}(request)
return responseChannel
}
func modelTranscription(audio, language string, ml *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
func ModelTranscription(audio, language string, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(model.WhisperBackend),

View File

@@ -7,60 +7,29 @@ import (
"path/filepath"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/concurrency"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/LocalAI/pkg/model"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
)
type TextToSpeechBackendService struct {
ml *model.ModelLoader
bcl *config.BackendConfigLoader
appConfig *config.ApplicationConfig
}
func generateUniqueFileName(dir, baseName, ext string) string {
counter := 1
fileName := baseName + ext
func NewTextToSpeechBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *TextToSpeechBackendService {
return &TextToSpeechBackendService{
ml: ml,
bcl: bcl,
appConfig: appConfig,
for {
filePath := filepath.Join(dir, fileName)
_, err := os.Stat(filePath)
if os.IsNotExist(err) {
return fileName
}
counter++
fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
}
}
func (ttsbs *TextToSpeechBackendService) TextToAudioFile(request *schema.TTSRequest) <-chan concurrency.ErrorOr[*string] {
responseChannel := make(chan concurrency.ErrorOr[*string])
go func(request *schema.TTSRequest) {
cfg, err := ttsbs.bcl.LoadBackendConfigFileByName(request.Model, ttsbs.appConfig.ModelPath,
config.LoadOptionDebug(ttsbs.appConfig.Debug),
config.LoadOptionThreads(ttsbs.appConfig.Threads),
config.LoadOptionContextSize(ttsbs.appConfig.ContextSize),
config.LoadOptionF16(ttsbs.appConfig.F16),
)
if err != nil {
responseChannel <- concurrency.ErrorOr[*string]{Error: err}
close(responseChannel)
return
}
if request.Backend != "" {
cfg.Backend = request.Backend
}
outFile, _, err := modelTTS(cfg.Backend, request.Input, cfg.Model, request.Voice, ttsbs.ml, ttsbs.appConfig, cfg)
if err != nil {
responseChannel <- concurrency.ErrorOr[*string]{Error: err}
close(responseChannel)
return
}
responseChannel <- concurrency.ErrorOr[*string]{Value: &outFile}
close(responseChannel)
}(request)
return responseChannel
}
func modelTTS(backend, text, modelFile string, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig *config.BackendConfig) (string, *proto.Result, error) {
func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
bb := backend
if bb == "" {
bb = model.PiperBackend
@@ -68,7 +37,7 @@ func modelTTS(backend, text, modelFile string, voice string, loader *model.Model
grpcOpts := gRPCModelOpts(backendConfig)
opts := modelOpts(&config.BackendConfig{}, appConfig, []model.Option{
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(bb),
model.WithModel(modelFile),
model.WithContext(appConfig.Context),
@@ -84,7 +53,7 @@ func modelTTS(backend, text, modelFile string, voice string, loader *model.Model
return "", nil, fmt.Errorf("could not load piper model")
}
if err := os.MkdirAll(appConfig.AudioDir, 0755); err != nil {
if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
}
@@ -118,19 +87,3 @@ func modelTTS(backend, text, modelFile string, voice string, loader *model.Model
return filePath, res, err
}
func generateUniqueFileName(dir, baseName, ext string) string {
counter := 1
fileName := baseName + ext
for {
filePath := filepath.Join(dir, fileName)
_, err := os.Stat(filePath)
if os.IsNotExist(err) {
return fileName
}
counter++
fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
}
}
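
ModelTTS is now exported and callable without the removed TextToSpeechBackendService wrapper; an empty backend string still falls back to model.PiperBackend, as shown above. A short sketch with illustrative text, model and voice values:

package example

import (
    "github.com/go-skynet/LocalAI/core/backend"
    "github.com/go-skynet/LocalAI/core/config"
    model "github.com/go-skynet/LocalAI/pkg/model"
)

func ttsExample(ml *model.ModelLoader, appConfig *config.ApplicationConfig) (string, error) {
    bc := config.BackendConfig{}
    bc.SetDefaults()

    // Empty backend string selects model.PiperBackend inside ModelTTS.
    filePath, _, err := backend.ModelTTS("", "Hello from LocalAI", "voice-en-us.onnx", "", ml, appConfig, bc)
    return filePath, err
}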

View File

@@ -4,7 +4,7 @@ import "embed"
type Context struct {
Debug bool `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug" help:"Set the level of logs to output [${enum}]"`
LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`
// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
BackendAssets embed.FS `kong:"-"`

View File

@@ -25,7 +25,7 @@ type ModelsInstall struct {
}
type ModelsCMD struct {
List ModelsList `cmd:"" help:"List the models avaiable in your galleries" default:"withargs"`
List ModelsList `cmd:"" help:"List the models available in your galleries" default:"withargs"`
Install ModelsInstall `cmd:"" help:"Install a model from the gallery"`
}
@@ -64,7 +64,11 @@ func (mi *ModelsInstall) Run(ctx *Context) error {
progressbar.OptionClearOnFinish(),
)
progressCallback := func(fileName string, current string, total string, percentage float64) {
progressBar.Set(int(percentage * 10))
v := int(percentage * 10)
err := progressBar.Set(v)
if err != nil {
log.Error().Err(err).Str("filename", fileName).Int("value", v).Msg("error while updating progress bar")
}
}
err := gallery.InstallModelFromGallery(galleries, modelName, mi.ModelsPath, gallery.GalleryModel{}, progressCallback)
if err != nil {

View File

@@ -2,30 +2,31 @@ package cli
import (
"fmt"
"os"
"strings"
"time"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/http"
"github.com/go-skynet/LocalAI/core/startup"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
type RunCMD struct {
ModelArgs []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
ImagePath string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
AudioPath string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
ImagePath string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
AudioPath string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
LocalaiConfigDirPollInterval time.Duration `env:"LOCALAI_CONFIG_DIR_POLL_INTERVAL" help:"Typically the config path picks up changes automatically, but if your system has broken fsnotify events, set this to an interval to poll the LocalAI Config Dir (example: 1m)" group:"storage"`
// The alias on this option is there to preserve functionality with the old `--config-file` parameter
ModelsConfigFile string `env:"LOCALAI_MODELS_CONFIG_FILE,CONFIG_FILE" aliases:"config-file" help:"YAML file containing a list of model backend configs" group:"storage"`
Galleries string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models"`
Galleries string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
AutoloadGalleries bool `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
RemoteLibrary string `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
PreloadModels string `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
@@ -41,7 +42,7 @@ type RunCMD struct {
CORSAllowOrigins string `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
DisableWelcome bool `env:"LOCALAI_DISABLE_WELCOME,DISABLE_WELCOME" default:"false" help:"Disable welcome pages" group:"api"`
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
@@ -60,15 +61,16 @@ func (r *RunCMD) Run(ctx *Context) error {
config.WithYAMLConfigPreload(r.PreloadModelsConfig),
config.WithModelPath(r.ModelsPath),
config.WithContextSize(r.ContextSize),
config.WithDebug(*ctx.LogLevel == "debug"),
config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
config.WithImageDir(r.ImagePath),
config.WithAudioDir(r.AudioPath),
config.WithUploadDir(r.UploadPath),
config.WithConfigsDir(r.ConfigPath),
config.WithDynamicConfigDir(r.LocalaiConfigDir),
config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
config.WithF16(r.F16),
config.WithStringGalleries(r.Galleries),
config.WithModelLibraryURL(r.RemoteLibrary),
config.WithDisableMessage(false),
config.WithCors(r.CORS),
config.WithCorsAllowOrigins(r.CORSAllowOrigins),
config.WithThreads(r.Threads),
@@ -82,8 +84,8 @@ func (r *RunCMD) Run(ctx *Context) error {
idleWatchDog := r.EnableWatchdogIdle
busyWatchDog := r.EnableWatchdogBusy
if r.DisableWelcome {
opts = append(opts, config.DisableWelcomePage)
if r.DisableWebUI {
opts = append(opts, config.DisableWebUI)
}
if idleWatchDog || busyWatchDog {
@@ -124,28 +126,16 @@ func (r *RunCMD) Run(ctx *Context) error {
}
if r.PreloadBackendOnly {
_, err := startup.Startup(opts...)
_, _, _, err := startup.Startup(opts...)
return err
}
application, err := startup.Startup(opts...)
cl, ml, options, err := startup.Startup(opts...)
if err != nil {
return fmt.Errorf("failed basic startup tasks with error %s", err.Error())
}
// Watch the configuration directory
// If the directory does not exist, we don't watch it
if _, err := os.Stat(r.LocalaiConfigDir); err == nil {
closeConfigWatcherFn, err := startup.WatchConfigDirectory(r.LocalaiConfigDir, application.ApplicationConfig)
defer closeConfigWatcherFn()
if err != nil {
return fmt.Errorf("failed while watching configuration directory %s", r.LocalaiConfigDir)
}
}
appHTTP, err := http.App(application)
appHTTP, err := http.App(cl, ml, options)
if err != nil {
log.Error().Err(err).Msg("error during HTTP App construction")
return err
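
Condensed, the new wiring above replaces the single application object with the three values returned by startup.Startup. A sketch under the same imports as run.go, with error handling and the rest of the command elided:

package example

import (
    "github.com/go-skynet/LocalAI/core/config"
    "github.com/go-skynet/LocalAI/core/http"
    "github.com/go-skynet/LocalAI/core/startup"
    "github.com/gofiber/fiber/v2"
)

func buildApp(opts ...config.AppOption) (*fiber.App, error) {
    cl, ml, appCfg, err := startup.Startup(opts...)
    if err != nil {
        return nil, err
    }
    // http.App now takes the loaders and the application config directly.
    return http.App(cl, ml, appCfg)
}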

View File

@@ -7,8 +7,8 @@ import (
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/rs/zerolog/log"
)
type TranscriptCMD struct {
@@ -42,23 +42,18 @@ func (t *TranscriptCMD) Run(ctx *Context) error {
c.Threads = &t.Threads
defer ml.StopAllGRPC()
defer func() {
err := ml.StopAllGRPC()
if err != nil {
log.Error().Err(err).Msg("unable to stop all grpc processes")
}
}()
tbs := backend.NewTranscriptionBackendService(ml, cl, opts)
resultChannel := tbs.Transcribe(&schema.OpenAIRequest{
PredictionOptions: schema.PredictionOptions{
Language: t.Language,
},
File: t.Filename,
})
r := <-resultChannel
if r.Error != nil {
return r.Error
tr, err := backend.ModelTranscription(t.Filename, t.Language, ml, c, opts)
if err != nil {
return err
}
for _, segment := range r.Value.Segments {
for _, segment := range tr.Segments {
fmt.Println(segment.Start.String(), "-", segment.Text)
}
return nil

View File

@@ -9,8 +9,8 @@ import (
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/rs/zerolog/log"
)
type TTSCMD struct {
@@ -41,31 +41,27 @@ func (t *TTSCMD) Run(ctx *Context) error {
}
ml := model.NewModelLoader(opts.ModelPath)
defer ml.StopAllGRPC()
defer func() {
err := ml.StopAllGRPC()
if err != nil {
log.Error().Err(err).Msg("unable to stop all grpc processes")
}
}()
ttsbs := backend.NewTextToSpeechBackendService(ml, config.NewBackendConfigLoader(), opts)
options := config.BackendConfig{}
options.SetDefaults()
request := &schema.TTSRequest{
Model: t.Model,
Input: text,
Backend: t.Backend,
Voice: t.Voice,
}
resultsChannel := ttsbs.TextToAudioFile(request)
rawResult := <-resultsChannel
if rawResult.Error != nil {
return rawResult.Error
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
if err != nil {
return err
}
if outputFile != "" {
if err := os.Rename(*rawResult.Value, outputFile); err != nil {
if err := os.Rename(filePath, outputFile); err != nil {
return err
}
fmt.Printf("Generated file %q\n", outputFile)
fmt.Printf("Generate file %s\n", outputFile)
} else {
fmt.Printf("Generated file %q\n", *rawResult.Value)
fmt.Printf("Generate file %s\n", filePath)
}
return nil
}

View File

@@ -15,13 +15,15 @@ type ApplicationConfig struct {
ConfigFile string
ModelPath string
UploadLimitMB, Threads, ContextSize int
DisableWelcomePage bool
DisableWebUI bool
F16 bool
Debug, DisableMessage bool
Debug bool
ImageDir string
AudioDir string
UploadDir string
ConfigsDir string
DynamicConfigsDir string
DynamicConfigsDirPollInterval time.Duration
CORS bool
PreloadJSONModels string
PreloadModelsFromPath string
@@ -55,12 +57,11 @@ type AppOption func(*ApplicationConfig)
func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
opt := &ApplicationConfig{
Context: context.Background(),
UploadLimitMB: 15,
Threads: 1,
ContextSize: 512,
Debug: true,
DisableMessage: true,
Context: context.Background(),
UploadLimitMB: 15,
Threads: 1,
ContextSize: 512,
Debug: true,
}
for _, oo := range o {
oo(opt)
@@ -106,8 +107,8 @@ var EnableWatchDogBusyCheck = func(o *ApplicationConfig) {
o.WatchDogBusy = true
}
var DisableWelcomePage = func(o *ApplicationConfig) {
o.DisableWelcomePage = true
var DisableWebUI = func(o *ApplicationConfig) {
o.DisableWebUI = true
}
func SetWatchDogBusyTimeout(t time.Duration) AppOption {
@@ -234,12 +235,6 @@ func WithDebug(debug bool) AppOption {
}
}
func WithDisableMessage(disableMessage bool) AppOption {
return func(o *ApplicationConfig) {
o.DisableMessage = disableMessage
}
}
func WithAudioDir(audioDir string) AppOption {
return func(o *ApplicationConfig) {
o.AudioDir = audioDir
@@ -264,6 +259,18 @@ func WithConfigsDir(configsDir string) AppOption {
}
}
func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
return func(o *ApplicationConfig) {
o.DynamicConfigsDir = dynamicConfigsDir
}
}
func WithDynamicConfigDirPollInterval(interval time.Duration) AppOption {
return func(o *ApplicationConfig) {
o.DynamicConfigsDirPollInterval = interval
}
}
func WithApiKeys(apiKeys []string) AppOption {
return func(o *ApplicationConfig) {
o.ApiKeys = apiKeys
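
The options added above compose like any other AppOption; a sketch with illustrative values:

package example

import (
    "time"

    "github.com/go-skynet/LocalAI/core/config"
)

func newAppConfig() *config.ApplicationConfig {
    return config.NewApplicationConfig(
        // Directory and poll interval are illustrative values.
        config.WithDynamicConfigDir("/tmp/localai/configuration"),
        config.WithDynamicConfigDirPollInterval(time.Minute),
        config.DisableWebUI,
    )
}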

View File

@@ -1,7 +1,12 @@
package config
import (
"os"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/downloader"
"github.com/go-skynet/LocalAI/pkg/functions"
"github.com/go-skynet/LocalAI/pkg/utils"
)
const (
@@ -24,7 +29,7 @@ type BackendConfig struct {
InputToken [][]int `yaml:"-"`
functionCallString, functionCallNameString string `yaml:"-"`
FunctionsConfig Functions `yaml:"function"`
FunctionsConfig functions.FunctionsConfig `yaml:"function"`
FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
// LLM configs (GPT4ALL, Llama.cpp, ...)
@@ -124,6 +129,7 @@ type LLMConfig struct {
EnforceEager bool `yaml:"enforce_eager"` // vLLM
SwapSpace int `yaml:"swap_space"` // vLLM
MaxModelLen int `yaml:"max_model_len"` // vLLM
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
MMProj string `yaml:"mmproj"`
RopeScaling string `yaml:"rope_scaling"`
@@ -142,13 +148,6 @@ type AutoGPTQ struct {
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
}
type Functions struct {
DisableNoAction bool `yaml:"disable_no_action"`
NoActionFunctionName string `yaml:"no_action_function_name"`
NoActionDescriptionName string `yaml:"no_action_description_name"`
ParallelCalls bool `yaml:"parallel_calls"`
}
type TemplateConfig struct {
Chat string `yaml:"chat"`
ChatMessage string `yaml:"chat_message"`
@@ -174,6 +173,36 @@ func (c *BackendConfig) ShouldCallSpecificFunction() bool {
return len(c.functionCallNameString) > 0
}
// MMProjFileName returns the filename of the MMProj file
// If the MMProj is a URL, it will return the MD5 of the URL which is the filename
func (c *BackendConfig) MMProjFileName() string {
modelURL := downloader.ConvertURL(c.MMProj)
if downloader.LooksLikeURL(modelURL) {
return utils.MD5(modelURL)
}
return c.MMProj
}
func (c *BackendConfig) IsMMProjURL() bool {
return downloader.LooksLikeURL(downloader.ConvertURL(c.MMProj))
}
func (c *BackendConfig) IsModelURL() bool {
return downloader.LooksLikeURL(downloader.ConvertURL(c.Model))
}
// ModelFileName returns the filename of the model
// If the model is a URL, it will return the MD5 of the URL which is the filename
func (c *BackendConfig) ModelFileName() string {
modelURL := downloader.ConvertURL(c.Model)
if downloader.LooksLikeURL(modelURL) {
return utils.MD5(modelURL)
}
return c.Model
}
func (c *BackendConfig) FunctionToCall() string {
if c.functionCallNameString != "" &&
c.functionCallNameString != "none" && c.functionCallNameString != "auto" {
@@ -184,7 +213,7 @@ func (c *BackendConfig) FunctionToCall() string {
}
func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
lo := &ConfigLoaderOptions{}
lo := &LoadOptions{}
lo.Apply(opts...)
ctx := lo.ctxSize
@@ -195,15 +224,15 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
defaultTopP := 0.95
defaultTopK := 40
defaultTemp := 0.9
defaultMaxTokens := 2048
defaultMirostat := 2
defaultMirostatTAU := 5.0
defaultMirostatETA := 0.1
defaultTypicalP := 1.0
defaultTFZ := 1.0
defaultZero := 0
// Try to offload all GPU layers (if GPU is found)
defaultNGPULayers := 99999999
defaultHigh := 99999999
trueV := true
falseV := false
@@ -228,7 +257,13 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
if cfg.MMap == nil {
// MMap is enabled by default
cfg.MMap = &trueV
// Only exception is for Intel GPUs
if os.Getenv("XPU") != "" {
cfg.MMap = &falseV
} else {
cfg.MMap = &trueV
}
}
if cfg.MMlock == nil {
@@ -244,7 +279,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
}
if cfg.Maxtokens == nil {
cfg.Maxtokens = &defaultMaxTokens
cfg.Maxtokens = &defaultZero
}
if cfg.Mirostat == nil {
@@ -259,7 +294,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.MirostatTAU = &defaultMirostatTAU
}
if cfg.NGPULayers == nil {
cfg.NGPULayers = &defaultNGPULayers
cfg.NGPULayers = &defaultHigh
}
if cfg.LowVRAM == nil {
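
The new URL helpers above let Preload (and any caller) resolve a model URL to a deterministic local filename. A sketch using a placeholder URL:

package example

import (
    "fmt"

    "github.com/go-skynet/LocalAI/core/config"
)

func resolveModelFile() {
    c := config.BackendConfig{}
    c.Model = "https://example.com/some-model.gguf" // placeholder URL
    if c.IsModelURL() {
        // MD5 of the expanded URL; Preload downloads to this filename under the model path.
        fmt.Println(c.ModelFileName())
    }
}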

View File

@@ -1,7 +1,6 @@
package config
import (
"encoding/json"
"errors"
"fmt"
"io/fs"
@@ -14,10 +13,9 @@ import (
"github.com/charmbracelet/glamour"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/downloader"
"github.com/go-skynet/LocalAI/pkg/grammar"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/rs/zerolog/log"
"gopkg.in/yaml.v2"
"gopkg.in/yaml.v3"
)
type BackendConfigLoader struct {
@@ -25,96 +23,185 @@ type BackendConfigLoader struct {
sync.Mutex
}
type ConfigLoaderOptions struct {
type LoadOptions struct {
debug bool
threads, ctxSize int
f16 bool
}
func LoadOptionDebug(debug bool) ConfigLoaderOption {
return func(o *ConfigLoaderOptions) {
return func(o *LoadOptions) {
o.debug = debug
}
}
func LoadOptionThreads(threads int) ConfigLoaderOption {
return func(o *ConfigLoaderOptions) {
return func(o *LoadOptions) {
o.threads = threads
}
}
func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
return func(o *ConfigLoaderOptions) {
return func(o *LoadOptions) {
o.ctxSize = ctxSize
}
}
func LoadOptionF16(f16 bool) ConfigLoaderOption {
return func(o *ConfigLoaderOptions) {
return func(o *LoadOptions) {
o.f16 = f16
}
}
type ConfigLoaderOption func(*ConfigLoaderOptions)
type ConfigLoaderOption func(*LoadOptions)
func (lo *ConfigLoaderOptions) Apply(options ...ConfigLoaderOption) {
func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
for _, l := range options {
l(lo)
}
}
// Load a config file for a model
func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
// Load a config file if present after the model name
cfg := &BackendConfig{
PredictionOptions: schema.PredictionOptions{
Model: modelName,
},
}
cfgExisting, exists := cl.GetBackendConfig(modelName)
if exists {
cfg = &cfgExisting
} else {
// Try loading a model config file
modelConfig := filepath.Join(modelPath, modelName+".yaml")
if _, err := os.Stat(modelConfig); err == nil {
if err := cl.LoadBackendConfig(
modelConfig, opts...,
); err != nil {
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
}
cfgExisting, exists = cl.GetBackendConfig(modelName)
if exists {
cfg = &cfgExisting
}
}
}
cfg.SetDefaults(opts...)
return cfg, nil
}
func NewBackendConfigLoader() *BackendConfigLoader {
return &BackendConfigLoader{
configs: make(map[string]BackendConfig),
}
}
func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
c := &[]*BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
func (bcl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
bcl.Lock()
defer bcl.Unlock()
c, err := readBackendConfig(file, opts...)
for _, cc := range *c {
cc.SetDefaults(opts...)
}
return *c, nil
}
func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
lo := &LoadOptions{}
lo.Apply(opts...)
c := &BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
c.SetDefaults(opts...)
return c, nil
}
func (cm *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
cm.Lock()
defer cm.Unlock()
c, err := ReadBackendConfigFile(file, opts...)
if err != nil {
return fmt.Errorf("cannot load config file: %w", err)
}
for _, cc := range c {
cm.configs[cc.Name] = *cc
}
return nil
}
func (cl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
cl.Lock()
defer cl.Unlock()
c, err := ReadBackendConfig(file, opts...)
if err != nil {
return fmt.Errorf("cannot read config file: %w", err)
}
bcl.configs[c.Name] = *c
cl.configs[c.Name] = *c
return nil
}
func (bcl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
bcl.Lock()
defer bcl.Unlock()
v, exists := bcl.configs[m]
func (cl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
cl.Lock()
defer cl.Unlock()
v, exists := cl.configs[m]
return v, exists
}
func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
bcl.Lock()
defer bcl.Unlock()
func (cl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
cl.Lock()
defer cl.Unlock()
var res []BackendConfig
for _, v := range bcl.configs {
for _, v := range cl.configs {
res = append(res, v)
}
sort.SliceStable(res, func(i, j int) bool {
return res[i].Name < res[j].Name
})
return res
}
func (bcl *BackendConfigLoader) ListBackendConfigs() []string {
bcl.Lock()
defer bcl.Unlock()
func (cl *BackendConfigLoader) RemoveBackendConfig(m string) {
cl.Lock()
defer cl.Unlock()
delete(cl.configs, m)
}
func (cl *BackendConfigLoader) ListBackendConfigs() []string {
cl.Lock()
defer cl.Unlock()
var res []string
for k := range bcl.configs {
for k := range cl.configs {
res = append(res, k)
}
return res
}
// Preload prepare models if they are not local but url or huggingface repositories
func (bcl *BackendConfigLoader) Preload(modelPath string) error {
bcl.Lock()
defer bcl.Unlock()
func (cl *BackendConfigLoader) Preload(modelPath string) error {
cl.Lock()
defer cl.Unlock()
status := func(fileName, current, total string, percent float64) {
utils.DisplayDownloadFunction(fileName, current, total, percent)
@@ -136,10 +223,10 @@ func (bcl *BackendConfigLoader) Preload(modelPath string) error {
}
}
for i, config := range bcl.configs {
for i, config := range cl.configs {
// Download files and verify their SHA
for _, file := range config.DownloadFiles {
for i, file := range config.DownloadFiles {
log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)
if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
@@ -148,49 +235,66 @@ func (bcl *BackendConfigLoader) Preload(modelPath string) error {
// Create file path
filePath := filepath.Join(modelPath, file.Filename)
if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, i, len(config.DownloadFiles), status); err != nil {
return err
}
}
modelURL := config.PredictionOptions.Model
modelURL = downloader.ConvertURL(modelURL)
if downloader.LooksLikeURL(modelURL) {
// md5 of model name
md5Name := utils.MD5(modelURL)
// If the model is an URL, expand it, and download the file
if config.IsModelURL() {
modelFileName := config.ModelFileName()
modelURL := downloader.ConvertURL(config.Model)
// check if file exists
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
if _, err := os.Stat(filepath.Join(modelPath, modelFileName)); errors.Is(err, os.ErrNotExist) {
err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, modelFileName), "", 0, 0, status)
if err != nil {
return err
}
}
cc := bcl.configs[i]
cc := cl.configs[i]
c := &cc
c.PredictionOptions.Model = md5Name
bcl.configs[i] = *c
c.PredictionOptions.Model = modelFileName
cl.configs[i] = *c
}
if bcl.configs[i].Name != "" {
glamText(fmt.Sprintf("**Model name**: _%s_", bcl.configs[i].Name))
if config.IsMMProjURL() {
modelFileName := config.MMProjFileName()
modelURL := downloader.ConvertURL(config.MMProj)
// check if file exists
if _, err := os.Stat(filepath.Join(modelPath, modelFileName)); errors.Is(err, os.ErrNotExist) {
err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, modelFileName), "", 0, 0, status)
if err != nil {
return err
}
}
cc := cl.configs[i]
c := &cc
c.MMProj = modelFileName
cl.configs[i] = *c
}
if bcl.configs[i].Description != "" {
if cl.configs[i].Name != "" {
glamText(fmt.Sprintf("**Model name**: _%s_", cl.configs[i].Name))
}
if cl.configs[i].Description != "" {
//glamText("**Description**")
glamText(bcl.configs[i].Description)
glamText(cl.configs[i].Description)
}
if bcl.configs[i].Usage != "" {
if cl.configs[i].Usage != "" {
//glamText("**Usage**")
glamText(bcl.configs[i].Usage)
glamText(cl.configs[i].Usage)
}
}
return nil
}
func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
bcl.Lock()
defer bcl.Unlock()
// LoadBackendConfigsFromPath reads all the configurations of the models from a path
// (non-recursive)
func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
cm.Lock()
defer cm.Unlock()
entries, err := os.ReadDir(path)
if err != nil {
return err
@@ -205,305 +309,15 @@ func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...
}
for _, file := range files {
// Skip templates, YAML and .keep files
if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") ||
strings.HasPrefix(file.Name(), ".") {
continue
}
c, err := readBackendConfig(filepath.Join(path, file.Name()), opts...)
c, err := ReadBackendConfig(filepath.Join(path, file.Name()), opts...)
if err == nil {
bcl.configs[c.Name] = *c
cm.configs[c.Name] = *c
}
}
return nil
}
func (bcl *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
bcl.Lock()
defer bcl.Unlock()
c, err := readBackendConfigFile(file, opts...)
if err != nil {
return fmt.Errorf("cannot load config file: %w", err)
}
for _, cc := range c {
bcl.configs[cc.Name] = *cc
}
return nil
}
//////////
// Load a config file for a model
func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName string, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
// Load a config file if present after the model name
cfg := &BackendConfig{
PredictionOptions: schema.PredictionOptions{
Model: modelName,
},
}
cfgExisting, exists := bcl.GetBackendConfig(modelName)
if exists {
cfg = &cfgExisting
} else {
// Load a config file if present after the model name
modelConfig := filepath.Join(modelPath, modelName+".yaml")
if _, err := os.Stat(modelConfig); err == nil {
if err := bcl.LoadBackendConfig(modelConfig); err != nil {
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
}
cfgExisting, exists = bcl.GetBackendConfig(modelName)
if exists {
cfg = &cfgExisting
}
}
}
cfg.SetDefaults(opts...)
return cfg, nil
}
func readBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
c := &[]*BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
for _, cc := range *c {
cc.SetDefaults(opts...)
}
return *c, nil
}
func readBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
c := &BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
c.SetDefaults(opts...)
return c, nil
}
func (bcl *BackendConfigLoader) LoadBackendConfigForModelAndOpenAIRequest(modelFile string, input *schema.OpenAIRequest, appConfig *ApplicationConfig) (*BackendConfig, *schema.OpenAIRequest, error) {
cfg, err := bcl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
LoadOptionContextSize(appConfig.ContextSize),
LoadOptionDebug(appConfig.Debug),
LoadOptionF16(appConfig.F16),
LoadOptionThreads(appConfig.Threads),
)
// Set the parameters for the language model prediction
updateBackendConfigFromOpenAIRequest(cfg, input)
return cfg, input, err
}
func updateBackendConfigFromOpenAIRequest(bc *BackendConfig, request *schema.OpenAIRequest) {
if request.Echo {
bc.Echo = request.Echo
}
if request.TopK != nil && *request.TopK != 0 {
bc.TopK = request.TopK
}
if request.TopP != nil && *request.TopP != 0 {
bc.TopP = request.TopP
}
if request.Backend != "" {
bc.Backend = request.Backend
}
if request.ClipSkip != 0 {
bc.Diffusers.ClipSkip = request.ClipSkip
}
if request.ModelBaseName != "" {
bc.AutoGPTQ.ModelBaseName = request.ModelBaseName
}
if request.NegativePromptScale != 0 {
bc.NegativePromptScale = request.NegativePromptScale
}
if request.UseFastTokenizer {
bc.UseFastTokenizer = request.UseFastTokenizer
}
if request.NegativePrompt != "" {
bc.NegativePrompt = request.NegativePrompt
}
if request.RopeFreqBase != 0 {
bc.RopeFreqBase = request.RopeFreqBase
}
if request.RopeFreqScale != 0 {
bc.RopeFreqScale = request.RopeFreqScale
}
if request.Grammar != "" {
bc.Grammar = request.Grammar
}
if request.Temperature != nil && *request.Temperature != 0 {
bc.Temperature = request.Temperature
}
if request.Maxtokens != nil && *request.Maxtokens != 0 {
bc.Maxtokens = request.Maxtokens
}
switch stop := request.Stop.(type) {
case string:
if stop != "" {
bc.StopWords = append(bc.StopWords, stop)
}
case []interface{}:
for _, pp := range stop {
if s, ok := pp.(string); ok {
bc.StopWords = append(bc.StopWords, s)
}
}
}
if len(request.Tools) > 0 {
for _, tool := range request.Tools {
request.Functions = append(request.Functions, tool.Function)
}
}
if request.ToolsChoice != nil {
var toolChoice grammar.Tool
switch content := request.ToolsChoice.(type) {
case string:
_ = json.Unmarshal([]byte(content), &toolChoice)
case map[string]interface{}:
dat, _ := json.Marshal(content)
_ = json.Unmarshal(dat, &toolChoice)
}
request.FunctionCall = map[string]interface{}{
"name": toolChoice.Function.Name,
}
}
// Decode each request's message content
index := 0
for i, m := range request.Messages {
switch content := m.Content.(type) {
case string:
request.Messages[i].StringContent = content
case []interface{}:
dat, _ := json.Marshal(content)
c := []schema.Content{}
json.Unmarshal(dat, &c)
for _, pp := range c {
if pp.Type == "text" {
request.Messages[i].StringContent = pp.Text
} else if pp.Type == "image_url" {
// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
if err == nil {
request.Messages[i].StringImages = append(request.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
request.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + request.Messages[i].StringContent
index++
} else {
fmt.Print("Failed encoding image", err)
}
}
}
}
}
if request.RepeatPenalty != 0 {
bc.RepeatPenalty = request.RepeatPenalty
}
if request.FrequencyPenalty != 0 {
bc.FrequencyPenalty = request.FrequencyPenalty
}
if request.PresencePenalty != 0 {
bc.PresencePenalty = request.PresencePenalty
}
if request.Keep != 0 {
bc.Keep = request.Keep
}
if request.Batch != 0 {
bc.Batch = request.Batch
}
if request.IgnoreEOS {
bc.IgnoreEOS = request.IgnoreEOS
}
if request.Seed != nil {
bc.Seed = request.Seed
}
if request.TypicalP != nil {
bc.TypicalP = request.TypicalP
}
switch inputs := request.Input.(type) {
case string:
if inputs != "" {
bc.InputStrings = append(bc.InputStrings, inputs)
}
case []interface{}:
for _, pp := range inputs {
switch i := pp.(type) {
case string:
bc.InputStrings = append(bc.InputStrings, i)
case []interface{}:
tokens := []int{}
for _, ii := range i {
tokens = append(tokens, int(ii.(float64)))
}
bc.InputToken = append(bc.InputToken, tokens)
}
}
}
// Can be either a string or an object
switch fnc := request.FunctionCall.(type) {
case string:
if fnc != "" {
bc.SetFunctionCallString(fnc)
}
case map[string]interface{}:
var name string
n, exists := fnc["name"]
if exists {
nn, e := n.(string)
if e {
name = nn
}
}
bc.SetFunctionCallNameString(name)
}
switch p := request.Prompt.(type) {
case string:
bc.PromptStrings = append(bc.PromptStrings, p)
case []interface{}:
for _, pp := range p {
if s, ok := pp.(string); ok {
bc.PromptStrings = append(bc.PromptStrings, s)
}
}
}
}
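
With readBackendConfig and readBackendConfigFile now exported (and the test-only re-export file removed below), other packages and tests call them directly. A sketch with an illustrative path and load options:

package example

import (
    "github.com/go-skynet/LocalAI/core/config"
)

func readOne() (*config.BackendConfig, error) {
    // Path and load options are illustrative.
    return config.ReadBackendConfig("/tmp/models/my-model.yaml",
        config.LoadOptionDebug(true),
        config.LoadOptionThreads(4),
        config.LoadOptionContextSize(512),
        config.LoadOptionF16(false),
    )
}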

View File

@@ -1,6 +0,0 @@
package config
// This file re-exports private functions to be used directly in unit tests.
// Since this file's name ends in _test.go, theoretically these should not be exposed past the tests.
var ReadBackendConfigFile = readBackendConfigFile

View File

@@ -1,278 +0,0 @@
package http
import (
"errors"
"strings"
"github.com/go-skynet/LocalAI/core"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/gofiber/swagger" // swagger handler
"github.com/go-skynet/LocalAI/core/http/endpoints/elevenlabs"
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/internal"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
"github.com/gofiber/fiber/v2/middleware/logger"
"github.com/gofiber/fiber/v2/middleware/recover"
)
func readAuthHeader(c *fiber.Ctx) string {
authHeader := c.Get("Authorization")
// elevenlabs
xApiKey := c.Get("xi-api-key")
if xApiKey != "" {
authHeader = "Bearer " + xApiKey
}
// anthropic
xApiKey = c.Get("x-api-key")
if xApiKey != "" {
authHeader = "Bearer " + xApiKey
}
return authHeader
}
// @title LocalAI API
// @version 2.0.0
// @description The LocalAI Rest API.
// @termsOfService
// @contact.name LocalAI
// @contact.url https://localai.io
// @license.name MIT
// @license.url https://raw.githubusercontent.com/mudler/LocalAI/master/LICENSE
// @BasePath /
// @securityDefinitions.apikey BearerAuth
// @in header
// @name Authorization
func App(application *core.Application) (*fiber.App, error) {
// Return errors as JSON responses
app := fiber.New(fiber.Config{
Views: renderEngine(),
BodyLimit: application.ApplicationConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
DisableStartupMessage: application.ApplicationConfig.DisableMessage,
// Override default error handler
ErrorHandler: func(ctx *fiber.Ctx, err error) error {
// Status code defaults to 500
code := fiber.StatusInternalServerError
// Retrieve the custom status code if it's a *fiber.Error
var e *fiber.Error
if errors.As(err, &e) {
code = e.Code
}
// Send custom error page
return ctx.Status(code).JSON(
schema.ErrorResponse{
Error: &schema.APIError{Message: err.Error(), Code: code},
},
)
},
})
if application.ApplicationConfig.Debug {
app.Use(logger.New(logger.Config{
Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
}))
}
// Default middleware config
if !application.ApplicationConfig.Debug {
app.Use(recover.New())
}
metricsService, err := services.NewLocalAIMetricsService()
if err != nil {
return nil, err
}
if metricsService != nil {
app.Use(localai.LocalAIMetricsAPIMiddleware(metricsService))
app.Hooks().OnShutdown(func() error {
return metricsService.Shutdown()
})
}
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
auth := func(c *fiber.Ctx) error {
if len(application.ApplicationConfig.ApiKeys) == 0 {
return c.Next()
}
authHeader := readAuthHeader(c)
if authHeader == "" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
}
// If it's a bearer token
authHeaderParts := strings.Split(authHeader, " ")
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
}
apiKey := authHeaderParts[1]
for _, key := range application.ApplicationConfig.ApiKeys {
if apiKey == key {
return c.Next()
}
}
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
}
if application.ApplicationConfig.CORS {
var c func(ctx *fiber.Ctx) error
if application.ApplicationConfig.CORSAllowOrigins == "" {
c = cors.New()
} else {
c = cors.New(cors.Config{AllowOrigins: application.ApplicationConfig.CORSAllowOrigins})
}
app.Use(c)
}
fiberContextExtractor := fiberContext.NewFiberContextExtractor(application.ModelLoader, application.ApplicationConfig)
// LocalAI API endpoints
galleryService := services.NewGalleryService(application.ApplicationConfig.ModelPath)
galleryService.Start(application.ApplicationConfig.Context, application.BackendConfigLoader)
app.Get("/version", auth, func(c *fiber.Ctx) error {
return c.JSON(struct {
Version string `json:"version"`
}{Version: internal.PrintableVersion()})
})
app.Get("/swagger/*", swagger.HandlerDefault) // default
welcomeRoute(
app,
application.BackendConfigLoader,
application.ModelLoader,
application.ApplicationConfig,
auth,
)
modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(application.ApplicationConfig.Galleries, application.ApplicationConfig.ModelPath, galleryService)
app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
app.Post("/models/galleries", auth, modelGalleryEndpointService.AddModelGalleryEndpoint())
app.Delete("/models/galleries", auth, modelGalleryEndpointService.RemoveModelGalleryEndpoint())
app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())
// Stores
storeLoader := model.NewModelLoader("") // TODO: Investigate if this should be migrated to application and reused. Should the path be configurable? Merging for now.
app.Post("/stores/set", auth, localai.StoresSetEndpoint(storeLoader, application.ApplicationConfig))
app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(storeLoader, application.ApplicationConfig))
app.Post("/stores/get", auth, localai.StoresGetEndpoint(storeLoader, application.ApplicationConfig))
app.Post("/stores/find", auth, localai.StoresFindEndpoint(storeLoader, application.ApplicationConfig))
// openAI compatible API endpoints
// chat
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(fiberContextExtractor, application.OpenAIService))
app.Post("/chat/completions", auth, openai.ChatEndpoint(fiberContextExtractor, application.OpenAIService))
// edit
app.Post("/v1/edits", auth, openai.EditEndpoint(fiberContextExtractor, application.OpenAIService))
app.Post("/edits", auth, openai.EditEndpoint(fiberContextExtractor, application.OpenAIService))
// assistant
// TODO: Refactor this to the new style eventually
app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Get("/assistants", auth, openai.ListAssistantsEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Post("/assistants", auth, openai.CreateAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
// files
app.Post("/v1/files", auth, openai.UploadFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Post("/files", auth, openai.UploadFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/v1/files", auth, openai.ListFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/files", auth, openai.ListFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
// completion
app.Post("/v1/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))
app.Post("/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))
// embeddings
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))
// audio
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(fiberContextExtractor, application.TranscriptionBackendService))
app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))
// images
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(fiberContextExtractor, application.ImageGenerationBackendService))
// Elevenlabs
app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))
// LocalAI TTS?
app.Post("/tts", auth, localai.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))
if application.ApplicationConfig.ImageDir != "" {
app.Static("/generated-images", application.ApplicationConfig.ImageDir)
}
if application.ApplicationConfig.AudioDir != "" {
app.Static("/generated-audio", application.ApplicationConfig.AudioDir)
}
ok := func(c *fiber.Ctx) error {
return c.SendStatus(200)
}
// Kubernetes health checks
app.Get("/healthz", ok)
app.Get("/readyz", ok)
// Experimental Backend Statistics Module
app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(application.BackendMonitorService))
app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(application.BackendMonitorService))
// models
app.Get("/v1/models", auth, openai.ListModelsEndpoint(application.ListModelsService))
app.Get("/models", auth, openai.ListModelsEndpoint(application.ListModelsService))
app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())
// Define a custom 404 handler
// Note: keep this at the bottom!
app.Use(notFoundHandler)
return app, nil
}

core/http/app.go (new file, 205 lines)
View File

@@ -0,0 +1,205 @@
package http
import (
"embed"
"errors"
"net/http"
"strings"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
"github.com/go-skynet/LocalAI/core/http/routes"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/contrib/fiberzerolog"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
"github.com/gofiber/fiber/v2/middleware/favicon"
"github.com/gofiber/fiber/v2/middleware/filesystem"
"github.com/gofiber/fiber/v2/middleware/recover"
// swagger handler
"github.com/rs/zerolog/log"
)
func readAuthHeader(c *fiber.Ctx) string {
authHeader := c.Get("Authorization")
// elevenlabs
xApiKey := c.Get("xi-api-key")
if xApiKey != "" {
authHeader = "Bearer " + xApiKey
}
// anthropic
xApiKey = c.Get("x-api-key")
if xApiKey != "" {
authHeader = "Bearer " + xApiKey
}
return authHeader
}
// Embed a directory
//
//go:embed static/*
var embedDirStatic embed.FS
// @title LocalAI API
// @version 2.0.0
// @description The LocalAI Rest API.
// @termsOfService
// @contact.name LocalAI
// @contact.url https://localai.io
// @license.name MIT
// @license.url https://raw.githubusercontent.com/mudler/LocalAI/master/LICENSE
// @BasePath /
// @securityDefinitions.apikey BearerAuth
// @in header
// @name Authorization
func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*fiber.App, error) {
// Return errors as JSON responses
app := fiber.New(fiber.Config{
Views: renderEngine(),
BodyLimit: appConfig.UploadLimitMB * 1024 * 1024, // overrides Fiber's default limit of 4MB
// We disable the Fiber startup message as it does not conform to structured logging.
// We register a startup log line with connection information in the OnListen hook instead, to keep things user-friendly.
DisableStartupMessage: true,
// Override default error handler
ErrorHandler: func(ctx *fiber.Ctx, err error) error {
// Status code defaults to 500
code := fiber.StatusInternalServerError
// Retrieve the custom status code if it's a *fiber.Error
var e *fiber.Error
if errors.As(err, &e) {
code = e.Code
}
// Send custom error page
return ctx.Status(code).JSON(
schema.ErrorResponse{
Error: &schema.APIError{Message: err.Error(), Code: code},
},
)
},
})
app.Hooks().OnListen(func(listenData fiber.ListenData) error {
scheme := "http"
if listenData.TLS {
scheme = "https"
}
log.Info().Str("endpoint", scheme+"://"+listenData.Host+":"+listenData.Port).Msg("LocalAI API is listening! Please connect to the endpoint for API documentation.")
return nil
})
// Have Fiber use zerolog like the rest of the application rather than its built-in logger
logger := log.Logger
app.Use(fiberzerolog.New(fiberzerolog.Config{
Logger: &logger,
}))
// Default middleware config
if !appConfig.Debug {
app.Use(recover.New())
}
metricsService, err := services.NewLocalAIMetricsService()
if err != nil {
return nil, err
}
if metricsService != nil {
app.Use(localai.LocalAIMetricsAPIMiddleware(metricsService))
app.Hooks().OnShutdown(func() error {
return metricsService.Shutdown()
})
}
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
auth := func(c *fiber.Ctx) error {
if len(appConfig.ApiKeys) == 0 {
return c.Next()
}
authHeader := readAuthHeader(c)
if authHeader == "" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
}
// If it's a bearer token
authHeaderParts := strings.Split(authHeader, " ")
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
}
apiKey := authHeaderParts[1]
for _, key := range appConfig.ApiKeys {
if apiKey == key {
return c.Next()
}
}
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
}
if appConfig.CORS {
var c func(ctx *fiber.Ctx) error
if appConfig.CORSAllowOrigins == "" {
c = cors.New()
} else {
c = cors.New(cors.Config{AllowOrigins: appConfig.CORSAllowOrigins})
}
app.Use(c)
}
// Load config jsons
utils.LoadConfig(appConfig.UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles)
utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants)
utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles)
galleryService := services.NewGalleryService(appConfig.ModelPath)
galleryService.Start(appConfig.Context, cl)
routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig, auth)
routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService, auth)
routes.RegisterOpenAIRoutes(app, cl, ml, appConfig, auth)
if !appConfig.DisableWebUI {
routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService, auth)
}
routes.RegisterJINARoutes(app, cl, ml, appConfig, auth)
httpFS := http.FS(embedDirStatic)
app.Use(favicon.New(favicon.Config{
URL: "/favicon.ico",
FileSystem: httpFS,
File: "static/favicon.ico",
}))
app.Use("/static", filesystem.New(filesystem.Config{
Root: httpFS,
PathPrefix: "static",
Browse: true,
}))
// Define a custom 404 handler
// Note: keep this at the bottom!
app.Use(notFoundHandler)
return app, nil
}
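A rough sketch of wiring the new App() constructor outside of the test suite, following the call pattern used in the updated tests below (startup.Startup returns the loaders that App now takes). The startup import path, model path, and listen address are assumptions.

package main

import (
    "context"

    "github.com/go-skynet/LocalAI/core/config"
    localhttp "github.com/go-skynet/LocalAI/core/http"
    "github.com/go-skynet/LocalAI/core/startup" // assumed import path for startup.Startup
    "github.com/rs/zerolog/log"
)

func main() {
    ctx := context.Background()
    bcl, ml, appConfig, err := startup.Startup(
        config.WithContext(ctx),
        config.WithModelPath("/models"), // placeholder path
        config.WithDebug(true),
    )
    if err != nil {
        log.Fatal().Err(err).Msg("startup failed")
    }

    app, err := localhttp.App(bcl, ml, appConfig)
    if err != nil {
        log.Fatal().Err(err).Msg("could not build the HTTP app")
    }
    // Blocks until shutdown; /healthz and /readyz are available for probes.
    if err := app.Listen("127.0.0.1:8080"); err != nil {
        log.Fatal().Err(err).Msg("server stopped")
    }
}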

View File

@@ -12,9 +12,7 @@ import (
"os"
"path/filepath"
"runtime"
"strings"
"github.com/go-skynet/LocalAI/core"
"github.com/go-skynet/LocalAI/core/config"
. "github.com/go-skynet/LocalAI/core/http"
"github.com/go-skynet/LocalAI/core/schema"
@@ -207,11 +205,12 @@ var _ = Describe("API test", func() {
var cancel context.CancelFunc
var tmpdir string
var modelDir string
var application *core.Application
var bcl *config.BackendConfigLoader
var ml *model.ModelLoader
var applicationConfig *config.ApplicationConfig
commonOpts := []config.AppOption{
config.WithDebug(true),
config.WithDisableMessage(true),
}
Context("API with ephemeral models", func() {
@@ -223,7 +222,7 @@ var _ = Describe("API test", func() {
modelDir = filepath.Join(tmpdir, "models")
backendAssetsDir := filepath.Join(tmpdir, "backend-assets")
err = os.Mkdir(backendAssetsDir, 0755)
err = os.Mkdir(backendAssetsDir, 0750)
Expect(err).ToNot(HaveOccurred())
c, cancel = context.WithCancel(context.Background())
@@ -242,7 +241,7 @@ var _ = Describe("API test", func() {
}
out, err := yaml.Marshal(g)
Expect(err).ToNot(HaveOccurred())
err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0644)
err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0600)
Expect(err).ToNot(HaveOccurred())
galleries := []gallery.Gallery{
@@ -252,7 +251,7 @@ var _ = Describe("API test", func() {
},
}
application, err = startup.Startup(
bcl, ml, applicationConfig, err = startup.Startup(
append(commonOpts,
config.WithContext(c),
config.WithGalleries(galleries),
@@ -261,7 +260,7 @@ var _ = Describe("API test", func() {
config.WithBackendAssetsOutput(backendAssetsDir))...)
Expect(err).ToNot(HaveOccurred())
app, err = App(application)
app, err = App(bcl, ml, applicationConfig)
Expect(err).ToNot(HaveOccurred())
go app.Listen("127.0.0.1:9090")
@@ -474,11 +473,11 @@ var _ = Describe("API test", func() {
})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp2.Choices)).To(Equal(1))
Expect(resp2.Choices[0].Message.ToolCalls[0].Function).ToNot(BeNil())
Expect(resp2.Choices[0].Message.ToolCalls[0].Function.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.ToolCalls[0].Function.Name)
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
var res map[string]string
err = json.Unmarshal([]byte(resp2.Choices[0].Message.ToolCalls[0].Function.Arguments), &res)
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
Expect(err).ToNot(HaveOccurred())
Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
@@ -487,14 +486,13 @@ var _ = Describe("API test", func() {
})
It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
// if runtime.GOOS != "linux" {
// Skip("test supported only on linux")
// }
modelName := "codellama"
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
modelName := "hermes-2-pro-mistral"
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: "github:go-skynet/model-gallery/codellama-7b-instruct.yaml",
Name: modelName,
Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -504,7 +502,7 @@ var _ = Describe("API test", func() {
Eventually(func() bool {
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
return response["processed"].(bool)
}, "480s", "10s").Should(Equal(true))
}, "360s", "10s").Should(Equal(true))
By("testing chat")
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: modelName, Messages: []openai.ChatCompletionMessage{
@@ -551,15 +549,13 @@ var _ = Describe("API test", func() {
})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp2.Choices)).To(Equal(1))
fmt.Printf("\n--- %+v\n\n", resp2.Choices[0].Message)
Expect(resp2.Choices[0].Message.ToolCalls).ToNot(BeNil())
Expect(resp2.Choices[0].Message.ToolCalls[0]).ToNot(BeNil())
Expect(resp2.Choices[0].Message.ToolCalls[0].Function.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.ToolCalls[0].Function.Name)
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
var res map[string]string
err = json.Unmarshal([]byte(resp2.Choices[0].Message.ToolCalls[0].Function.Arguments), &res)
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
Expect(err).ToNot(HaveOccurred())
Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
})
@@ -599,7 +595,7 @@ var _ = Describe("API test", func() {
Expect(err).ToNot(HaveOccurred())
modelDir = filepath.Join(tmpdir, "models")
backendAssetsDir := filepath.Join(tmpdir, "backend-assets")
err = os.Mkdir(backendAssetsDir, 0755)
err = os.Mkdir(backendAssetsDir, 0750)
Expect(err).ToNot(HaveOccurred())
c, cancel = context.WithCancel(context.Background())
@@ -611,7 +607,7 @@ var _ = Describe("API test", func() {
},
}
application, err = startup.Startup(
bcl, ml, applicationConfig, err = startup.Startup(
append(commonOpts,
config.WithContext(c),
config.WithAudioDir(tmpdir),
@@ -622,7 +618,7 @@ var _ = Describe("API test", func() {
config.WithBackendAssetsOutput(tmpdir))...,
)
Expect(err).ToNot(HaveOccurred())
app, err = App(application)
app, err = App(bcl, ml, applicationConfig)
Expect(err).ToNot(HaveOccurred())
go app.Listen("127.0.0.1:9090")
@@ -712,10 +708,26 @@ var _ = Describe("API test", func() {
// The response should contain a URL
Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
dat, err := io.ReadAll(resp.Body)
Expect(err).ToNot(HaveOccurred(), string(dat))
Expect(string(dat)).To(ContainSubstring("http://127.0.0.1:9090/"), string(dat))
Expect(string(dat)).To(ContainSubstring(".png"), string(dat))
Expect(err).ToNot(HaveOccurred(), "error reading /image/generations response")
imgUrlResp := &schema.OpenAIResponse{}
err = json.Unmarshal(dat, imgUrlResp)
Expect(imgUrlResp.Data).ToNot(Or(BeNil(), BeZero()))
imgUrl := imgUrlResp.Data[0].URL
Expect(imgUrl).To(ContainSubstring("http://127.0.0.1:9090/"), imgUrl)
Expect(imgUrl).To(ContainSubstring(".png"), imgUrl)
imgResp, err := http.Get(imgUrl)
Expect(err).To(BeNil())
Expect(imgResp).ToNot(BeNil())
Expect(imgResp.StatusCode).To(Equal(200))
Expect(imgResp.ContentLength).To(BeNumerically(">", 0))
imgData := make([]byte, 512)
count, err := io.ReadFull(imgResp.Body, imgData)
Expect(err).To(Or(BeNil(), MatchError(io.EOF)))
Expect(count).To(BeNumerically(">", 0))
Expect(count).To(BeNumerically("<=", 512))
Expect(http.DetectContentType(imgData)).To(Equal("image/png"))
})
})
@@ -726,14 +738,14 @@ var _ = Describe("API test", func() {
var err error
application, err = startup.Startup(
bcl, ml, applicationConfig, err = startup.Startup(
append(commonOpts,
config.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
config.WithContext(c),
config.WithModelPath(modelPath),
)...)
Expect(err).ToNot(HaveOccurred())
app, err = App(application)
app, err = App(bcl, ml, applicationConfig)
Expect(err).ToNot(HaveOccurred())
go app.Listen("127.0.0.1:9090")
@@ -763,11 +775,6 @@ var _ = Describe("API test", func() {
Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
})
It("can generate completions via ggml", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel.ggml", Prompt: testPrompt})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
@@ -775,11 +782,6 @@ var _ = Describe("API test", func() {
})
It("can generate chat completions via ggml", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel.ggml", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
@@ -787,11 +789,6 @@ var _ = Describe("API test", func() {
})
It("can generate completions from model configs", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "gpt4all", Prompt: testPrompt})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
@@ -799,11 +796,6 @@ var _ = Describe("API test", func() {
})
It("can generate chat completions from model configs", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
@@ -811,11 +803,11 @@ var _ = Describe("API test", func() {
})
It("returns errors", func() {
backends := len(model.AutoLoadBackends) + 1 // +1 for huggingface
_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: testPrompt})
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("error, status code: 500, message: could not load model - all backends returned error: %d errors occurred:", backends)))
Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error:"))
})
It("transcribes audio", func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
@@ -890,9 +882,9 @@ var _ = Describe("API test", func() {
Context("backends", func() {
It("runs rwkv completion", func() {
// if runtime.GOOS != "linux" {
// Skip("test supported only on linux")
// }
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices) > 0).To(BeTrue())
@@ -913,20 +905,17 @@ var _ = Describe("API test", func() {
}
Expect(err).ToNot(HaveOccurred())
if len(response.Choices) > 0 {
text += response.Choices[0].Text
tokens++
}
text += response.Choices[0].Text
tokens++
}
Expect(text).ToNot(BeEmpty())
Expect(text).To(ContainSubstring("five"))
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
})
It("runs rwkv chat completion", func() {
// if runtime.GOOS != "linux" {
// Skip("test supported only on linux")
// }
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
resp, err := client.CreateChatCompletion(context.TODO(),
openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
Expect(err).ToNot(HaveOccurred())
@@ -1035,14 +1024,14 @@ var _ = Describe("API test", func() {
c, cancel = context.WithCancel(context.Background())
var err error
application, err = startup.Startup(
bcl, ml, applicationConfig, err = startup.Startup(
append(commonOpts,
config.WithContext(c),
config.WithModelPath(modelPath),
config.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
)
Expect(err).ToNot(HaveOccurred())
app, err = App(application)
app, err = App(bcl, ml, applicationConfig)
Expect(err).ToNot(HaveOccurred())
go app.Listen("127.0.0.1:9090")
@@ -1066,33 +1055,18 @@ var _ = Describe("API test", func() {
}
})
It("can generate chat completions from config file (list1)", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
})
It("can generate chat completions from config file (list2)", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
})
It("can generate edit completions from config file", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}
request := openaigo.EditCreateRequestBody{
Model: "list2",
Instruction: "foo",

View File

@@ -1,88 +1,43 @@
package fiberContext
import (
"context"
"encoding/json"
"fmt"
"strings"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
)
type FiberContextExtractor struct {
ml *model.ModelLoader
appConfig *config.ApplicationConfig
}
func NewFiberContextExtractor(ml *model.ModelLoader, appConfig *config.ApplicationConfig) *FiberContextExtractor {
return &FiberContextExtractor{
ml: ml,
appConfig: appConfig,
}
}
// ModelFromContext returns the model from the context
// If no model is specified, it will take the first available
// Takes a model string as input which should be the one received from the user request.
// It returns the model name resolved from the context and an error if any.
func (fce *FiberContextExtractor) ModelFromContext(ctx *fiber.Ctx, modelInput string, firstModel bool) (string, error) {
ctxPM := ctx.Params("model")
if ctxPM != "" {
log.Debug().Msgf("[FCE] Overriding param modelInput %q with ctx.Params value %q", modelInput, ctxPM)
modelInput = ctxPM
func ModelFromContext(ctx *fiber.Ctx, loader *model.ModelLoader, modelInput string, firstModel bool) (string, error) {
if ctx.Params("model") != "" {
modelInput = ctx.Params("model")
}
// Set model from bearer token, if available
bearer := strings.TrimPrefix(ctx.Get("authorization"), "Bearer ")
bearerExists := bearer != "" && fce.ml.ExistsInModelPath(bearer)
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
// If no model was specified, take the first available
if modelInput == "" && !bearerExists && firstModel {
models, _ := fce.ml.ListModels()
models, _ := loader.ListModels()
if len(models) > 0 {
modelInput = models[0]
log.Debug().Msgf("[FCE] No model specified, using first available: %s", modelInput)
log.Debug().Msgf("No model specified, using: %s", modelInput)
} else {
log.Warn().Msgf("[FCE] No model specified, none available")
return "", fmt.Errorf("[fce] no model specified, none available")
log.Debug().Msgf("No model specified, returning error")
return "", fmt.Errorf("no model specified")
}
}
// If a model is found in bearer token takes precedence
if bearerExists {
log.Debug().Msgf("[FCE] Using model from bearer token: %s", bearer)
log.Debug().Msgf("Using model from bearer token: %s", bearer)
modelInput = bearer
}
if modelInput == "" {
log.Warn().Msg("[FCE] modelInput is empty")
}
return modelInput, nil
}
// TODO: Do we still need the first return value?
func (fce *FiberContextExtractor) OpenAIRequestFromContext(c *fiber.Ctx, firstModel bool) (string, *schema.OpenAIRequest, error) {
input := new(schema.OpenAIRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return "", nil, fmt.Errorf("failed parsing request body: %w", err)
}
received, _ := json.Marshal(input)
ctx, cancel := context.WithCancel(fce.appConfig.Context)
input.Context = ctx
input.Cancel = cancel
log.Debug().Msgf("Request received: %s", string(received))
var err error
input.Model, err = fce.ModelFromContext(c, input.Model, firstModel)
return input.Model, input, err
}
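An illustrative sketch of how an endpoint can use the new package-level ModelFromContext helper; the route and request shape here are made up, only the helper call matches the code above.

package example

import (
    fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
    "github.com/go-skynet/LocalAI/pkg/model"
    "github.com/gofiber/fiber/v2"
)

type exampleRequest struct {
    Model string `json:"model"`
}

func RegisterExampleRoute(app *fiber.App, ml *model.ModelLoader) {
    app.Post("/example", func(c *fiber.Ctx) error {
        body := new(exampleRequest)
        if err := c.BodyParser(body); err != nil {
            return err
        }
        // The helper prefers the :model path param over the body value, lets a bearer
        // token that matches a model on disk override both, and (with firstModel=true)
        // would fall back to the first model found in the model path.
        modelName, err := fiberContext.ModelFromContext(c, ml, body.Model, false)
        if err != nil {
            return c.Status(fiber.StatusBadRequest).SendString(err.Error())
        }
        return c.SendString("resolved model: " + modelName)
    })
}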

View File

@@ -0,0 +1,405 @@
package elements
import (
"fmt"
"strings"
"github.com/chasefleming/elem-go"
"github.com/chasefleming/elem-go/attrs"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/xsync"
)
const (
noImage = "https://upload.wikimedia.org/wikipedia/commons/6/65/No-Image-Placeholder.svg"
)
func DoneProgress(galleryID, text string, showDelete bool) string {
var modelName = galleryID
// Split by @ and grab the name
if strings.Contains(galleryID, "@") {
modelName = strings.Split(galleryID, "@")[1]
}
return elem.Div(
attrs.Props{
"id": "action-div-" + dropBadChars(galleryID),
},
elem.H3(
attrs.Props{
"role": "status",
"id": "pblabel",
"tabindex": "-1",
"autofocus": "",
},
elem.Text(text),
),
elem.If(showDelete, deleteButton(galleryID, modelName), reInstallButton(galleryID)),
).Render()
}
func ErrorProgress(err, galleryName string) string {
return elem.Div(
attrs.Props{},
elem.H3(
attrs.Props{
"role": "status",
"id": "pblabel",
"tabindex": "-1",
"autofocus": "",
},
elem.Text("Error "+err),
),
installButton(galleryName),
).Render()
}
func ProgressBar(progress string) string {
return elem.Div(attrs.Props{
"class": "progress",
"role": "progressbar",
"aria-valuemin": "0",
"aria-valuemax": "100",
"aria-valuenow": "0",
"aria-labelledby": "pblabel",
},
elem.Div(attrs.Props{
"id": "pb",
"class": "progress-bar",
"style": "width:" + progress + "%",
}),
).Render()
}
func StartProgressBar(uid, progress, text string) string {
if progress == "" {
progress = "0"
}
return elem.Div(
attrs.Props{
"hx-trigger": "done",
"hx-get": "/browse/job/" + uid,
"hx-swap": "outerHTML",
"hx-target": "this",
},
elem.H3(
attrs.Props{
"role": "status",
"id": "pblabel",
"tabindex": "-1",
"autofocus": "",
},
elem.Text(text),
elem.Div(attrs.Props{
"hx-get": "/browse/job/progress/" + uid,
"hx-trigger": "every 600ms",
"hx-target": "this",
"hx-swap": "innerHTML",
},
elem.Raw(ProgressBar(progress)),
),
),
).Render()
}
func cardSpan(text, icon string) elem.Node {
return elem.Span(
attrs.Props{
"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2",
},
elem.I(attrs.Props{
"class": icon + " pr-2",
}),
elem.Text(text),
//elem.Text(text),
)
}
func searchableElement(text, icon string) elem.Node {
return elem.Form(
attrs.Props{},
elem.Input(
attrs.Props{
"type": "hidden",
"name": "search",
"value": text,
},
),
elem.Span(
attrs.Props{
"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2",
},
elem.A(
attrs.Props{
// "name": "search",
// "value": text,
//"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2",
"href": "#!",
"hx-post": "/browse/search/models",
"hx-target": "#search-results",
// TODO: this doesn't work
// "hx-vals": `{ \"search\": \"` + text + `\" }`,
"hx-indicator": ".htmx-indicator",
},
elem.I(attrs.Props{
"class": icon + " pr-2",
}),
elem.Text(text),
),
),
//elem.Text(text),
)
}
func link(text, url string) elem.Node {
return elem.A(
attrs.Props{
"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2 hover:bg-gray-300 hover:shadow-gray-2",
"href": url,
"target": "_blank",
},
elem.I(attrs.Props{
"class": "fas fa-link pr-2",
}),
elem.Text(text),
)
}
func installButton(galleryName string) elem.Node {
return elem.Button(
attrs.Props{
"data-twe-ripple-init": "",
"data-twe-ripple-color": "light",
"class": "float-right inline-block rounded bg-primary px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong",
"hx-swap": "outerHTML",
// post the Model ID as param
"hx-post": "/browse/install/model/" + galleryName,
},
elem.I(
attrs.Props{
"class": "fa-solid fa-download pr-2",
},
),
elem.Text("Install"),
)
}
func reInstallButton(galleryName string) elem.Node {
return elem.Button(
attrs.Props{
"data-twe-ripple-init": "",
"data-twe-ripple-color": "light",
"class": "float-right inline-block rounded bg-primary ml-2 px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong",
"hx-target": "#action-div-" + dropBadChars(galleryName),
"hx-swap": "outerHTML",
// post the Model ID as param
"hx-post": "/browse/install/model/" + galleryName,
},
elem.I(
attrs.Props{
"class": "fa-solid fa-arrow-rotate-right pr-2",
},
),
elem.Text("Reinstall"),
)
}
func deleteButton(galleryID, modelName string) elem.Node {
return elem.Button(
attrs.Props{
"data-twe-ripple-init": "",
"data-twe-ripple-color": "light",
"hx-confirm": "Are you sure you wish to delete the model?",
"class": "float-right inline-block rounded bg-red-800 px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-red-accent-300 hover:shadow-red-2 focus:bg-red-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-red-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong",
"hx-target": "#action-div-" + dropBadChars(galleryID),
"hx-swap": "outerHTML",
// post the Model ID as param
"hx-post": "/browse/delete/model/" + galleryID,
},
elem.I(
attrs.Props{
"class": "fa-solid fa-cancel pr-2",
},
),
elem.Text("Delete"),
)
}
// Javascript/HTMX doesn't like weird IDs
func dropBadChars(s string) string {
return strings.ReplaceAll(s, "@", "__")
}
func ListModels(models []*gallery.GalleryModel, processing *xsync.SyncedMap[string, string], galleryService *services.GalleryService) string {
modelsElements := []elem.Node{}
descriptionDiv := func(m *gallery.GalleryModel) elem.Node {
return elem.Div(
attrs.Props{
"class": "p-6 text-surface dark:text-white",
},
elem.H5(
attrs.Props{
"class": "mb-2 text-xl font-medium leading-tight",
},
elem.Text(m.Name),
),
elem.P(
attrs.Props{
"class": "mb-4 text-base",
},
elem.Text(m.Description),
),
)
}
actionDiv := func(m *gallery.GalleryModel) elem.Node {
galleryID := fmt.Sprintf("%s@%s", m.Gallery.Name, m.Name)
currentlyProcessing := processing.Exists(galleryID)
jobID := ""
isDeletionOp := false
if currentlyProcessing {
status := galleryService.GetStatus(galleryID)
if status != nil && status.Deletion {
isDeletionOp = true
}
jobID = processing.Get(galleryID)
// TODO:
// case not handled, if status == nil : "Waiting"
}
nodes := []elem.Node{
cardSpan("Repository: "+m.Gallery.Name, "fa-brands fa-git-alt"),
}
if m.License != "" {
nodes = append(nodes,
cardSpan("License: "+m.License, "fas fa-book"),
)
}
tagsNodes := []elem.Node{}
for _, tag := range m.Tags {
tagsNodes = append(tagsNodes,
searchableElement(tag, "fas fa-tag"),
)
}
nodes = append(nodes,
elem.Div(
attrs.Props{
"class": "flex flex-row flex-wrap content-center",
},
tagsNodes...,
),
)
for i, url := range m.URLs {
nodes = append(nodes,
link("Link #"+fmt.Sprintf("%d", i+1), url),
)
}
progressMessage := "Installation"
if isDeletionOp {
progressMessage = "Deletion"
}
return elem.Div(
attrs.Props{
"class": "px-6 pt-4 pb-2",
},
elem.P(
attrs.Props{
"class": "mb-4 text-base",
},
nodes...,
),
elem.Div(
attrs.Props{
"id": "action-div-" + dropBadChars(galleryID),
},
elem.If(
currentlyProcessing,
elem.Node( // If currently installing, show progress bar
elem.Raw(StartProgressBar(jobID, "0", progressMessage)),
), // Otherwise, show install button (if not installed) or display "Installed"
elem.If(m.Installed,
elem.Node(elem.Div(
attrs.Props{},
reInstallButton(m.ID()),
deleteButton(m.ID(), m.Name),
)),
installButton(m.ID()),
),
),
),
)
}
for _, m := range models {
elems := []elem.Node{}
if m.Icon == "" {
m.Icon = noImage
}
divProperties := attrs.Props{
"class": "flex justify-center items-center",
}
elems = append(elems,
elem.Div(divProperties,
elem.A(attrs.Props{
"href": "#!",
// "class": "justify-center items-center",
},
elem.Img(attrs.Props{
// "class": "rounded-t-lg object-fit object-center h-96",
"class": "rounded-t-lg max-h-48 max-w-96 object-cover mt-3",
"src": m.Icon,
}),
),
),
)
// Special/corner case: if a model sets Trust Remote Code as required, show a warning
// TODO: handle this more generically later
_, trustRemoteCodeExists := m.Overrides["trust_remote_code"]
if trustRemoteCodeExists {
elems = append(elems, elem.Div(
attrs.Props{
"class": "flex justify-center items-center bg-red-500 text-white p-2 rounded-lg mt-2",
},
elem.I(attrs.Props{
"class": "fa-solid fa-circle-exclamation pr-2",
}),
elem.Text("Attention: Trust Remote Code is required for this model"),
))
}
elems = append(elems, descriptionDiv(m), actionDiv(m))
modelsElements = append(modelsElements,
elem.Div(
attrs.Props{
"class": " me-4 mb-2 block rounded-lg bg-white shadow-secondary-1 dark:bg-gray-800 dark:bg-surface-dark dark:text-white text-surface pb-2",
},
elem.Div(
attrs.Props{
// "class": "p-6",
},
elems...,
),
),
)
}
wrapper := elem.Div(attrs.Props{
"class": "dark grid grid-cols-1 grid-rows-1 md:grid-cols-3 block rounded-lg shadow-secondary-1 dark:bg-surface-dark",
}, modelsElements...)
return wrapper.Render()
}
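A small sketch of rendering the HTMX fragments defined above outside of a request, for example to inspect the generated markup; the elements import path is assumed from the package name.

package main

import (
    "fmt"

    "github.com/go-skynet/LocalAI/core/http/elements" // assumed import path for package elements
)

func main() {
    // A progress bar that polls /browse/job/progress/<uid> every 600ms.
    fmt.Println(elements.StartProgressBar("1234-abcd", "25", "Installing model"))

    // Terminal state for a finished install: shows a delete button because the
    // model is installed (gallery IDs use the gallery@name convention).
    fmt.Println(elements.DoneProgress("localai@example-model", "Installation completed", true))
}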

View File

@@ -2,7 +2,9 @@ package elevenlabs
import (
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/gofiber/fiber/v2"
@@ -15,7 +17,7 @@ import (
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/text-to-speech/{voice-id} [post]
func TTSEndpoint(fce *fiberContext.FiberContextExtractor, ttsbs *backend.TextToSpeechBackendService) func(c *fiber.Ctx) error {
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(schema.ElevenLabsTTSRequest)
@@ -26,21 +28,34 @@ func TTSEndpoint(fce *fiberContext.FiberContextExtractor, ttsbs *backend.TextToS
return err
}
var err error
input.ModelID, err = fce.ModelFromContext(c, input.ModelID, false)
modelFile, err := fiberContext.ModelFromContext(c, ml, input.ModelID, false)
if err != nil {
modelFile = input.ModelID
log.Warn().Msgf("Model not found in context: %s", input.ModelID)
}
responseChannel := ttsbs.TextToAudioFile(&schema.TTSRequest{
Model: input.ModelID,
Voice: voiceID,
Input: input.Text,
})
rawValue := <-responseChannel
if rawValue.Error != nil {
return rawValue.Error
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
config.LoadOptionDebug(appConfig.Debug),
config.LoadOptionThreads(appConfig.Threads),
config.LoadOptionContextSize(appConfig.ContextSize),
config.LoadOptionF16(appConfig.F16),
)
if err != nil {
modelFile = input.ModelID
log.Warn().Msgf("Model not found in context: %s", input.ModelID)
} else {
if input.ModelID != "" {
modelFile = input.ModelID
} else {
modelFile = cfg.Model
}
}
return c.Download(*rawValue.Value)
log.Debug().Msgf("Request for model: %s", modelFile)
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
if err != nil {
return err
}
return c.Download(filePath)
}
}
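For completeness, a sketch of calling the ElevenLabs-compatible route with the xi-api-key header that readAuthHeader converts into a Bearer token. The voice ID, key and model are placeholders, and the body field names (text, model_id) follow the ElevenLabs API and are an assumption here.

package main

import (
    "bytes"
    "io"
    "net/http"
    "os"
)

func main() {
    payload := bytes.NewBufferString(`{"text": "Hello world", "model_id": "voice-en-us"}`)
    req, err := http.NewRequest("POST",
        "http://localhost:8080/v1/text-to-speech/my-voice-id", payload)
    if err != nil {
        panic(err)
    }
    // Equivalent to "Authorization: Bearer my-api-key" thanks to readAuthHeader.
    req.Header.Set("xi-api-key", "my-api-key")
    req.Header.Set("Content-Type", "application/json")

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    f, err := os.Create("elevenlabs-tts.wav")
    if err != nil {
        panic(err)
    }
    defer f.Close()
    io.Copy(f, resp.Body) // the endpoint returns the generated audio via c.Download
}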

View File

@@ -0,0 +1,84 @@
package jina
import (
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
)
func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
req := new(schema.JINARerankRequest)
if err := c.BodyParser(req); err != nil {
return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{
"error": "Cannot parse JSON",
})
}
input := new(schema.TTSRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
modelFile, err := fiberContext.ModelFromContext(c, ml, input.Model, false)
if err != nil {
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
}
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
config.LoadOptionDebug(appConfig.Debug),
config.LoadOptionThreads(appConfig.Threads),
config.LoadOptionContextSize(appConfig.ContextSize),
config.LoadOptionF16(appConfig.F16),
)
if err != nil {
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
} else {
modelFile = cfg.Model
}
log.Debug().Msgf("Request for model: %s", modelFile)
if input.Backend != "" {
cfg.Backend = input.Backend
}
request := &proto.RerankRequest{
Query: req.Query,
TopN: int32(req.TopN),
Documents: req.Documents,
}
results, err := backend.Rerank(cfg.Backend, modelFile, request, ml, appConfig, *cfg)
if err != nil {
return err
}
response := &schema.JINARerankResponse{
Model: req.Model,
}
for _, r := range results.Results {
response.Results = append(response.Results, schema.JINADocumentResult{
Index: int(r.Index),
Document: schema.JINAText{Text: r.Text},
RelevanceScore: float64(r.RelevanceScore),
})
}
response.Usage.TotalTokens = int(results.Usage.TotalTokens)
response.Usage.PromptTokens = int(results.Usage.PromptTokens)
return c.Status(fiber.StatusOK).JSON(response)
}
}
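An example request against the reranker handler above. The /v1/rerank path registered by RegisterJINARoutes and the model name are assumptions; the body mirrors the fields the handler reads (query, documents, top_n).

package main

import (
    "bytes"
    "fmt"
    "io"
    "net/http"
)

func main() {
    body := bytes.NewBufferString(`{
        "model": "jina-reranker-v1-base-en",
        "query": "Organic skincare products for sensitive skin",
        "documents": ["Eco-friendly kitchenware", "Organic skincare for sensitive skin"],
        "top_n": 2
    }`)
    resp, err := http.Post("http://localhost:8080/v1/rerank", "application/json", body)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    out, _ := io.ReadAll(resp.Body)
    // Prints a JINARerankResponse: ranked results plus prompt/total token usage.
    fmt.Println(string(out))
}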

View File

@@ -61,11 +61,11 @@ func (mgs *ModelGalleryEndpointService) ApplyModelGalleryEndpoint() func(c *fibe
return err
}
mgs.galleryApplier.C <- gallery.GalleryOp{
Req: input.GalleryModel,
Id: uuid.String(),
GalleryName: input.ID,
Galleries: mgs.galleries,
ConfigURL: input.ConfigURL,
Req: input.GalleryModel,
Id: uuid.String(),
GalleryModelName: input.ID,
Galleries: mgs.galleries,
ConfigURL: input.ConfigURL,
}
return c.JSON(struct {
ID string `json:"uuid"`
@@ -74,6 +74,27 @@ func (mgs *ModelGalleryEndpointService) ApplyModelGalleryEndpoint() func(c *fibe
}
}
func (mgs *ModelGalleryEndpointService) DeleteModelGalleryEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
modelName := c.Params("name")
mgs.galleryApplier.C <- gallery.GalleryOp{
Delete: true,
GalleryModelName: modelName,
}
uuid, err := uuid.NewUUID()
if err != nil {
return err
}
return c.JSON(struct {
ID string `json:"uuid"`
StatusURL string `json:"status"`
}{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
}
}
func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)

View File

@@ -2,7 +2,9 @@ package localai
import (
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/gofiber/fiber/v2"
@@ -14,26 +16,45 @@ import (
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/audio/speech [post]
func TTSEndpoint(fce *fiberContext.FiberContextExtractor, ttsbs *backend.TextToSpeechBackendService) func(c *fiber.Ctx) error {
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
var err error
input := new(schema.TTSRequest)
// Get input data from the request body
if err = c.BodyParser(input); err != nil {
if err := c.BodyParser(input); err != nil {
return err
}
input.Model, err = fce.ModelFromContext(c, input.Model, false)
modelFile, err := fiberContext.ModelFromContext(c, ml, input.Model, false)
if err != nil {
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
}
responseChannel := ttsbs.TextToAudioFile(input)
rawValue := <-responseChannel
if rawValue.Error != nil {
return rawValue.Error
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
config.LoadOptionDebug(appConfig.Debug),
config.LoadOptionThreads(appConfig.Threads),
config.LoadOptionContextSize(appConfig.ContextSize),
config.LoadOptionF16(appConfig.F16),
)
if err != nil {
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
} else {
modelFile = cfg.Model
}
return c.Download(*rawValue.Value)
log.Debug().Msgf("Request for model: %s", modelFile)
if input.Backend != "" {
cfg.Backend = input.Backend
}
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
if err != nil {
return err
}
return c.Download(filePath)
}
}

View File

@@ -0,0 +1,49 @@
package localai
import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/internal"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
)
func WelcomeEndpoint(appConfig *config.ApplicationConfig,
cl *config.BackendConfigLoader, ml *model.ModelLoader, modelStatus func() (map[string]string, map[string]string)) func(*fiber.Ctx) error {
return func(c *fiber.Ctx) error {
models, _ := ml.ListModels()
backendConfigs := cl.GetAllBackendConfigs()
galleryConfigs := map[string]*gallery.Config{}
for _, m := range backendConfigs {
cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
if err != nil {
continue
}
galleryConfigs[m.Name] = cfg
}
// Get model statuses to display in the UI the operation in progress
processingModels, taskTypes := modelStatus()
summary := fiber.Map{
"Title": "LocalAI API - " + internal.PrintableVersion(),
"Version": internal.PrintableVersion(),
"Models": models,
"ModelsConfig": backendConfigs,
"GalleryConfig": galleryConfigs,
"ApplicationConfig": appConfig,
"ProcessingModels": processingModels,
"TaskTypes": taskTypes,
}
if string(c.Context().Request.Header.ContentType()) == "application/json" || len(c.Accepts("html")) == 0 {
// The client expects a JSON response
return c.Status(fiber.StatusOK).JSON(summary)
} else {
// Render index
return c.Render("views/index", summary)
}
}
}
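A quick sketch of the content negotiation in WelcomeEndpoint: the same route renders the HTML dashboard for browsers and returns the JSON summary when the client does not accept HTML. The address is a placeholder for a running instance.

package main

import (
    "fmt"
    "io"
    "net/http"
)

func fetchRoot(accept string) {
    req, err := http.NewRequest("GET", "http://localhost:8080/", nil)
    if err != nil {
        panic(err)
    }
    req.Header.Set("Accept", accept)

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    io.Copy(io.Discard, resp.Body)

    fmt.Println(accept, "->", resp.Header.Get("Content-Type"))
}

func main() {
    fetchRoot("text/html")        // rendered views/index template
    fetchRoot("application/json") // len(c.Accepts("html")) == 0, so the JSON summary is returned
}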

View File

@@ -339,7 +339,7 @@ func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
}
}
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistantID %q", assistantID))
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find "))
}
}
@@ -455,21 +455,19 @@ func DeleteAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
for i, assistant := range Assistants {
if assistant.ID == assistantID {
for j, fileId := range assistant.FileIDs {
if fileId == fileId {
Assistants[i].FileIDs = append(Assistants[i].FileIDs[:j], Assistants[i].FileIDs[j+1:]...)
Assistants[i].FileIDs = append(Assistants[i].FileIDs[:j], Assistants[i].FileIDs[j+1:]...)
// Check if the file exists in the assistantFiles slice
for i, assistantFile := range AssistantFiles {
if assistantFile.ID == fileId {
// Remove the file from the assistantFiles slice
AssistantFiles = append(AssistantFiles[:i], AssistantFiles[i+1:]...)
utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
return c.Status(fiber.StatusOK).JSON(DeleteAssistantFileResponse{
ID: fileId,
Object: "assistant.file.deleted",
Deleted: true,
})
}
// Check if the file exists in the assistantFiles slice
for i, assistantFile := range AssistantFiles {
if assistantFile.ID == fileId {
// Remove the file from the assistantFiles slice
AssistantFiles = append(AssistantFiles[:i], AssistantFiles[i+1:]...)
utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
return c.Status(fiber.StatusOK).JSON(DeleteAssistantFileResponse{
ID: fileId,
Object: "assistant.file.deleted",
Deleted: true,
})
}
}
}

View File

@@ -3,10 +3,6 @@ package openai
import (
"encoding/json"
"fmt"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/stretchr/testify/assert"
"io"
"io/ioutil"
"net/http"
@@ -16,6 +12,11 @@ import (
"strings"
"testing"
"time"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/stretchr/testify/assert"
)
var configsDir string = "/tmp/localai/configs"
@@ -49,8 +50,8 @@ func TestAssistantEndpoints(t *testing.T) {
}
_ = os.RemoveAll(appConfig.ConfigsDir)
_ = os.MkdirAll(appConfig.ConfigsDir, 0755)
_ = os.MkdirAll(modelPath, 0755)
_ = os.MkdirAll(appConfig.ConfigsDir, 0750)
_ = os.MkdirAll(modelPath, 0750)
os.Create(filepath.Join(modelPath, "ggml-gpt4all-j"))
app := fiber.New(fiber.Config{

View File

@@ -5,11 +5,16 @@ import (
"bytes"
"encoding/json"
"fmt"
"strings"
"time"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/pkg/functions"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
"github.com/rs/zerolog/log"
"github.com/valyala/fasthttp"
)
@@ -19,82 +24,418 @@ import (
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/chat/completions [post]
func ChatEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.OpenAIService) func(c *fiber.Ctx) error {
func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
emptyMessage := ""
id := uuid.New().String()
created := int(time.Now().Unix())
process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
initialMessage := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
Object: "chat.completion.chunk",
}
responses <- initialMessage
ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
resp := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
Object: "chat.completion.chunk",
Usage: schema.OpenAIUsage{
PromptTokens: usage.Prompt,
CompletionTokens: usage.Completion,
TotalTokens: usage.Prompt + usage.Completion,
},
}
responses <- resp
return true
})
close(responses)
}
processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
result := ""
_, tokenUsage, _ := ComputeChoices(req, prompt, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
result += s
// TODO: Change generated BNF grammar to be compliant with the schema so we can
// stream the result token by token here.
return true
})
results := functions.ParseFunctionCall(result, config.FunctionsConfig)
noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0
switch {
case noActionToRun:
initialMessage := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
Object: "chat.completion.chunk",
}
responses <- initialMessage
result, err := handleQuestion(config, req, ml, startupOptions, results, prompt)
if err != nil {
log.Error().Err(err).Msg("error handling question")
return
}
resp := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Delta: &schema.Message{Content: &result}, Index: 0}},
Object: "chat.completion.chunk",
Usage: schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,
CompletionTokens: tokenUsage.Completion,
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
},
}
responses <- resp
default:
for i, ss := range results {
name, args := ss.Name, ss.Arguments
initialMessage := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{
Delta: &schema.Message{
Role: "assistant",
ToolCalls: []schema.ToolCall{
{
Index: i,
ID: id,
Type: "function",
FunctionCall: schema.FunctionCall{
Name: name,
},
},
},
}}},
Object: "chat.completion.chunk",
}
responses <- initialMessage
responses <- schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{
Delta: &schema.Message{
Role: "assistant",
ToolCalls: []schema.ToolCall{
{
Index: i,
ID: id,
Type: "function",
FunctionCall: schema.FunctionCall{
Arguments: args,
},
},
},
}}},
Object: "chat.completion.chunk",
}
}
}
close(responses)
}
return func(c *fiber.Ctx) error {
_, request, err := fce.OpenAIRequestFromContext(c, false)
modelFile, input, err := readRequest(c, ml, startupOptions, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request: %w", err)
return fmt.Errorf("failed reading parameters from request:%w", err)
}
traceID, finalResultChannel, _, tokenChannel, err := oais.Chat(request, false, request.Stream)
config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, startupOptions.Debug, startupOptions.Threads, startupOptions.ContextSize, startupOptions.F16)
if err != nil {
return err
return fmt.Errorf("failed reading parameters from request:%w", err)
}
log.Debug().Msgf("Configuration read: %+v", config)
funcs := input.Functions
shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
// Allow the user to set custom actions via config file
// to be "embedded" in each model
noActionName := "answer"
noActionDescription := "use this action to answer without performing any action"
if config.FunctionsConfig.NoActionFunctionName != "" {
noActionName = config.FunctionsConfig.NoActionFunctionName
}
if config.FunctionsConfig.NoActionDescriptionName != "" {
noActionDescription = config.FunctionsConfig.NoActionDescriptionName
}
if request.Stream {
if input.ResponseFormat.Type == "json_object" {
input.Grammar = functions.JSONBNF
}
log.Debug().Msgf("Chat Stream request received")
config.Grammar = input.Grammar
if shouldUseFn {
log.Debug().Msgf("Response needs to process functions")
}
switch {
case !config.FunctionsConfig.NoGrammar && shouldUseFn:
noActionGrammar := functions.Function{
Name: noActionName,
Description: noActionDescription,
Parameters: map[string]interface{}{
"properties": map[string]interface{}{
"message": map[string]interface{}{
"type": "string",
"description": "The message to reply the user with",
}},
},
}
// Append the no action function
if !config.FunctionsConfig.DisableNoAction {
funcs = append(funcs, noActionGrammar)
}
// Force picking one of the functions by the request
if config.FunctionToCall() != "" {
funcs = funcs.Select(config.FunctionToCall())
}
// Update input grammar
jsStruct := funcs.ToJSONStructure()
config.Grammar = jsStruct.Grammar("", config.FunctionsConfig.ParallelCalls)
case input.JSONFunctionGrammarObject != nil:
config.Grammar = input.JSONFunctionGrammarObject.Grammar("", config.FunctionsConfig.ParallelCalls)
default:
// Force picking one of the functions by the request
if config.FunctionToCall() != "" {
funcs = funcs.Select(config.FunctionToCall())
}
}
// process functions if we have any defined or if we have a function call string
// functions are not supported in stream mode (yet?)
toStream := input.Stream
log.Debug().Msgf("Parameters: %+v", config)
var predInput string
// If we are using the tokenizer template, we don't need to process the messages
// unless we are processing functions
if !config.TemplateConfig.UseTokenizerTemplate || shouldUseFn {
suppressConfigSystemPrompt := false
mess := []string{}
for messageIndex, i := range input.Messages {
var content string
role := i.Role
// If this is a function call, we may want to customize the role so we can better display that the assistant performed a JSON action.
// If an "assistant_function_call" role is defined, we use it; otherwise we use the role passed in the request.
if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" {
roleFn := "assistant_function_call"
r := config.Roles[roleFn]
if r != "" {
role = roleFn
}
}
r := config.Roles[role]
contentExists := i.Content != nil && i.StringContent != ""
fcall := i.FunctionCall
if len(i.ToolCalls) > 0 {
fcall = i.ToolCalls
}
// First attempt to populate content via a chat message specific template
if config.TemplateConfig.ChatMessage != "" {
chatMessageData := model.ChatMessageTemplateData{
SystemPrompt: config.SystemPrompt,
Role: r,
RoleName: role,
Content: i.StringContent,
FunctionCall: fcall,
FunctionName: i.Name,
LastMessage: messageIndex == (len(input.Messages) - 1),
Function: config.Grammar != "" && (messageIndex == (len(input.Messages) - 1)),
MessageIndex: messageIndex,
}
templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
if err != nil {
log.Error().Err(err).Interface("message", chatMessageData).Str("template", config.TemplateConfig.ChatMessage).Msg("error processing message with template, skipping")
} else {
if templatedChatMessage == "" {
log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
}
log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
content = templatedChatMessage
}
}
marshalAnyRole := func(f any) {
j, err := json.Marshal(f)
if err == nil {
if contentExists {
content += "\n" + fmt.Sprint(r, " ", string(j))
} else {
content = fmt.Sprint(r, " ", string(j))
}
}
}
marshalAny := func(f any) {
j, err := json.Marshal(f)
if err == nil {
if contentExists {
content += "\n" + string(j)
} else {
content = string(j)
}
}
}
// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
if content == "" {
if r != "" {
if contentExists {
content = fmt.Sprint(r, i.StringContent)
}
if i.FunctionCall != nil {
marshalAnyRole(i.FunctionCall)
}
if i.ToolCalls != nil {
marshalAnyRole(i.ToolCalls)
}
} else {
if contentExists {
content = fmt.Sprint(i.StringContent)
}
if i.FunctionCall != nil {
marshalAny(i.FunctionCall)
}
if i.ToolCalls != nil {
marshalAny(i.ToolCalls)
}
}
// Special Handling: System. We care if it was printed at all, not the r branch, so check separately
if contentExists && role == "system" {
suppressConfigSystemPrompt = true
}
}
mess = append(mess, content)
}
predInput = strings.Join(mess, "\n")
log.Debug().Msgf("Prompt (before templating): %s", predInput)
templateFile := ""
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
templateFile = config.Model
}
if config.TemplateConfig.Chat != "" && !shouldUseFn {
templateFile = config.TemplateConfig.Chat
}
if config.TemplateConfig.Functions != "" && shouldUseFn {
templateFile = config.TemplateConfig.Functions
}
if templateFile != "" {
templatedInput, err := ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
SystemPrompt: config.SystemPrompt,
SuppressSystemPrompt: suppressConfigSystemPrompt,
Input: predInput,
Functions: funcs,
})
if err == nil {
predInput = templatedInput
log.Debug().Msgf("Template found, input modified to: %s", predInput)
} else {
log.Debug().Msgf("Template failed loading: %s", err.Error())
}
}
log.Debug().Msgf("Prompt (after templating): %s", predInput)
if shouldUseFn && config.Grammar != "" {
log.Debug().Msgf("Grammar: %+v", config.Grammar)
}
}
switch {
case toStream:
log.Debug().Msgf("Stream request received")
c.Context().SetContentType("text/event-stream")
//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
//
// c.Set("Content-Type", "text/event-stream")
c.Set("Cache-Control", "no-cache")
c.Set("Connection", "keep-alive")
c.Set("Transfer-Encoding", "chunked")
responses := make(chan schema.OpenAIResponse)
if !shouldUseFn {
go process(predInput, input, config, ml, responses)
} else {
go processTools(noActionName, predInput, input, config, ml, responses)
}
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
usage := &schema.OpenAIUsage{}
toolsCalled := false
for ev := range tokenChannel {
if ev.Error != nil {
log.Debug().Err(ev.Error).Msg("chat streaming responseChannel error")
request.Cancel()
break
}
usage = &ev.Value.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
if len(ev.Value.Choices[0].Delta.ToolCalls) > 0 {
for ev := range responses {
usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
if len(ev.Choices[0].Delta.ToolCalls) > 0 {
toolsCalled = true
}
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
if ev.Error != nil {
log.Debug().Err(ev.Error).Msg("[ChatEndpoint] error to debug during tokenChannel handler")
enc.Encode(ev.Error)
} else {
enc.Encode(ev.Value)
}
log.Debug().Msgf("chat streaming sending chunk: %s", buf.String())
enc.Encode(ev)
log.Debug().Msgf("Sending chunk: %s", buf.String())
_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
if err != nil {
log.Debug().Err(err).Msgf("Sending chunk failed")
request.Cancel()
break
}
err = w.Flush()
if err != nil {
log.Debug().Msg("error while flushing, closing connection")
request.Cancel()
log.Debug().Msgf("Sending chunk failed: %v", err)
input.Cancel()
break
}
w.Flush()
}
finishReason := "stop"
if toolsCalled {
finishReason = "tool_calls"
} else if toolsCalled && len(request.Tools) == 0 {
} else if toolsCalled && len(input.Tools) == 0 {
finishReason = "function_call"
}
resp := &schema.OpenAIResponse{
ID: traceID.ID,
Created: traceID.Created,
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
ID: id,
Created: created,
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{
{
FinishReason: finishReason,
Index: 0,
Delta: &schema.Message{Content: ""},
Delta: &schema.Message{Content: &emptyMessage},
}},
Object: "chat.completion.chunk",
Usage: *usage,
@@ -105,21 +446,146 @@ func ChatEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.OpenAI
w.WriteString("data: [DONE]\n\n")
w.Flush()
}))
return nil
// no streaming mode
default:
result, tokenUsage, err := ComputeChoices(input, predInput, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
if !shouldUseFn {
// no function is called, just reply and use stop as finish reason
*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
return
}
results := functions.ParseFunctionCall(s, config.FunctionsConfig)
noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0
switch {
case noActionsToRun:
result, err := handleQuestion(config, input, ml, startupOptions, results, predInput)
if err != nil {
log.Error().Err(err).Msg("error handling question")
return
}
*c = append(*c, schema.Choice{
Message: &schema.Message{Role: "assistant", Content: &result}})
default:
toolChoice := schema.Choice{
Message: &schema.Message{
Role: "assistant",
},
}
if len(input.Tools) > 0 {
toolChoice.FinishReason = "tool_calls"
}
for _, ss := range results {
name, args := ss.Name, ss.Arguments
if len(input.Tools) > 0 {
// If we are using tools, we condense the function calls into
// a single response choice with all the tools
toolChoice.Message.ToolCalls = append(toolChoice.Message.ToolCalls,
schema.ToolCall{
ID: id,
Type: "function",
FunctionCall: schema.FunctionCall{
Name: name,
Arguments: args,
},
},
)
} else {
// otherwise we return more choices directly
*c = append(*c, schema.Choice{
FinishReason: "function_call",
Message: &schema.Message{
Role: "assistant",
FunctionCall: map[string]interface{}{
"name": name,
"arguments": args,
},
},
})
}
}
if len(input.Tools) > 0 {
// we need to append our result if we are using tools
*c = append(*c, toolChoice)
}
}
}, nil)
if err != nil {
return err
}
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "chat.completion",
Usage: schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,
CompletionTokens: tokenUsage.Completion,
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
},
}
respData, _ := json.Marshal(resp)
log.Debug().Msgf("Response: %s", respData)
// Return the prediction in the response body
return c.JSON(resp)
}
// TODO is this proper to have exclusive from Stream, or do we need to issue both responses?
rawResponse := <-finalResultChannel
if rawResponse.Error != nil {
return rawResponse.Error
}
jsonResult, _ := json.Marshal(rawResponse.Value)
log.Debug().Str("jsonResult", string(jsonResult)).Msg("Chat Final Response")
// Return the prediction in the response body
return c.JSON(rawResponse.Value)
}
}
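For reference, a minimal client-side sketch (not part of this diff) of consuming the stream produced by the chat endpoint above: each chunk arrives as a `data: {...}` line and the stream ends with `data: [DONE]`. The base URL and model name below are placeholders.

package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	body := strings.NewReader(`{"model":"my-model","stream":true,"messages":[{"role":"user","content":"Hi"}]}`)
	resp, err := http.Post("http://localhost:8080/v1/chat/completions", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if !strings.HasPrefix(line, "data: ") {
			continue // skip blank keep-alive lines
		}
		payload := strings.TrimPrefix(line, "data: ")
		if payload == "[DONE]" {
			break // server signals end of stream
		}
		fmt.Println("chunk:", payload) // each payload is an OpenAIResponse chunk encoded as JSON
	}
}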
func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, funcResults []functions.FuncCallResults, prompt string) (string, error) {
log.Debug().Msgf("nothing to do, computing a reply")
arg := ""
if len(funcResults) > 0 {
arg = funcResults[0].Arguments
}
// If there is a message that the LLM already sends as part of the JSON reply, use it
arguments := map[string]interface{}{}
if err := json.Unmarshal([]byte(arg), &arguments); err != nil {
log.Debug().Msg("handleQuestion: function result did not contain a valid JSON object")
}
m, exists := arguments["message"]
if exists {
switch message := m.(type) {
case string:
if message != "" {
log.Debug().Msgf("Reply received from LLM: %s", message)
message = backend.Finetune(*config, prompt, message)
log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
return message, nil
}
}
}
log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
// Otherwise ask the LLM to understand the JSON output and the context, and return a message
// Note: This costs (in terms of CPU/GPU) another computation
config.Grammar = ""
images := []string{}
for _, m := range input.Messages {
images = append(images, m.StringImages...)
}
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
if err != nil {
log.Error().Err(err).Msg("model inference failed")
return "", err
}
prediction, err := predFunc()
if err != nil {
log.Error().Err(err).Msg("prediction failed")
return "", err
}
return backend.Finetune(*config, prompt, prediction.Response), nil
}
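A small illustration (assumed example data, not from the diff) of the "message" short-circuit in handleQuestion: when the no-action function call carries a JSON object with a message field, that text is returned directly instead of running a second inference.

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Hypothetical arguments string returned by the model for the no-action call.
	arg := `{"message": "The capital of France is Paris."}`

	arguments := map[string]interface{}{}
	if err := json.Unmarshal([]byte(arg), &arguments); err != nil {
		fmt.Println("not a JSON object, would fall back to a second inference")
		return
	}
	if m, ok := arguments["message"].(string); ok && m != "" {
		fmt.Println("reply taken directly from the function arguments:", m)
		return
	}
	fmt.Println("no usable message, would fall back to a second inference")
}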


@@ -4,13 +4,18 @@ import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"time"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/functions"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
"github.com/rs/zerolog/log"
"github.com/valyala/fasthttp"
)
@@ -20,50 +25,116 @@ import (
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/completions [post]
func CompletionEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.OpenAIService) func(c *fiber.Ctx) error {
func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
id := uuid.New().String()
created := int(time.Now().Unix())
process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
ComputeChoices(req, s, config, appConfig, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
resp := schema.OpenAIResponse{
ID: id,
Created: created,
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{
{
Index: 0,
Text: s,
},
},
Object: "text_completion",
Usage: schema.OpenAIUsage{
PromptTokens: usage.Prompt,
CompletionTokens: usage.Completion,
TotalTokens: usage.Prompt + usage.Completion,
},
}
log.Debug().Msgf("Sending goroutine: %s", s)
responses <- resp
return true
})
close(responses)
}
return func(c *fiber.Ctx) error {
_, request, err := fce.OpenAIRequestFromContext(c, false)
modelFile, input, err := readRequest(c, ml, appConfig, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
log.Debug().Msgf("`OpenAIRequest`: %+v", request)
log.Debug().Msgf("`input`: %+v", input)
traceID, finalResultChannel, _, _, tokenChannel, err := oais.Completion(request, false, request.Stream)
config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
if err != nil {
return err
return fmt.Errorf("failed reading parameters from request:%w", err)
}
if request.Stream {
log.Debug().Msgf("Completion Stream request received")
if input.ResponseFormat.Type == "json_object" {
input.Grammar = functions.JSONBNF
}
config.Grammar = input.Grammar
log.Debug().Msgf("Parameter Config: %+v", config)
if input.Stream {
log.Debug().Msgf("Stream request received")
c.Context().SetContentType("text/event-stream")
//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
//c.Set("Content-Type", "text/event-stream")
c.Set("Cache-Control", "no-cache")
c.Set("Connection", "keep-alive")
c.Set("Transfer-Encoding", "chunked")
}
templateFile := ""
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
templateFile = config.Model
}
if config.TemplateConfig.Completion != "" {
templateFile = config.TemplateConfig.Completion
}
if input.Stream {
if len(config.PromptStrings) > 1 {
return errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
}
predInput := config.PromptStrings[0]
if templateFile != "" {
templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
Input: predInput,
})
if err == nil {
predInput = templatedInput
log.Debug().Msgf("Template found, input modified to: %s", predInput)
}
}
responses := make(chan schema.OpenAIResponse)
go process(predInput, input, config, ml, responses)
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
for ev := range tokenChannel {
for ev := range responses {
var buf bytes.Buffer
enc := json.NewEncoder(&buf)
if ev.Error != nil {
log.Debug().Msgf("[CompletionEndpoint] error to debug during tokenChannel handler: %q", ev.Error)
enc.Encode(ev.Error)
} else {
enc.Encode(ev.Value)
}
enc.Encode(ev)
log.Debug().Msgf("completion streaming sending chunk: %s", buf.String())
log.Debug().Msgf("Sending chunk: %s", buf.String())
fmt.Fprintf(w, "data: %v\n", buf.String())
w.Flush()
}
resp := &schema.OpenAIResponse{
ID: traceID.ID,
Created: traceID.Created,
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
ID: id,
Created: created,
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{
{
Index: 0,
@@ -80,15 +151,55 @@ func CompletionEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.
}))
return nil
}
// TODO is this proper to have exclusive from Stream, or do we need to issue both responses?
rawResponse := <-finalResultChannel
if rawResponse.Error != nil {
return rawResponse.Error
var result []schema.Choice
totalTokenUsage := backend.TokenUsage{}
for k, i := range config.PromptStrings {
if templateFile != "" {
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
SystemPrompt: config.SystemPrompt,
Input: i,
})
if err == nil {
i = templatedInput
log.Debug().Msgf("Template found, input modified to: %s", i)
}
}
r, tokenUsage, err := ComputeChoices(
input, i, config, appConfig, ml, func(s string, c *[]schema.Choice) {
*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
}, nil)
if err != nil {
return err
}
totalTokenUsage.Prompt += tokenUsage.Prompt
totalTokenUsage.Completion += tokenUsage.Completion
result = append(result, r...)
}
jsonResult, _ := json.Marshal(rawResponse.Value)
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "text_completion",
Usage: schema.OpenAIUsage{
PromptTokens: totalTokenUsage.Prompt,
CompletionTokens: totalTokenUsage.Completion,
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
},
}
jsonResult, _ := json.Marshal(resp)
log.Debug().Msgf("Response: %s", jsonResult)
// Return the prediction in the response body
return c.JSON(rawResponse.Value)
return c.JSON(resp)
}
}
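In the non-streaming branch above, every entry of PromptStrings is templated and completed separately, each producing a Choice with its own index while token usage is summed across prompts. A hedged request sketch (endpoint and model name are placeholders):

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// "prompt" may be a single string or an array; an array yields one Choice per prompt.
	body := strings.NewReader(`{"model":"my-model","prompt":["Once upon a time","The quick brown fox"]}`)
	resp, err := http.Post("http://localhost:8080/v1/completions", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // choices[0].index == 0, choices[1].index == 1, usage holds the totals
}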


@@ -3,36 +3,92 @@ package openai
import (
"encoding/json"
"fmt"
"time"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
"github.com/rs/zerolog/log"
)
func EditEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.OpenAIService) func(c *fiber.Ctx) error {
func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
_, request, err := fce.OpenAIRequestFromContext(c, false)
modelFile, input, err := readRequest(c, ml, appConfig, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
_, finalResultChannel, _, _, _, err := oais.Edit(request, false, request.Stream)
config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
if err != nil {
return err
return fmt.Errorf("failed reading parameters from request:%w", err)
}
rawResponse := <-finalResultChannel
if rawResponse.Error != nil {
return rawResponse.Error
log.Debug().Msgf("Parameter Config: %+v", config)
templateFile := ""
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
templateFile = config.Model
}
jsonResult, _ := json.Marshal(rawResponse.Value)
if config.TemplateConfig.Edit != "" {
templateFile = config.TemplateConfig.Edit
}
var result []schema.Choice
totalTokenUsage := backend.TokenUsage{}
for _, i := range config.InputStrings {
if templateFile != "" {
templatedInput, err := ml.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
Input: i,
Instruction: input.Instruction,
SystemPrompt: config.SystemPrompt,
})
if err == nil {
i = templatedInput
log.Debug().Msgf("Template found, input modified to: %s", i)
}
}
r, tokenUsage, err := ComputeChoices(input, i, config, appConfig, ml, func(s string, c *[]schema.Choice) {
*c = append(*c, schema.Choice{Text: s})
}, nil)
if err != nil {
return err
}
totalTokenUsage.Prompt += tokenUsage.Prompt
totalTokenUsage.Completion += tokenUsage.Completion
result = append(result, r...)
}
id := uuid.New().String()
created := int(time.Now().Unix())
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "edit",
Usage: schema.OpenAIUsage{
PromptTokens: totalTokenUsage.Prompt,
CompletionTokens: totalTokenUsage.Completion,
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
},
}
jsonResult, _ := json.Marshal(resp)
log.Debug().Msgf("Response: %s", jsonResult)
// Return the prediction in the response body
return c.JSON(rawResponse.Value)
return c.JSON(resp)
}
}
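EditEndpoint fills model.EditPromptTemplate with the input string, the instruction and the system prompt. A sketch of what such a "<model>.tmpl" template could look like; the layout is illustrative only, the templates shipped with a given model may differ.

package main

import (
	"os"
	"text/template"
)

// PromptTemplateData mirrors the fields used above; redefined locally for this sketch.
type PromptTemplateData struct {
	SystemPrompt string
	Instruction  string
	Input        string
}

func main() {
	// Hypothetical edit template.
	const tmpl = `{{.SystemPrompt}}
### Instruction:
{{.Instruction}}
### Input:
{{.Input}}
### Response:`

	t := template.Must(template.New("edit").Parse(tmpl))
	_ = t.Execute(os.Stdout, PromptTemplateData{
		SystemPrompt: "You are a helpful assistant.",
		Instruction:  "Fix the grammar.",
		Input:        "she go to school yesterday",
	})
}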


@@ -3,9 +3,14 @@ package openai
import (
"encoding/json"
"fmt"
"time"
"github.com/go-skynet/LocalAI/core/backend"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/google/uuid"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
@@ -16,25 +21,63 @@ import (
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/embeddings [post]
func EmbeddingsEndpoint(fce *fiberContext.FiberContextExtractor, ebs *backend.EmbeddingsBackendService) func(c *fiber.Ctx) error {
func EmbeddingsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
_, input, err := fce.OpenAIRequestFromContext(c, true)
model, input, err := readRequest(c, ml, appConfig, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
responseChannel := ebs.Embeddings(input)
rawResponse := <-responseChannel
if rawResponse.Error != nil {
return rawResponse.Error
config, input, err := mergeRequestWithConfig(model, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
jsonResult, _ := json.Marshal(rawResponse.Value)
log.Debug().Msgf("Parameter Config: %+v", config)
items := []schema.Item{}
for i, s := range config.InputToken {
// get the model function to call for the result
embedFn, err := backend.ModelEmbedding("", s, ml, *config, appConfig)
if err != nil {
return err
}
embeddings, err := embedFn()
if err != nil {
return err
}
items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
}
for i, s := range config.InputStrings {
// get the model function to call for the result
embedFn, err := backend.ModelEmbedding(s, []int{}, ml, *config, appConfig)
if err != nil {
return err
}
embeddings, err := embedFn()
if err != nil {
return err
}
items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
}
id := uuid.New().String()
created := int(time.Now().Unix())
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Data: items,
Object: "list",
}
jsonResult, _ := json.Marshal(resp)
log.Debug().Msgf("Response: %s", jsonResult)
// Return the prediction in the response body
return c.JSON(rawResponse.Value)
return c.JSON(resp)
}
}
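The embeddings endpoint accepts "input" either as strings or as pre-tokenized integer arrays, and returns one Item per input wrapped in an OpenAI-style list object. A hedged client sketch (URL and model name are placeholders):

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// Two string inputs -> two items with index 0 and 1 in the "data" list.
	body := strings.NewReader(`{"model":"my-embedding-model","input":["first sentence","second sentence"]}`)
	resp, err := http.Post("http://localhost:8080/v1/embeddings", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // {"object":"list","data":[{"object":"embedding","index":0,...},...]}
}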


@@ -251,7 +251,7 @@ func newMultipartFile(filePath, tag, purpose string) (*strings.Reader, *multipar
// Helper to create test files
func createTestFile(t *testing.T, name string, sizeMB int, option *config.ApplicationConfig) *os.File {
err := os.MkdirAll(option.UploadDir, 0755)
err := os.MkdirAll(option.UploadDir, 0750)
if err != nil {
t.Fatalf("Error MKDIR: %v", err)


@@ -1,18 +1,50 @@
package openai
import (
"bufio"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/google/uuid"
"github.com/go-skynet/LocalAI/core/backend"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
)
// https://platform.openai.com/docs/api-reference/images/create
func downloadFile(url string) (string, error) {
// Get the data
resp, err := http.Get(url)
if err != nil {
return "", err
}
defer resp.Body.Close()
// Create the file
out, err := os.CreateTemp("", "image")
if err != nil {
return "", err
}
defer out.Close()
// Write the body to file
_, err = io.Copy(out, resp.Body)
return out.Name(), err
}
//
/*
*
@@ -27,36 +59,186 @@ import (
*
*/
// ImageEndpoint is the OpenAI Image generation API endpoint https://platform.openai.com/docs/api-reference/images/create
// @Summary Creates an image given a prompt.
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/images/generations [post]
func ImageEndpoint(fce *fiberContext.FiberContextExtractor, igbs *backend.ImageGenerationBackendService) func(c *fiber.Ctx) error {
func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
// TODO: Somewhat a hack. Is there a better place to assign this?
if igbs.BaseUrlForGeneratedImages == "" {
igbs.BaseUrlForGeneratedImages = c.BaseURL() + "/generated-images/"
}
_, request, err := fce.OpenAIRequestFromContext(c, false)
m, input, err := readRequest(c, ml, appConfig, false)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
responseChannel := igbs.GenerateImage(request)
rawResponse := <-responseChannel
if rawResponse.Error != nil {
return rawResponse.Error
if m == "" {
m = model.StableDiffusionBackend
}
log.Debug().Msgf("Loading model: %+v", m)
jsonResult, err := json.Marshal(rawResponse.Value)
config, input, err := mergeRequestWithConfig(m, input, cl, ml, appConfig.Debug, 0, 0, false)
if err != nil {
return err
return fmt.Errorf("failed reading parameters from request:%w", err)
}
src := ""
if input.File != "" {
fileData := []byte{}
// check if input.File is a URL; if so, download it and save it
// to a temporary file
if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") {
out, err := downloadFile(input.File)
if err != nil {
return fmt.Errorf("failed downloading file:%w", err)
}
defer os.RemoveAll(out)
fileData, err = os.ReadFile(out)
if err != nil {
return fmt.Errorf("failed reading file:%w", err)
}
} else {
// base 64 decode the file and write it somewhere
// that we will cleanup
fileData, err = base64.StdEncoding.DecodeString(input.File)
if err != nil {
return err
}
}
// Create a temporary file
outputFile, err := os.CreateTemp(appConfig.ImageDir, "b64")
if err != nil {
return err
}
// write the base64 result
writer := bufio.NewWriter(outputFile)
_, err = writer.Write(fileData)
if err != nil {
outputFile.Close()
return err
}
outputFile.Close()
src = outputFile.Name()
defer os.RemoveAll(src)
}
log.Debug().Msgf("Parameter Config: %+v", config)
switch config.Backend {
case "stablediffusion":
config.Backend = model.StableDiffusionBackend
case "tinydream":
config.Backend = model.TinyDreamBackend
case "":
config.Backend = model.StableDiffusionBackend
}
sizeParts := strings.Split(input.Size, "x")
if len(sizeParts) != 2 {
return fmt.Errorf("invalid value for 'size'")
}
width, err := strconv.Atoi(sizeParts[0])
if err != nil {
return fmt.Errorf("invalid value for 'size'")
}
height, err := strconv.Atoi(sizeParts[1])
if err != nil {
return fmt.Errorf("invalid value for 'size'")
}
b64JSON := false
if input.ResponseFormat.Type == "b64_json" {
b64JSON = true
}
// src and clip_skip
var result []schema.Item
for _, i := range config.PromptStrings {
n := input.N
if input.N == 0 {
n = 1
}
for j := 0; j < n; j++ {
prompts := strings.Split(i, "|")
positive_prompt := prompts[0]
negative_prompt := ""
if len(prompts) > 1 {
negative_prompt = prompts[1]
}
mode := 0
step := config.Step
if step == 0 {
step = 15
}
if input.Mode != 0 {
mode = input.Mode
}
if input.Step != 0 {
step = input.Step
}
tempDir := ""
if !b64JSON {
tempDir = appConfig.ImageDir
}
// Create a temporary file
outputFile, err := os.CreateTemp(tempDir, "b64")
if err != nil {
return err
}
outputFile.Close()
output := outputFile.Name() + ".png"
// Rename the temporary file
err = os.Rename(outputFile.Name(), output)
if err != nil {
return err
}
baseURL := c.BaseURL()
fn, err := backend.ImageGeneration(height, width, mode, step, *config.Seed, positive_prompt, negative_prompt, src, output, ml, *config, appConfig)
if err != nil {
return err
}
if err := fn(); err != nil {
return err
}
item := &schema.Item{}
if b64JSON {
defer os.RemoveAll(output)
data, err := os.ReadFile(output)
if err != nil {
return err
}
item.B64JSON = base64.StdEncoding.EncodeToString(data)
} else {
base := filepath.Base(output)
item.URL = baseURL + "/generated-images/" + base
}
result = append(result, *item)
}
}
id := uuid.New().String()
created := int(time.Now().Unix())
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
Data: result,
}
jsonResult, _ := json.Marshal(resp)
log.Debug().Msgf("Response: %s", jsonResult)
// Return the prediction in the response body
return c.JSON(rawResponse.Value)
return c.JSON(resp)
}
}
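Two conventions in the image endpoint are easy to miss: "size" must be "<width>x<height>", and a prompt may carry a negative prompt after a "|" separator. A standalone sketch of that parsing, extracted for clarity with assumed example values:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	size := "512x768"
	prompt := "a lighthouse at dawn|blurry, low quality"

	parts := strings.Split(size, "x")
	if len(parts) != 2 {
		panic("invalid value for 'size'")
	}
	width, _ := strconv.Atoi(parts[0])
	height, _ := strconv.Atoi(parts[1])

	prompts := strings.Split(prompt, "|")
	positive := prompts[0]
	negative := ""
	if len(prompts) > 1 {
		negative = prompts[1]
	}
	fmt.Println(width, height, positive, negative)
}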


@@ -0,0 +1,55 @@
package openai
import (
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
model "github.com/go-skynet/LocalAI/pkg/model"
)
func ComputeChoices(
req *schema.OpenAIRequest,
predInput string,
config *config.BackendConfig,
o *config.ApplicationConfig,
loader *model.ModelLoader,
cb func(string, *[]schema.Choice),
tokenCallback func(string, backend.TokenUsage) bool) ([]schema.Choice, backend.TokenUsage, error) {
n := req.N // number of completions to return
result := []schema.Choice{}
if n == 0 {
n = 1
}
images := []string{}
for _, m := range req.Messages {
images = append(images, m.StringImages...)
}
// get the model function to call for the result
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
if err != nil {
return result, backend.TokenUsage{}, err
}
tokenUsage := backend.TokenUsage{}
for i := 0; i < n; i++ {
prediction, err := predFunc()
if err != nil {
return result, backend.TokenUsage{}, err
}
tokenUsage.Prompt += prediction.Usage.Prompt
tokenUsage.Completion += prediction.Usage.Completion
finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
cb(finetunedResponse, &result)
//result = append(result, Choice{Text: prediction})
}
return result, tokenUsage, err
}


@@ -10,6 +10,7 @@ func ListModelsEndpoint(lms *services.ListModelsService) func(ctx *fiber.Ctx) er
return func(c *fiber.Ctx) error {
// If blank, no filter is applied.
filter := c.Query("filter")
// By default, exclude any loose files that are already referenced by a configuration file.
excludeConfigured := c.QueryBool("excludeConfigured", true)
@@ -17,7 +18,6 @@ func ListModelsEndpoint(lms *services.ListModelsService) func(ctx *fiber.Ctx) er
if err != nil {
return err
}
return c.JSON(struct {
Object string `json:"object"`
Data []schema.OpenAIModel `json:"data"`


@@ -0,0 +1,289 @@
package openai
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"github.com/go-skynet/LocalAI/core/config"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/functions"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
)
func readRequest(c *fiber.Ctx, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
input := new(schema.OpenAIRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return "", nil, fmt.Errorf("failed parsing request body: %w", err)
}
received, _ := json.Marshal(input)
ctx, cancel := context.WithCancel(o.Context)
input.Context = ctx
input.Cancel = cancel
log.Debug().Msgf("Request received: %s", string(received))
modelFile, err := fiberContext.ModelFromContext(c, ml, input.Model, firstModel)
return modelFile, input, err
}
// this function checks if the string is a URL; if it is, it downloads the image into memory,
// encodes it in base64 and returns the base64 string
func getBase64Image(s string) (string, error) {
if strings.HasPrefix(s, "http") {
// download the image
resp, err := http.Get(s)
if err != nil {
return "", err
}
defer resp.Body.Close()
// read the image data into memory
data, err := io.ReadAll(resp.Body)
if err != nil {
return "", err
}
// encode the image data in base64
encoded := base64.StdEncoding.EncodeToString(data)
// return the base64 string
return encoded, nil
}
// if the string instead is prefixed with "data:image/...;base64,", drop it
dropPrefix := []string{"data:image/jpeg;base64,", "data:image/png;base64,"}
for _, prefix := range dropPrefix {
if strings.HasPrefix(s, prefix) {
return strings.ReplaceAll(s, prefix, ""), nil
}
}
return "", fmt.Errorf("not valid string")
}
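getBase64Image therefore accepts two shapes of input: a plain http(s) URL, which is downloaded and encoded, or an already-encoded "data:image/...;base64," string, whose prefix is stripped. A small sketch of the second path with assumed example data:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical data-URI input, as sent by OpenAI-style vision clients.
	s := "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg=="

	for _, prefix := range []string{"data:image/jpeg;base64,", "data:image/png;base64,"} {
		if strings.HasPrefix(s, prefix) {
			fmt.Println(strings.TrimPrefix(s, prefix)) // raw base64 payload handed to the backend
			return
		}
	}
	fmt.Println("would be treated as a URL and downloaded")
}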
func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIRequest) {
if input.Echo {
config.Echo = input.Echo
}
if input.TopK != nil {
config.TopK = input.TopK
}
if input.TopP != nil {
config.TopP = input.TopP
}
if input.Backend != "" {
config.Backend = input.Backend
}
if input.ClipSkip != 0 {
config.Diffusers.ClipSkip = input.ClipSkip
}
if input.ModelBaseName != "" {
config.AutoGPTQ.ModelBaseName = input.ModelBaseName
}
if input.NegativePromptScale != 0 {
config.NegativePromptScale = input.NegativePromptScale
}
if input.UseFastTokenizer {
config.UseFastTokenizer = input.UseFastTokenizer
}
if input.NegativePrompt != "" {
config.NegativePrompt = input.NegativePrompt
}
if input.RopeFreqBase != 0 {
config.RopeFreqBase = input.RopeFreqBase
}
if input.RopeFreqScale != 0 {
config.RopeFreqScale = input.RopeFreqScale
}
if input.Grammar != "" {
config.Grammar = input.Grammar
}
if input.Temperature != nil {
config.Temperature = input.Temperature
}
if input.Maxtokens != nil {
config.Maxtokens = input.Maxtokens
}
switch stop := input.Stop.(type) {
case string:
if stop != "" {
config.StopWords = append(config.StopWords, stop)
}
case []interface{}:
for _, pp := range stop {
if s, ok := pp.(string); ok {
config.StopWords = append(config.StopWords, s)
}
}
}
if len(input.Tools) > 0 {
for _, tool := range input.Tools {
input.Functions = append(input.Functions, tool.Function)
}
}
if input.ToolsChoice != nil {
var toolChoice functions.Tool
switch content := input.ToolsChoice.(type) {
case string:
_ = json.Unmarshal([]byte(content), &toolChoice)
case map[string]interface{}:
dat, _ := json.Marshal(content)
_ = json.Unmarshal(dat, &toolChoice)
}
input.FunctionCall = map[string]interface{}{
"name": toolChoice.Function.Name,
}
}
// Decode each request's message content
index := 0
for i, m := range input.Messages {
switch content := m.Content.(type) {
case string:
input.Messages[i].StringContent = content
case []interface{}:
dat, _ := json.Marshal(content)
c := []schema.Content{}
json.Unmarshal(dat, &c)
for _, pp := range c {
if pp.Type == "text" {
input.Messages[i].StringContent = pp.Text
} else if pp.Type == "image_url" {
// Detect if pp.ImageURL is a URL; if it is, download the image and encode it in base64:
base64, err := getBase64Image(pp.ImageURL.URL)
if err == nil {
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
index++
} else {
log.Error().Msgf("Failed encoding image: %s", err)
}
}
}
}
}
if input.RepeatPenalty != 0 {
config.RepeatPenalty = input.RepeatPenalty
}
if input.FrequencyPenalty != 0 {
config.FrequencyPenalty = input.FrequencyPenalty
}
if input.PresencePenalty != 0 {
config.PresencePenalty = input.PresencePenalty
}
if input.Keep != 0 {
config.Keep = input.Keep
}
if input.Batch != 0 {
config.Batch = input.Batch
}
if input.IgnoreEOS {
config.IgnoreEOS = input.IgnoreEOS
}
if input.Seed != nil {
config.Seed = input.Seed
}
if input.TypicalP != nil {
config.TypicalP = input.TypicalP
}
switch inputs := input.Input.(type) {
case string:
if inputs != "" {
config.InputStrings = append(config.InputStrings, inputs)
}
case []interface{}:
for _, pp := range inputs {
switch i := pp.(type) {
case string:
config.InputStrings = append(config.InputStrings, i)
case []interface{}:
tokens := []int{}
for _, ii := range i {
tokens = append(tokens, int(ii.(float64)))
}
config.InputToken = append(config.InputToken, tokens)
}
}
}
// Can be either a string or an object
switch fnc := input.FunctionCall.(type) {
case string:
if fnc != "" {
config.SetFunctionCallString(fnc)
}
case map[string]interface{}:
var name string
n, exists := fnc["name"]
if exists {
nn, e := n.(string)
if e {
name = nn
}
}
config.SetFunctionCallNameString(name)
}
switch p := input.Prompt.(type) {
case string:
config.PromptStrings = append(config.PromptStrings, p)
case []interface{}:
for _, pp := range p {
if s, ok := pp.(string); ok {
config.PromptStrings = append(config.PromptStrings, s)
}
}
}
}
func mergeRequestWithConfig(modelFile string, input *schema.OpenAIRequest, cm *config.BackendConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.BackendConfig, *schema.OpenAIRequest, error) {
cfg, err := cm.LoadBackendConfigFileByName(modelFile, loader.ModelPath,
config.LoadOptionDebug(debug),
config.LoadOptionThreads(threads),
config.LoadOptionContextSize(ctx),
config.LoadOptionF16(f16),
)
// Set the parameters for the language model prediction
updateRequestConfig(cfg, input)
return cfg, input, err
}
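updateRequestConfig copies per-request overrides onto the model's BackendConfig; fields like "stop" may arrive either as a single string or as a list, hence the interface{} handling. A minimal sketch of that normalization with assumed inputs:

package main

import "fmt"

func main() {
	stopWords := []string{}

	// "stop" can be a plain string or a JSON array of strings.
	var stop interface{} = []interface{}{"</s>", "###"}

	switch s := stop.(type) {
	case string:
		if s != "" {
			stopWords = append(stopWords, s)
		}
	case []interface{}:
		for _, pp := range s {
			if str, ok := pp.(string); ok {
				stopWords = append(stopWords, str)
			}
		}
	}
	fmt.Println(stopWords) // stop words collected from either form
}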


@@ -9,7 +9,8 @@ import (
"path/filepath"
"github.com/go-skynet/LocalAI/core/backend"
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
"github.com/go-skynet/LocalAI/core/config"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
@@ -22,15 +23,17 @@ import (
// @Param file formData file true "file"
// @Success 200 {object} map[string]string "Response"
// @Router /v1/audio/transcriptions [post]
func TranscriptEndpoint(fce *fiberContext.FiberContextExtractor, tbs *backend.TranscriptionBackendService) func(c *fiber.Ctx) error {
func TranscriptEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
_, request, err := fce.OpenAIRequestFromContext(c, false)
m, input, err := readRequest(c, ml, appConfig, false)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
// TODO: Investigate this file copy stuff later - potentially belongs in service.
config, input, err := mergeRequestWithConfig(m, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
}
// retrieve the file data from the request
file, err := c.FormFile("file")
if err != nil {
@@ -62,16 +65,13 @@ func TranscriptEndpoint(fce *fiberContext.FiberContextExtractor, tbs *backend.Tr
log.Debug().Msgf("Audio file copied to: %+v", dst)
request.File = dst
responseChannel := tbs.Transcribe(request)
rawResponse := <-responseChannel
if rawResponse.Error != nil {
return rawResponse.Error
tr, err := backend.ModelTranscription(dst, input.Language, ml, *config, appConfig)
if err != nil {
return err
}
log.Debug().Msgf("Transcribed: %+v", rawResponse.Value)
log.Debug().Msgf("Trascribed: %+v", tr)
// TODO: handle different outputs here
return c.Status(http.StatusOK).JSON(rawResponse.Value)
return c.Status(http.StatusOK).JSON(tr)
}
}
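The transcription endpoint expects a multipart form with a "file" part plus the usual model field. A hedged client sketch using only the standard library (file path and model name are placeholders):

package main

import (
	"bytes"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"os"
)

func main() {
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)

	_ = w.WriteField("model", "whisper-1") // placeholder model name
	part, _ := w.CreateFormFile("file", "audio.wav")
	f, err := os.Open("audio.wav") // placeholder path
	if err != nil {
		panic(err)
	}
	defer f.Close()
	_, _ = io.Copy(part, f)
	_ = w.Close()

	req, _ := http.NewRequest("POST", "http://localhost:8080/v1/audio/transcriptions", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // transcription result as JSON
}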


@@ -7,12 +7,10 @@ import (
"net/http"
"github.com/Masterminds/sprig/v3"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/internal"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
fiberhtml "github.com/gofiber/template/html/v2"
"github.com/microcosm-cc/bluemonday"
"github.com/russross/blackfriday"
)
@@ -33,40 +31,6 @@ func notFoundHandler(c *fiber.Ctx) error {
return nil
}
func welcomeRoute(
app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
auth func(*fiber.Ctx) error,
) {
if appConfig.DisableWelcomePage {
return
}
models, _ := ml.ListModels()
backendConfigs := cl.GetAllBackendConfigs()
app.Get("/", auth, func(c *fiber.Ctx) error {
summary := fiber.Map{
"Title": "LocalAI API - " + internal.PrintableVersion(),
"Version": internal.PrintableVersion(),
"Models": models,
"ModelsConfig": backendConfigs,
"ApplicationConfig": appConfig,
}
if string(c.Context().Request.Header.ContentType()) == "application/json" || len(c.Accepts("html")) == 0 {
// The client expects a JSON response
return c.Status(fiber.StatusOK).JSON(summary)
} else {
// Render index
return c.Render("views/index", summary)
}
})
}
func renderEngine() *fiberhtml.Engine {
engine := fiberhtml.NewFileSystem(http.FS(viewsfs), ".html")
engine.AddFuncMap(sprig.FuncMap())
@@ -76,5 +40,5 @@ func renderEngine() *fiberhtml.Engine {
func markDowner(args ...interface{}) template.HTML {
s := blackfriday.MarkdownCommon([]byte(fmt.Sprintf("%s", args...)))
return template.HTML(s)
return template.HTML(bluemonday.UGCPolicy().Sanitize(string(s)))
}
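The functional change here is that the rendered markdown is now passed through bluemonday's UGC policy before being marked as trusted HTML, so model- or gallery-supplied text cannot inject scripts into the web UI. A standalone sketch of the same pattern, assuming the blackfriday and bluemonday modules are available:

package main

import (
	"fmt"
	"html/template"

	"github.com/microcosm-cc/bluemonday"
	"github.com/russross/blackfriday"
)

func markDowner(args ...interface{}) template.HTML {
	s := blackfriday.MarkdownCommon([]byte(fmt.Sprintf("%s", args...)))
	// Sanitize before trusting the output as HTML.
	return template.HTML(bluemonday.UGCPolicy().Sanitize(string(s)))
}

func main() {
	fmt.Println(markDowner("**hi** <script>alert(1)</script>")) // the script tag is stripped
}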


@@ -0,0 +1,19 @@
package routes
import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/http/endpoints/elevenlabs"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
)
func RegisterElevenLabsRoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
auth func(*fiber.Ctx) error) {
// Elevenlabs
app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
}

core/http/routes/jina.go

@@ -0,0 +1,19 @@
package routes
import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/http/endpoints/jina"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
)
func RegisterJINARoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
auth func(*fiber.Ctx) error) {
// POST endpoint to mimic the reranking
app.Post("/v1/rerank", jina.JINARerankEndpoint(cl, ml, appConfig))
}


@@ -0,0 +1,65 @@
package routes
import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/internal"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/swagger"
)
func RegisterLocalAIRoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
galleryService *services.GalleryService,
auth func(*fiber.Ctx) error) {
app.Get("/swagger/*", swagger.HandlerDefault) // default
// LocalAI API endpoints
modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
app.Post("/models/delete/:name", auth, modelGalleryEndpointService.DeleteModelGalleryEndpoint())
app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
app.Post("/models/galleries", auth, modelGalleryEndpointService.AddModelGalleryEndpoint())
app.Delete("/models/galleries", auth, modelGalleryEndpointService.RemoveModelGalleryEndpoint())
app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())
app.Post("/tts", auth, localai.TTSEndpoint(cl, ml, appConfig))
// Stores
sl := model.NewModelLoader("")
app.Post("/stores/set", auth, localai.StoresSetEndpoint(sl, appConfig))
app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(sl, appConfig))
app.Post("/stores/get", auth, localai.StoresGetEndpoint(sl, appConfig))
app.Post("/stores/find", auth, localai.StoresFindEndpoint(sl, appConfig))
// Kubernetes health checks
ok := func(c *fiber.Ctx) error {
return c.SendStatus(200)
}
app.Get("/healthz", ok)
app.Get("/readyz", ok)
app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())
// Experimental Backend Statistics Module
backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now
app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(backendMonitorService))
app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(backendMonitorService))
app.Get("/version", auth, func(c *fiber.Ctx) error {
return c.JSON(struct {
Version string `json:"version"`
}{Version: internal.PrintableVersion()})
})
}


@@ -0,0 +1,88 @@
package routes
import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
)
func RegisterOpenAIRoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
auth func(*fiber.Ctx) error) {
// openAI compatible API endpoint
// chat
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
// edit
app.Post("/v1/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
app.Post("/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
// assistant
app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
app.Get("/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
// files
app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
app.Post("/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
app.Get("/v1/files", auth, openai.ListFilesEndpoint(cl, appConfig))
app.Get("/files", auth, openai.ListFilesEndpoint(cl, appConfig))
app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
// completion
app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
// embeddings
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
// audio
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, ml, appConfig))
app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(cl, ml, appConfig))
// images
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, ml, appConfig))
if appConfig.ImageDir != "" {
app.Static("/generated-images", appConfig.ImageDir)
}
if appConfig.AudioDir != "" {
app.Static("/generated-audio", appConfig.AudioDir)
}
// models
tmpLMS := services.NewListModelsService(ml, cl, appConfig) // TODO: once createApplication() is fully in use, reference the central instance.
app.Get("/v1/models", auth, openai.ListModelsEndpoint(tmpLMS))
app.Get("/models", auth, openai.ListModelsEndpoint(tmpLMS))
}
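For orientation, a hedged sketch of how these registration helpers are wired into a Fiber app. The loader and config values are left nil or use placeholder paths here, and the auth middleware is a no-op; this shows the call shape only, not the application's real startup code.

package main

import (
	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/http/routes"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/gofiber/fiber/v2"
)

func main() {
	app := fiber.New()

	// Assume these are built by the application's startup code; constructors are omitted here.
	var cl *config.BackendConfigLoader
	var appConfig *config.ApplicationConfig
	ml := model.NewModelLoader("./models") // placeholder model path

	auth := func(c *fiber.Ctx) error { return c.Next() } // no-op auth middleware for the sketch

	routes.RegisterOpenAIRoutes(app, cl, ml, appConfig, auth)
	routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig, auth)
	routes.RegisterJINARoutes(app, cl, ml, appConfig, auth)

	_ = app.Listen(":8080")
}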

core/http/routes/ui.go

@@ -0,0 +1,336 @@
package routes
import (
"fmt"
"html/template"
"sort"
"strings"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/http/elements"
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
"github.com/go-skynet/LocalAI/core/services"
"github.com/go-skynet/LocalAI/internal"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/xsync"
"github.com/rs/zerolog/log"
"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
)
func RegisterUIRoutes(app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
galleryService *services.GalleryService,
auth func(*fiber.Ctx) error) {
// keeps the state of models that are being installed from the UI
var processingModels = xsync.NewSyncedMap[string, string]()
// modelStatus returns the current status of the models being processed (installation or deletion)
// it is called asynchronously from the UI
modelStatus := func() (map[string]string, map[string]string) {
processingModelsData := processingModels.Map()
taskTypes := map[string]string{}
for k, v := range processingModelsData {
status := galleryService.GetStatus(v)
taskTypes[k] = "Installation"
if status != nil && status.Deletion {
taskTypes[k] = "Deletion"
} else if status == nil {
taskTypes[k] = "Waiting"
}
}
return processingModelsData, taskTypes
}
app.Get("/", auth, localai.WelcomeEndpoint(appConfig, cl, ml, modelStatus))
// Show the Models page (all models)
app.Get("/browse", auth, func(c *fiber.Ctx) error {
term := c.Query("term")
models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
// Get all available tags
allTags := map[string]struct{}{}
tags := []string{}
for _, m := range models {
for _, t := range m.Tags {
allTags[t] = struct{}{}
}
}
for t := range allTags {
tags = append(tags, t)
}
sort.Strings(tags)
if term != "" {
models = gallery.GalleryModels(models).Search(term)
}
// Get model statuses
processingModelsData, taskTypes := modelStatus()
summary := fiber.Map{
"Title": "LocalAI - Models",
"Version": internal.PrintableVersion(),
"Models": template.HTML(elements.ListModels(models, processingModels, galleryService)),
"Repositories": appConfig.Galleries,
"AllTags": tags,
"ProcessingModels": processingModelsData,
"TaskTypes": taskTypes,
// "ApplicationConfig": appConfig,
}
// Render index
return c.Render("views/models", summary)
})
// Show the models, filtered from the user input
// https://htmx.org/examples/active-search/
app.Post("/browse/search/models", auth, func(c *fiber.Ctx) error {
form := struct {
Search string `form:"search"`
}{}
if err := c.BodyParser(&form); err != nil {
return c.Status(fiber.StatusBadRequest).SendString(err.Error())
}
models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
return c.SendString(elements.ListModels(gallery.GalleryModels(models).Search(form.Search), processingModels, galleryService))
})
/*
Install routes
*/
// This route is used when the "Install" button is pressed, we submit here a new job to the gallery service
// https://htmx.org/examples/progress-bar/
app.Post("/browse/install/model/:id", auth, func(c *fiber.Ctx) error {
galleryID := strings.Clone(c.Params("id")) // note: strings.Clone is required for multiple requests!
log.Debug().Msgf("UI job submitted to install : %+v\n", galleryID)
id, err := uuid.NewUUID()
if err != nil {
return err
}
uid := id.String()
processingModels.Set(galleryID, uid)
op := gallery.GalleryOp{
Id: uid,
GalleryModelName: galleryID,
Galleries: appConfig.Galleries,
}
go func() {
galleryService.C <- op
}()
return c.SendString(elements.StartProgressBar(uid, "0", "Installation"))
})
// This route is used when the "Delete" button is pressed, we submit here a new job to the gallery service
// https://htmx.org/examples/progress-bar/
app.Post("/browse/delete/model/:id", auth, func(c *fiber.Ctx) error {
galleryID := strings.Clone(c.Params("id")) // note: strings.Clone is required for multiple requests!
log.Debug().Msgf("UI job submitted to delete : %+v\n", galleryID)
var galleryName = galleryID
if strings.Contains(galleryID, "@") {
// if the galleryID contains a @ it means that it's a model from a gallery
// but we want to delete it from the local models which does not need
// a repository ID
galleryName = strings.Split(galleryID, "@")[1]
}
id, err := uuid.NewUUID()
if err != nil {
return err
}
uid := id.String()
// Track the deletion job by galleryID and galleryName
// The GalleryID contains information about the repository,
// while the GalleryName is ONLY the name of the model
processingModels.Set(galleryName, uid)
processingModels.Set(galleryID, uid)
op := gallery.GalleryOp{
Id: uid,
Delete: true,
GalleryModelName: galleryName,
}
go func() {
galleryService.C <- op
cl.RemoveBackendConfig(galleryName)
}()
return c.SendString(elements.StartProgressBar(uid, "0", "Deletion"))
})
// Display the job current progress status
// If the job is done, we trigger the /browse/job/:uid route
// https://htmx.org/examples/progress-bar/
app.Get("/browse/job/progress/:uid", auth, func(c *fiber.Ctx) error {
jobUID := strings.Clone(c.Params("uid")) // note: strings.Clone is required for multiple requests!
status := galleryService.GetStatus(jobUID)
if status == nil {
//fmt.Errorf("could not find any status for ID")
return c.SendString(elements.ProgressBar("0"))
}
if status.Progress == 100 {
c.Set("HX-Trigger", "done") // this triggers /browse/job/:uid (which is when the job is done)
return c.SendString(elements.ProgressBar("100"))
}
if status.Error != nil {
return c.SendString(elements.ErrorProgress(status.Error.Error(), status.GalleryModelName))
}
return c.SendString(elements.ProgressBar(fmt.Sprint(status.Progress)))
})
// this route is hit when the job is done, and we display the
// final state (for now just displays "Installation completed")
app.Get("/browse/job/:uid", auth, func(c *fiber.Ctx) error {
jobUID := strings.Clone(c.Params("uid")) // note: strings.Clone is required for multiple requests!
status := galleryService.GetStatus(jobUID)
galleryID := ""
for _, k := range processingModels.Keys() {
if processingModels.Get(k) == jobUID {
galleryID = k
processingModels.Delete(k)
}
}
if galleryID == "" {
log.Debug().Msgf("no processing model found for job : %+v\n", jobUID)
}
log.Debug().Msgf("JOB finished : %+v\n", status)
showDelete := true
displayText := "Installation completed"
if status.Deletion {
showDelete = false
displayText = "Deletion completed"
}
return c.SendString(elements.DoneProgress(galleryID, displayText, showDelete))
})
// Show the Chat page
app.Get("/chat/:model", auth, func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
summary := fiber.Map{
"Title": "LocalAI - Chat with " + c.Params("model"),
"ModelsConfig": backendConfigs,
"Model": c.Params("model"),
"Version": internal.PrintableVersion(),
}
// Render index
return c.Render("views/chat", summary)
})
app.Get("/chat/", auth, func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
return c.Redirect("/")
}
summary := fiber.Map{
"Title": "LocalAI - Chat with " + backendConfigs[0].Name,
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0].Name,
"Version": internal.PrintableVersion(),
}
// Render index
return c.Render("views/chat", summary)
})
app.Get("/text2image/:model", auth, func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
summary := fiber.Map{
"Title": "LocalAI - Generate images with " + c.Params("model"),
"ModelsConfig": backendConfigs,
"Model": c.Params("model"),
"Version": internal.PrintableVersion(),
}
// Render index
return c.Render("views/text2image", summary)
})
app.Get("/text2image/", auth, func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
return c.Redirect("/")
}
summary := fiber.Map{
"Title": "LocalAI - Generate images with " + backendConfigs[0].Name,
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0].Name,
"Version": internal.PrintableVersion(),
}
// Render index
return c.Render("views/text2image", summary)
})
app.Get("/tts/:model", auth, func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
summary := fiber.Map{
"Title": "LocalAI - Generate images with " + c.Params("model"),
"ModelsConfig": backendConfigs,
"Model": c.Params("model"),
"Version": internal.PrintableVersion(),
}
// Render index
return c.Render("views/tts", summary)
})
app.Get("/tts/", auth, func(c *fiber.Ctx) error {
backendConfigs := cl.GetAllBackendConfigs()
if len(backendConfigs) == 0 {
// If no model is available redirect to the index which suggests how to install models
return c.Redirect("/")
}
summary := fiber.Map{
"Title": "LocalAI - Generate audio with " + backendConfigs[0].Name,
"ModelsConfig": backendConfigs,
"Model": backendConfigs[0].Name,
"Version": internal.PrintableVersion(),
}
// Render index
return c.Render("views/tts", summary)
})
}

Some files were not shown because too many files have changed in this diff.