Mirror of https://github.com/mudler/LocalAI.git
Compare commits: v2.13.0 ... revert-205
1 commit: 47743b74ab
@@ -5,7 +5,4 @@ models
examples/chatbot-ui/models
examples/rwkv/models
examples/**/models
Dockerfile*

# SonarQube
.scannerwork
Dockerfile*
.github/workflows/generate_grpc_cache.yaml (vendored): 90 changed lines

@@ -1,90 +0,0 @@
name: 'generate and publish GRPC docker caches'

on:
- workflow_dispatch

concurrency:
  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
  cancel-in-progress: true

jobs:
  generate_caches:
    strategy:
      matrix:
        include:
          - grpc-base-image: ubuntu:22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
    steps:
      - name: Release space from worker
        if: matrix.runs-on == 'ubuntu-latest'
        run: |
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          df -h
          echo
          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
          sudo rm -rf /usr/local/lib/android
          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
          sudo rm -rf /usr/share/dotnet
          sudo apt-get remove -y '^mono-.*' || true
          sudo apt-get remove -y '^ghc-.*' || true
          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
          sudo apt-get remove -y 'php.*' || true
          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
          sudo apt-get remove -y '^google-.*' || true
          sudo apt-get remove -y azure-cli || true
          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
          sudo apt-get remove -y '^gfortran-.*' || true
          sudo apt-get remove -y microsoft-edge-stable || true
          sudo apt-get remove -y firefox || true
          sudo apt-get remove -y powershell || true
          sudo apt-get remove -y r-base-core || true
          sudo apt-get autoremove -y
          sudo apt-get clean
          echo
          echo "Listing top largest packages"
          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
          head -n 30 <<< "${pkgs}"
          echo
          sudo rm -rfv build || true
          sudo rm -rf /usr/share/dotnet || true
          sudo rm -rf /opt/ghc || true
          sudo rm -rf "/usr/local/share/boost" || true
          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
          df -h

      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
        with:
          platforms: all

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@master

      - name: Checkout
        uses: actions/checkout@v4

      - name: Cache GRPC
        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
          # This means that even the MAKEFLAGS have to be an EXACT match.
          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
          build-args: |
            GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
            MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.58.0
          context: .
          file: ./Dockerfile
          cache-to: type=gha,ignore-error=true
          target: grpc
          platforms: ${{ matrix.platforms }}
          push: false
.github/workflows/image-pr.yml (vendored): 13 changed lines

@@ -22,7 +22,6 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -62,14 +61,12 @@ jobs:
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: 'sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
@@ -88,7 +85,6 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
@@ -106,12 +102,11 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: 'sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
@@ -127,4 +122,4 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=5 --output-sync=target"
.github/workflows/image.yml (vendored): 34 changed lines

@@ -26,7 +26,6 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
@@ -130,7 +129,6 @@ jobs:
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
@@ -142,14 +140,12 @@ jobs:
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
@@ -161,8 +157,7 @@ jobs:
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
@@ -175,8 +170,7 @@ jobs:
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-core'
ffmpeg: 'false'
image-type: 'core'
@@ -185,8 +179,7 @@ jobs:
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
@@ -195,8 +188,7 @@ jobs:
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
@@ -205,8 +197,7 @@ jobs:
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
grpc-base-image: "ubuntu:22.04"
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
@@ -219,7 +210,6 @@ jobs:
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
@@ -229,7 +219,6 @@ jobs:
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"

@@ -247,7 +236,6 @@ jobs:
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
@@ -270,7 +258,7 @@ jobs:
aio: "-aio-cpu"
latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -281,7 +269,7 @@ jobs:
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -292,7 +280,7 @@ jobs:
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -303,7 +291,7 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -314,4 +302,4 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=5 --output-sync=target"
.github/workflows/image_build.yml (vendored): 15 changed lines

@@ -6,10 +6,6 @@ on:
inputs:
base-image:
description: 'Base image'
required: true
type: string
grpc-base-image:
description: 'GRPC Base image, must be a compatible image with base-image'
required: false
default: ''
type: string
@@ -61,7 +57,7 @@ on:
makeflags:
description: 'Make Flags'
required: false
default: '--jobs=4 --output-sync=target'
default: '--jobs=3 --output-sync=target'
type: string
aio:
description: 'AIO Image Name'
@@ -205,16 +201,15 @@ jobs:
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
# The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
# This means that even the MAKEFLAGS have to be an EXACT match.
# If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
build-args: |
GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
MAKEFLAGS=--jobs=4 --output-sync=target
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
MAKEFLAGS=${{ inputs.makeflags }}
GRPC_VERSION=v1.58.0
context: .
file: ./Dockerfile
cache-from: type=gha
cache-to: type=gha,ignore-error=true
target: grpc
platforms: ${{ inputs.platforms }}
push: false
.github/workflows/test-extra.yml (vendored): 31 changed lines

@@ -74,37 +74,6 @@ jobs:
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test

tests-rerankers:
runs-on: ubuntu-latest
steps:
- name: Clone
  uses: actions/checkout@v4
  with:
    submodules: true
- name: Dependencies
  run: |
    sudo apt-get update
    sudo apt-get install build-essential ffmpeg
    curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
    sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
    sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
    sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
    sudo apt-get update && \
    sudo apt-get install -y conda
    sudo apt-get install -y ca-certificates cmake curl patch python3-pip
    sudo apt-get install -y libopencv-dev
    pip install --user grpcio-tools

    sudo rm -rfv /usr/bin/conda || true

- name: Test rerankers
  run: |
    export PATH=$PATH:/opt/conda/bin
    make --jobs=5 --output-sync=target -C backend/python/rerankers
    make --jobs=5 --output-sync=target -C backend/python/rerankers test

tests-diffusers:
runs-on: ubuntu-latest
steps:
.github/workflows/test.yml (vendored): 8 changed lines

@@ -123,9 +123,7 @@ jobs:
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true

tests-aio-container:
runs-on: ubuntu-latest
@@ -178,9 +176,7 @@ jobs:
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true

tests-apple:
runs-on: macOS-14
@@ -215,6 +211,4 @@ jobs:
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
connect-timeout-seconds: 180
.github/workflows/update_swagger.yaml (vendored): 31 changed lines

@@ -1,31 +0,0 @@
name: Update swagger
on:
  schedule:
    - cron: 0 20 * * *
  workflow_dispatch:
jobs:
  swagger:
    strategy:
      fail-fast: false
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version: 'stable'
      - run: |
          go install github.com/swaggo/swag/cmd/swag@latest
      - name: Bump swagger 🔧
        run: |
          make swagger
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: 'feat(swagger): update swagger'
          title: 'feat(swagger): update swagger'
          branch: "update/swagger"
          body: Update swagger
          signoff: true
.github/workflows/yaml-check.yml (vendored): 18 changed lines

@@ -1,18 +0,0 @@
name: 'Yamllint GitHub Actions'
on:
- pull_request
jobs:
  yamllint:
    name: 'Yamllint'
    runs-on: ubuntu-latest
    steps:
      - name: 'Checkout'
        uses: actions/checkout@master
      - name: 'Yamllint'
        uses: karancode/yamllint-github-action@master
        with:
          yamllint_file_or_dir: 'gallery'
          yamllint_strict: false
          yamllint_comment: true
        env:
          GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.gitignore (vendored): 3 changed lines

@@ -44,6 +44,3 @@ prepare
*.pb.go
*pb2.py
*pb2_grpc.py

# SonarQube
.scannerwork
Dockerfile: 29 changed lines

@@ -1,9 +1,8 @@
ARG IMAGE_TYPE=extras
ARG BASE_IMAGE=ubuntu:22.04
ARG GRPC_BASE_IMAGE=${BASE_IMAGE}

# extras or core
FROM ${BASE_IMAGE} AS requirements-core
FROM ${BASE_IMAGE} as requirements-core

USER root

@@ -16,7 +15,7 @@ ARG TARGETVARIANT

ENV BUILD_TYPE=${BUILD_TYPE}
ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"

ARG GO_TAGS="stablediffusion tinydream tts"

@@ -24,7 +23,7 @@ RUN apt-get update && \
    apt-get install -y ca-certificates curl python3-pip unzip && apt-get clean

# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
ENV PATH $PATH:/usr/local/go/bin

# Install grpc compilers
@@ -80,7 +79,7 @@ RUN test -n "$TARGETARCH" \
###################################
###################################

FROM requirements-core AS requirements-extras
FROM requirements-core as requirements-extras

RUN apt install -y gpg && \
    curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
@@ -105,7 +104,7 @@ RUN if [ ! -e /usr/bin/python ]; then \
###################################
###################################

FROM ${GRPC_BASE_IMAGE} AS grpc
FROM ${BASE_IMAGE} as grpc

ARG MAKEFLAGS
ARG GRPC_VERSION=v1.58.0
@@ -121,15 +120,16 @@ RUN apt-get update && \

RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc

WORKDIR /build/grpc/cmake/build

RUN cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
RUN cd grpc && \
    mkdir -p cmake/build && \
    cd cmake/build && \
    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
    make

###################################
###################################

FROM requirements-${IMAGE_TYPE} AS builder
FROM requirements-${IMAGE_TYPE} as builder

ARG GO_TAGS="stablediffusion tts"
ARG GRPC_BACKENDS
@@ -167,11 +167,9 @@ RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

COPY --from=grpc /build/grpc ./grpc/

WORKDIR /build/grpc/cmake/build
RUN make install
RUN cd /build/grpc/cmake/build && make install

# Rebuild with defaults backends
WORKDIR /build
RUN make build

RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
@@ -259,9 +257,6 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/sentencetransformers \
    ; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/rerankers \
    ; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
    make -C backend/python/transformers \
    ; fi
@@ -292,7 +287,7 @@ RUN mkdir -p /build/models

# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
    CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
    CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1

VOLUME /build/models
EXPOSE 8080
Makefile: 103 changed lines

@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=46e12c4692a37bdd31a0432fc5153d7d22bc7f72
CPPLLAMA_VERSION?=7593639ce335e8d7f89aa9a54d616951f273af60

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

# whisper.cpp version
WHISPER_CPP_VERSION?=858452d58dba3acdc3431c9bced2bb8cfd9bf418
WHISPER_CPP_VERSION?=b0c3cbf2e851cf232e432b590dcc514a689ec028

# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
@@ -25,7 +25,7 @@ BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759

# stablediffusion version
STABLEDIFFUSION_VERSION?=433ea6d9b64d9d08067324a757ef07040ea29568
STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485

# tinydream version
TINYDREAM_VERSION?=22a12a4bc0ac5455856f28f3b771331a551a4293
@@ -179,20 +179,20 @@ endif
all: help

## BERT embeddings
sources/go-bert.cpp:
    git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert.cpp
    cd sources/go-bert.cpp && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
sources/go-bert:
    git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
    cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1

sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
    $(MAKE) -C sources/go-bert.cpp libgobert.a
sources/go-bert/libgobert.a: sources/go-bert
    $(MAKE) -C sources/go-bert libgobert.a

## go-llama.cpp
sources/go-llama.cpp:
    git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama.cpp
    cd sources/go-llama.cpp && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
## go-llama-ggml
sources/go-llama-ggml:
    git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
    cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
    $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
    $(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

## go-piper
sources/go-piper:
@@ -211,12 +211,12 @@ sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
    $(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

## RWKV
sources/go-rwkv.cpp:
    git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv.cpp
    cd sources/go-rwkv.cpp && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
sources/go-rwkv:
    git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
    cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1

sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
    cd sources/go-rwkv.cpp && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
sources/go-rwkv/librwkv.a: sources/go-rwkv
    cd sources/go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..

## stable diffusion
sources/go-stable-diffusion:
@@ -236,24 +236,23 @@ sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream

## whisper
sources/whisper.cpp:
    git clone https://github.com/ggerganov/whisper.cpp sources/whisper.cpp
    git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
    cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1

sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
    cd sources/whisper.cpp && make libwhisper.a

get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream
get-sources: sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream

replace:
    $(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
    $(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
    $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
    $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
    $(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
    $(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
    $(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
    $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
    $(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
    $(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
    $(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp

dropreplace:
    $(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
@@ -272,12 +271,12 @@ prepare-sources: get-sources replace
## GENERIC
rebuild: ## Rebuilds the project
    $(GOCMD) clean -cache
    $(MAKE) -C sources/go-llama.cpp clean
    $(MAKE) -C sources/go-llama-ggml clean
    $(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
    $(MAKE) -C sources/go-rwkv.cpp clean
    $(MAKE) -C sources/go-rwkv clean
    $(MAKE) -C sources/whisper.cpp clean
    $(MAKE) -C sources/go-stable-diffusion clean
    $(MAKE) -C sources/go-bert.cpp clean
    $(MAKE) -C sources/go-bert clean
    $(MAKE) -C sources/go-piper clean
    $(MAKE) -C sources/go-tiny-dream clean
    $(MAKE) build
@@ -302,6 +301,9 @@ clean-tests:
    rm -rf test-dir
    rm -rf core/http/backend-assets

halt-backends: ## Used to clean up stray backends sometimes left running when debugging manually
    ps | grep 'backend-assets/grpc/' | awk '{print $$1}' | xargs -I {} kill -9 {}

## Build:
build: prepare backend-assets grpcs ## Build the project
    $(info ${GREEN}I local-ai build info:${RESET})
@@ -366,13 +368,13 @@ run-e2e-image:

run-e2e-aio:
    @echo 'Running e2e AIO tests'
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio

test-e2e:
    @echo 'Running e2e tests'
    BUILD_TYPE=$(BUILD_TYPE) \
    LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e

teardown-e2e:
    rm -rf $(TEST_DIR) || true
@@ -380,15 +382,15 @@ teardown-e2e:

test-gpt4all: prepare-test
    TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-llama: prepare-test
    TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-llama-gguf: prepare-test
    TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-tts: prepare-test
    TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
@@ -437,10 +439,10 @@ protogen-go-clean:
    $(RM) bin/*

.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen

.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean

.PHONY: autogptq-protogen
autogptq-protogen:
@@ -506,14 +508,6 @@ petals-protogen:
petals-protogen-clean:
    $(MAKE) -C backend/python/petals protogen-clean

.PHONY: rerankers-protogen
rerankers-protogen:
    $(MAKE) -C backend/python/rerankers protogen

.PHONY: rerankers-protogen-clean
rerankers-protogen-clean:
    $(MAKE) -C backend/python/rerankers protogen-clean

.PHONY: sentencetransformers-protogen
sentencetransformers-protogen:
    $(MAKE) -C backend/python/sentencetransformers protogen
@@ -572,7 +566,6 @@ prepare-extra-conda-environments: protogen-python
    $(MAKE) -C backend/python/vllm
    $(MAKE) -C backend/python/mamba
    $(MAKE) -C backend/python/sentencetransformers
    $(MAKE) -C backend/python/rerankers
    $(MAKE) -C backend/python/transformers
    $(MAKE) -C backend/python/transformers-musicgen
    $(MAKE) -C backend/python/parler-tts
@@ -608,8 +601,8 @@ backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/
backend-assets/grpc: protogen-go replace
    mkdir -p backend-assets/grpc

backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
    $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/

backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
@@ -651,16 +644,20 @@ ifeq ($(BUILD_TYPE),metal)
    cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
endif

backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libbinding.a backend-assets/grpc
    $(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
    $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/

# EXPERIMENTAL:
ifeq ($(BUILD_TYPE),metal)
    cp $(CURDIR)/sources/go-llama-ggml/llama.cpp/ggml-metal.metal backend-assets/grpc/
endif
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
    CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
    $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/

backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
backend-assets/grpc/rwkv: sources/go-rwkv sources/go-rwkv/librwkv.a backend-assets/grpc
    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
    $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv

backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
@@ -707,7 +704,7 @@ docker-aio-all:

docker-image-intel:
    docker build \
        --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 \
        --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
        --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
        --build-arg GO_TAGS="none" \
        --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -715,7 +712,7 @@ docker-image-intel:

docker-image-intel-xpu:
    docker build \
        --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 \
        --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
        --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
        --build-arg GO_TAGS="none" \
        --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -723,4 +720,4 @@ docker-image-intel-xpu:

.PHONY: swagger
swagger:
    swag init -g core/http/app.go --output swagger
    swag init -g core/http/api.go --output swagger
@@ -44,19 +44,18 @@

[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)

**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.

## 🔥🔥 Hot topics / Roadmap

[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

- Reranker API: https://github.com/mudler/LocalAI/pull/2121
- Gallery WebUI: https://github.com/mudler/LocalAI/pull/2104
- llama3: https://github.com/mudler/LocalAI/discussions/2076
- Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
- Landing page: https://github.com/mudler/LocalAI/pull/1922
- Openvino support: https://github.com/mudler/LocalAI/pull/1892
- Vector store: https://github.com/mudler/LocalAI/pull/1795
- All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726 / Tools API support: https://github.com/mudler/LocalAI/pull/1715

Hot topics (looking for contributors):
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
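Editor's note, not part of the diff: since the README hunk above describes LocalAI as a drop-in, OpenAI-compatible REST API, a minimal client sketch may help show what that means in practice. It assumes a LocalAI instance listening on port 8080 (the port exposed by the Dockerfile) and a locally configured model; the model name `gpt-4` used here is only a placeholder alias.

```python
# Minimal sketch, assuming a LocalAI server on localhost:8080 and a configured
# model alias named "gpt-4" (placeholder). Uses the standard openai>=1.0 client.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

completion = client.chat.completions.create(
    model="gpt-4",  # placeholder: any model name configured in your LocalAI instance
    messages=[{"role": "user", "content": "Hello from LocalAI"}],
)
print(completion.choices[0].message.content)
```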
@@ -1,27 +0,0 @@
name: jina-reranker-v1-base-en
backend: rerankers
parameters:
  model: cross-encoder

usage: |
    You can test this model with curl like this:

    curl http://localhost:8080/v1/rerank \
      -H "Content-Type: application/json" \
      -d '{
      "model": "jina-reranker-v1-base-en",
      "query": "Organic skincare products for sensitive skin",
      "documents": [
        "Eco-friendly kitchenware for modern homes",
        "Biodegradable cleaning supplies for eco-conscious consumers",
        "Organic cotton baby clothes for sensitive skin",
        "Natural organic skincare range for sensitive skin",
        "Tech gadgets for smart homes: 2024 edition",
        "Sustainable gardening tools and compost solutions",
        "Sensitive skin-friendly facial cleansers and toners",
        "Organic food wraps and storage solutions",
        "All-natural pet food for dogs with allergies",
        "Yoga mats made from recycled materials"
      ],
      "top_n": 3
    }'
@@ -6,22 +6,15 @@ parameters:
template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
    {{- if .FunctionCall }}
    <tool_call>
    {{- else if eq .RoleName "tool" }}
    <tool_response>
    {{- end }}
    {{- if .FunctionCall }}<tool_call>{{end}}
    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
    {{- if .Content}}
    {{.Content }}
    {{.Content}}
    {{- end }}
    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
    {{- if .FunctionCall }}</tool_call>{{end }}
    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
    <|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -36,7 +29,8 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
    </tool_call><|im_end|>
    </tool_call>
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>

@@ -129,7 +129,7 @@ detect_gpu
detect_gpu_size

PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"

check_vars

@@ -1,27 +0,0 @@
name: jina-reranker-v1-base-en
backend: rerankers
parameters:
  model: cross-encoder

usage: |
    You can test this model with curl like this:

    curl http://localhost:8080/v1/rerank \
      -H "Content-Type: application/json" \
      -d '{
      "model": "jina-reranker-v1-base-en",
      "query": "Organic skincare products for sensitive skin",
      "documents": [
        "Eco-friendly kitchenware for modern homes",
        "Biodegradable cleaning supplies for eco-conscious consumers",
        "Organic cotton baby clothes for sensitive skin",
        "Natural organic skincare range for sensitive skin",
        "Tech gadgets for smart homes: 2024 edition",
        "Sustainable gardening tools and compost solutions",
        "Sensitive skin-friendly facial cleansers and toners",
        "Organic food wraps and storage solutions",
        "All-natural pet food for dogs with allergies",
        "Yoga mats made from recycled materials"
      ],
      "top_n": 3
    }'
@@ -6,22 +6,15 @@ parameters:
template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
    {{- if .FunctionCall }}
    <tool_call>
    {{- else if eq .RoleName "tool" }}
    <tool_response>
    {{- end }}
    {{- if .FunctionCall }}<tool_call>{{end}}
    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
    {{- if .Content}}
    {{.Content }}
    {{.Content}}
    {{- end }}
    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
    {{- if .FunctionCall }}</tool_call>{{end }}
    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
    <|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -36,7 +29,8 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
    </tool_call><|im_end|>
    </tool_call>
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>

@@ -1,27 +0,0 @@
name: jina-reranker-v1-base-en
backend: rerankers
parameters:
  model: cross-encoder

usage: |
    You can test this model with curl like this:

    curl http://localhost:8080/v1/rerank \
      -H "Content-Type: application/json" \
      -d '{
      "model": "jina-reranker-v1-base-en",
      "query": "Organic skincare products for sensitive skin",
      "documents": [
        "Eco-friendly kitchenware for modern homes",
        "Biodegradable cleaning supplies for eco-conscious consumers",
        "Organic cotton baby clothes for sensitive skin",
        "Natural organic skincare range for sensitive skin",
        "Tech gadgets for smart homes: 2024 edition",
        "Sustainable gardening tools and compost solutions",
        "Sensitive skin-friendly facial cleansers and toners",
        "Organic food wraps and storage solutions",
        "All-natural pet food for dogs with allergies",
        "Yoga mats made from recycled materials"
      ],
      "top_n": 3
    }'
@@ -7,22 +7,15 @@ parameters:
template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
    {{- if .FunctionCall }}
    <tool_call>
    {{- else if eq .RoleName "tool" }}
    <tool_response>
    {{- end }}
    {{- if .FunctionCall }}<tool_call>{{end}}
    {{- if eq .RoleName "tool" }}<tool_result>{{end }}
    {{- if .Content}}
    {{.Content }}
    {{.Content}}
    {{- end }}
    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
    {{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
    {{- if .FunctionCall }}</tool_call>{{end }}
    {{- if eq .RoleName "tool" }}</tool_result>{{end }}
    <|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
@@ -37,7 +30,8 @@ template:
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
    </tool_call><|im_end|>
    </tool_call>
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>

@@ -23,30 +23,6 @@ service Backend {
  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}

  rpc Rerank(RerankRequest) returns (RerankResult) {}
}

message RerankRequest {
  string query = 1;
  repeated string documents = 2;
  int32 top_n = 3;
}

message RerankResult {
  Usage usage = 1;
  repeated DocumentResult results = 2;
}

message Usage {
  int32 total_tokens = 1;
  int32 prompt_tokens = 2;
}

message DocumentResult {
  int32 index = 1;
  string text = 2;
  float relevance_score = 3;
}

message StoresKey {
@@ -201,7 +177,6 @@ message ModelOptions {
  bool EnforceEager = 52;
  int32 SwapSpace = 53;
  int32 MaxModelLen = 54;
  int32 TensorParallelSize = 55;

  string MMProj = 41;

@@ -11,8 +11,8 @@ import (
    "github.com/go-skynet/LocalAI/core/schema"
)

func ffmpegCommand(args []string) (string, error) {
    cmd := exec.Command("ffmpeg", args...) // Constrain this to ffmpeg to permit security scanner to see that the command is safe.
func runCommand(command []string) (string, error) {
    cmd := exec.Command(command[0], command[1:]...)
    cmd.Env = os.Environ()
    out, err := cmd.CombinedOutput()
    return string(out), err
@@ -21,16 +21,16 @@ func ffmpegCommand(args []string) (string, error) {
// AudioToWav converts audio to wav for transcribe.
// TODO: use https://github.com/mccoyst/ogg?
func audioToWav(src, dst string) error {
    commandArgs := []string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
    out, err := ffmpegCommand(commandArgs)
    command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
    out, err := runCommand(command)
    if err != nil {
        return fmt.Errorf("error: %w out: %s", err, out)
    }
    return nil
}

func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.Result, error) {
    res := schema.Result{}
func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.TranscriptionResult, error) {
    res := schema.TranscriptionResult{}

    dir, err := os.MkdirTemp("", "whisper")
    if err != nil {
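Editor's note, not part of the diff: the hunk above shells out to ffmpeg to resample audio to 16 kHz mono signed 16-bit PCM before transcription. A rough sketch of the same conversion outside Go follows, mirroring the exact ffmpeg arguments used by audioToWav; the file names are placeholders and ffmpeg is assumed to be installed and on PATH.

```python
# Rough sketch mirroring the ffmpeg arguments used by audioToWav above.
# Assumes ffmpeg is installed and on PATH; src/dst paths are placeholders.
import subprocess

def audio_to_wav(src: str, dst: str) -> None:
    cmd = ["ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000",
           "-ac", "1", "-acodec", "pcm_s16le", dst]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"ffmpeg failed: {proc.stdout}{proc.stderr}")

audio_to_wav("input.ogg", "output.wav")  # placeholder file names
```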
@@ -21,6 +21,6 @@ func (sd *Whisper) Load(opts *pb.ModelOptions) error {
    return err
}

func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.Result, error) {
func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
    return Transcript(sd.whisper, opts.Dst, opts.Language, uint(opts.Threads))
}

@@ -120,6 +120,4 @@ dependencies:
  - transformers>=4.38.2 # Updated Version
  - transformers_stream_generator==0.0.5
  - xformers==0.0.23.post1
  - rerankers[transformers]
  - pydantic
prefix: /opt/conda/envs/transformers

@@ -108,6 +108,4 @@ dependencies:
  - transformers>=4.38.2 # Updated Version
  - transformers_stream_generator==0.0.5
  - xformers==0.0.23.post1
  - rerankers[transformers]
  - pydantic
prefix: /opt/conda/envs/transformers

@@ -60,10 +60,9 @@ dependencies:
  - networkx
  - numpy==1.26.0
  - onnx==1.15.0
  - openvino==2024.1.0
  - openvino-telemetry==2024.1.0
  - optimum[openvino]==1.19.1
  - optimum-intel==1.16.1
  - openvino==2024.0.0
  - openvino-telemetry==2023.2.1
  - optimum[openvino]==1.17.1
  - packaging==23.2
  - pandas
  - peft==0.5.0
@@ -112,7 +111,5 @@ dependencies:
  - vllm>=0.4.0
  - transformers>=4.38.2 # Updated Version
  - transformers_stream_generator==0.0.5
  - xformers==0.0.23.post1
  - rerankers[transformers]
  - pydantic
  - xformers==0.0.23.post1
prefix: /opt/conda/envs/transformers

@@ -61,5 +61,4 @@ dependencies:
  - urllib3==2.0.6
  - zipp==3.17.0
  - torch
  - opencv-python
prefix: /opt/conda/envs/diffusers

@@ -71,5 +71,4 @@ dependencies:
  - typing-extensions==4.8.0
  - urllib3==2.0.6
  - zipp==3.17.0
  - opencv-python
prefix: /opt/conda/envs/diffusers
@@ -1,27 +0,0 @@
.PHONY: rerankers
rerankers: protogen
    $(MAKE) -C ../common-env/transformers


.PHONY: run
run: protogen
    @echo "Running rerankers..."
    bash run.sh
    @echo "rerankers run."

# It is not working well by using command line. It only works with IDE like VSCode.
.PHONY: test
test: protogen
    @echo "Testing rerankers..."
    bash test.sh
    @echo "rerankers tested."

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
    $(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
    python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
@@ -1,5 +0,0 @@
# Creating a separate environment for the reranker project

```
make reranker
```
@@ -1,123 +0,0 @@
#!/usr/bin/env python3
"""
Extra gRPC server for Rerankers models.
"""
from concurrent import futures

import argparse
import signal
import sys
import os

import time
import backend_pb2
import backend_pb2_grpc

import grpc

from rerankers import Reranker

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer for the backend service.

    This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
    """
    def Health(self, request, context):
        """
        A gRPC method that returns the health status of the backend service.

        Args:
            request: A HealthRequest object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.

        Returns:
            A Reply object that contains the health status of the backend service.
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """
        A gRPC method that loads a model into memory.

        Args:
            request: A LoadModelRequest object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.

        Returns:
            A Result object that contains the result of the LoadModel operation.
        """
        model_name = request.Model
        try:
            kwargs = {}
            if request.Type != "":
                kwargs['model_type'] = request.Type
            if request.PipelineType != "": # Reuse the PipelineType field for language
                kwargs['lang'] = request.PipelineType
            self.model_name = model_name
            self.model = Reranker(model_name, **kwargs)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        # Implement your logic here for the LoadModel service
        # Replace this with your desired response
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Rerank(self, request, context):
        documents = []
        for idx, doc in enumerate(request.documents):
            documents.append(doc)
        ranked_results=self.model.rank(query=request.query, docs=documents, doc_ids=list(range(len(request.documents))))
        # Prepare results to return
        results = [
            backend_pb2.DocumentResult(
                index=res.doc_id,
                text=res.text,
                relevance_score=res.score
            ) for res in ranked_results.results
        ]

        # Calculate the usage and total tokens
        # TODO: Implement the usage calculation with reranker
        total_tokens = sum(len(doc.split()) for doc in request.documents) + len(request.query.split())
        prompt_tokens = len(request.query.split())
        usage = backend_pb2.Usage(total_tokens=total_tokens, prompt_tokens=prompt_tokens)
        return backend_pb2.RerankResult(usage=usage, results=results)

def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    serve(args.addr)
@@ -1,14 +0,0 @@
#!/bin/bash

##
## A bash script wrapper that runs the reranker server with conda

export PATH=$PATH:/opt/conda/bin

# Activate conda environment
source activate transformers

# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

python $DIR/reranker.py $@

@@ -1,11 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the reranker server with conda

# Activate conda environment
source activate transformers

# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

python -m unittest $DIR/test_reranker.py
@@ -1,90 +0,0 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc

import grpc


class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service
    """
    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "reranker.py", "--addr", "localhost:50051"])
        time.sleep(10)

    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server
        """
        self.service.kill()
        self.service.wait()

    def test_server_startup(self):
        """
        This method tests if the server starts up successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
        finally:
            self.tearDown()

    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
        finally:
            self.tearDown()

    def test_rerank(self):
        """
        This method tests if the embeddings are generated successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                request = backend_pb2.RerankRequest(
                    query="I love you",
                    documents=["I hate you", "I really like you"],
                    top_n=2
                )
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="cross-encoder"))
                self.assertTrue(response.success)

                rerank_response = stub.Rerank(request)
                print(rerank_response.results[0])
                self.assertIsNotNone(rerank_response.results)
                self.assertEqual(len(rerank_response.results), 2)
                self.assertEqual(rerank_response.results[0].text, "I really like you")
                self.assertEqual(rerank_response.results[1].text, "I hate you")
        except Exception as err:
            print(err)
            self.fail("Reranker service failed")
        finally:
            self.tearDown()
@@ -148,9 +148,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
else:
    device_map="CPU"
self.model = OVModelForCausalLM.from_pretrained(model_name,
    compile=True,
    trust_remote_code=request.TrustRemoteCode,
    ov_config={"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"},
    compile=True,
    device=device_map)
self.OV = True
else:
@@ -160,11 +158,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    quantization_config=quantization,
    device_map=device_map,
    torch_dtype=compute)
if request.ContextSize > 0:
    self.max_tokens = request.ContextSize
else:
    self.max_tokens = self.model.config.max_position_embeddings

self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
self.XPU = False

@@ -219,27 +212,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
set_seed(request.Seed)
if request.TopP == 0:
    request.TopP = 0.9

if request.TopK == 0:
    request.TopK = 40

prompt = request.Prompt
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
    prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)

eos_token_id = self.tokenizer.eos_token_id
if request.StopPrompts:
    eos_token_id = []
    for word in request.StopPrompts:
        eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))

inputs = self.tokenizer(prompt, return_tensors="pt")

max_tokens = 200
if request.Tokens > 0:
    max_tokens = request.Tokens
else:
    max_tokens = self.max_tokens - inputs["input_ids"].size()[inputs["input_ids"].dim()-1]

inputs = self.tokenizer(request.Prompt, return_tensors="pt")
if self.CUDA:
    inputs = inputs.to("cuda")
if XPU and self.OV == False:
@@ -257,7 +235,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    top_k=request.TopK,
    do_sample=True,
    attention_mask=inputs["attention_mask"],
    eos_token_id=eos_token_id,
    eos_token_id=self.tokenizer.eos_token_id,
    pad_token_id=self.tokenizer.eos_token_id,
    streamer=streamer)
thread=Thread(target=self.model.generate, kwargs=config)
@@ -286,7 +264,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
    top_k=request.TopK,
    do_sample=True,
    attention_mask=inputs["attention_mask"],
    eos_token_id=eos_token_id,
    eos_token_id=self.tokenizer.eos_token_id,
    pad_token_id=self.tokenizer.eos_token_id)
generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]

@@ -95,8 +95,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
engine_args.trust_remote_code = request.TrustRemoteCode
if request.EnforceEager:
    engine_args.enforce_eager = request.EnforceEager
if request.TensorParallelSize:
    engine_args.tensor_parallel_size = request.TensorParallelSize
if request.SwapSpace != 0:
    engine_args.swap_space = request.SwapSpace
if request.MaxModelLen != 0:
@@ -2,14 +2,100 @@ package backend

import (
	"fmt"
	"time"

	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/schema"
	"github.com/google/uuid"

	"github.com/go-skynet/LocalAI/pkg/concurrency"
	"github.com/go-skynet/LocalAI/pkg/grpc"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/model"
)

func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
type EmbeddingsBackendService struct {
	ml *model.ModelLoader
	bcl *config.BackendConfigLoader
	appConfig *config.ApplicationConfig
}

func NewEmbeddingsBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *EmbeddingsBackendService {
	return &EmbeddingsBackendService{
		ml: ml,
		bcl: bcl,
		appConfig: appConfig,
	}
}

func (ebs *EmbeddingsBackendService) Embeddings(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.OpenAIResponse] {

	resultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
	go func(request *schema.OpenAIRequest) {
		if request.Model == "" {
			request.Model = model.StableDiffusionBackend
		}

		bc, request, err := ebs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, ebs.appConfig)
		if err != nil {
			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
			close(resultChannel)
			return
		}

		items := []schema.Item{}

		for i, s := range bc.InputToken {
			// get the model function to call for the result
			embedFn, err := modelEmbedding("", s, ebs.ml, bc, ebs.appConfig)
			if err != nil {
				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
				close(resultChannel)
				return
			}

			embeddings, err := embedFn()
			if err != nil {
				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
				close(resultChannel)
				return
			}
			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
		}

		for i, s := range bc.InputStrings {
			// get the model function to call for the result
			embedFn, err := modelEmbedding(s, []int{}, ebs.ml, bc, ebs.appConfig)
			if err != nil {
				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
				close(resultChannel)
				return
			}

			embeddings, err := embedFn()
			if err != nil {
				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
				close(resultChannel)
				return
			}
			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
		}

		id := uuid.New().String()
		created := int(time.Now().Unix())
		resp := &schema.OpenAIResponse{
			ID: id,
			Created: created,
			Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
			Data: items,
			Object: "list",
		}
		resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: resp}
		close(resultChannel)
	}(request)
	return resultChannel
}

func modelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
	modelFile := backendConfig.Model

	grpcOpts := gRPCModelOpts(backendConfig)
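The new embeddings service above follows the same channel-based convention used throughout this changeset: the method returns a `<-chan concurrency.ErrorOr[...]`, sends exactly one value, then closes the channel. The following is a minimal sketch (not part of the diff) of how calling code might consume it, mirroring the one-shot receive pattern used by the transcription CLI later in this comparison; the helper name is hypothetical.

```go
// consumeEmbeddings shows the single-receive pattern: take one ErrorOr value
// off the channel, then branch on Error vs Value. The request is assumed to
// be already populated by the caller.
func consumeEmbeddings(ebs *backend.EmbeddingsBackendService, req *schema.OpenAIRequest) (*schema.OpenAIResponse, error) {
	r := <-ebs.Embeddings(req) // the service closes the channel after one send
	if r.Error != nil {
		return nil, r.Error
	}
	return r.Value, nil
}
```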
@@ -1,18 +1,252 @@
package backend

import (
	"github.com/go-skynet/LocalAI/core/config"
	"bufio"
	"encoding/base64"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/schema"
	"github.com/google/uuid"
	"github.com/rs/zerolog/log"

	"github.com/go-skynet/LocalAI/pkg/concurrency"
	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/model"
)

func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
type ImageGenerationBackendService struct {
	ml *model.ModelLoader
	bcl *config.BackendConfigLoader
	appConfig *config.ApplicationConfig
	BaseUrlForGeneratedImages string
}

func NewImageGenerationBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *ImageGenerationBackendService {
	return &ImageGenerationBackendService{
		ml: ml,
		bcl: bcl,
		appConfig: appConfig,
	}
}

func (igbs *ImageGenerationBackendService) GenerateImage(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.OpenAIResponse] {
	resultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
	go func(request *schema.OpenAIRequest) {
		bc, request, err := igbs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, igbs.appConfig)
		if err != nil {
			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
			close(resultChannel)
			return
		}

		src := ""
		if request.File != "" {

			var fileData []byte
			// check if input.File is an URL, if so download it and save it
			// to a temporary file
			if strings.HasPrefix(request.File, "http://") || strings.HasPrefix(request.File, "https://") {
				out, err := downloadFile(request.File)
				if err != nil {
					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("failed downloading file:%w", err)}
					close(resultChannel)
					return
				}
				defer os.RemoveAll(out)

				fileData, err = os.ReadFile(out)
				if err != nil {
					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("failed reading file:%w", err)}
					close(resultChannel)
					return
				}

			} else {
				// base 64 decode the file and write it somewhere
				// that we will cleanup
				fileData, err = base64.StdEncoding.DecodeString(request.File)
				if err != nil {
					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
					close(resultChannel)
					return
				}
			}

			// Create a temporary file
			outputFile, err := os.CreateTemp(igbs.appConfig.ImageDir, "b64")
			if err != nil {
				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
				close(resultChannel)
				return
			}
			// write the base64 result
			writer := bufio.NewWriter(outputFile)
			_, err = writer.Write(fileData)
			if err != nil {
				outputFile.Close()
				resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
				close(resultChannel)
				return
			}
			outputFile.Close()
			src = outputFile.Name()
			defer os.RemoveAll(src)
		}

		log.Debug().Msgf("Parameter Config: %+v", bc)

		switch bc.Backend {
		case "stablediffusion":
			bc.Backend = model.StableDiffusionBackend
		case "tinydream":
			bc.Backend = model.TinyDreamBackend
		case "":
			bc.Backend = model.StableDiffusionBackend
			if bc.Model == "" {
				bc.Model = "stablediffusion_assets" // TODO: check?
			}
		}

		sizeParts := strings.Split(request.Size, "x")
		if len(sizeParts) != 2 {
			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
			close(resultChannel)
			return
		}
		width, err := strconv.Atoi(sizeParts[0])
		if err != nil {
			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
			close(resultChannel)
			return
		}
		height, err := strconv.Atoi(sizeParts[1])
		if err != nil {
			resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: fmt.Errorf("invalid value for 'size'")}
			close(resultChannel)
			return
		}

		b64JSON := false
		if request.ResponseFormat.Type == "b64_json" {
			b64JSON = true
		}
		// src and clip_skip
		var result []schema.Item
		for _, i := range bc.PromptStrings {
			n := request.N
			if request.N == 0 {
				n = 1
			}
			for j := 0; j < n; j++ {
				prompts := strings.Split(i, "|")
				positive_prompt := prompts[0]
				negative_prompt := ""
				if len(prompts) > 1 {
					negative_prompt = prompts[1]
				}

				mode := 0
				step := bc.Step
				if step == 0 {
					step = 15
				}

				if request.Mode != 0 {
					mode = request.Mode
				}

				if request.Step != 0 {
					step = request.Step
				}

				tempDir := ""
				if !b64JSON {
					tempDir = igbs.appConfig.ImageDir
				}
				// Create a temporary file
				outputFile, err := os.CreateTemp(tempDir, "b64")
				if err != nil {
					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
					close(resultChannel)
					return
				}
				outputFile.Close()
				output := outputFile.Name() + ".png"
				// Rename the temporary file
				err = os.Rename(outputFile.Name(), output)
				if err != nil {
					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
					close(resultChannel)
					return
				}

				if request.Seed == nil {
					zVal := 0 // Idiomatic way to do this? Actually needed?
					request.Seed = &zVal
				}

				fn, err := imageGeneration(height, width, mode, step, *request.Seed, positive_prompt, negative_prompt, src, output, igbs.ml, bc, igbs.appConfig)
				if err != nil {
					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
					close(resultChannel)
					return
				}
				if err := fn(); err != nil {
					resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
					close(resultChannel)
					return
				}

				item := &schema.Item{}

				if b64JSON {
					defer os.RemoveAll(output)
					data, err := os.ReadFile(output)
					if err != nil {
						resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: err}
						close(resultChannel)
						return
					}
					item.B64JSON = base64.StdEncoding.EncodeToString(data)
				} else {
					base := filepath.Base(output)
					item.URL = igbs.BaseUrlForGeneratedImages + base
				}

				result = append(result, *item)
			}
		}

		id := uuid.New().String()
		created := int(time.Now().Unix())
		resp := &schema.OpenAIResponse{
			ID: id,
			Created: created,
			Data: result,
		}
		resultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: resp}
		close(resultChannel)
	}(request)
	return resultChannel
}

func imageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {

	threads := backendConfig.Threads
	if *threads == 0 && appConfig.Threads != 0 {
		threads = &appConfig.Threads
	}

	gRPCOpts := gRPCModelOpts(backendConfig)

	opts := modelOpts(backendConfig, appConfig, []model.Option{
		model.WithBackendString(backendConfig.Backend),
		model.WithAssetDir(appConfig.AssetsDestination),
@@ -50,3 +284,24 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat

	return fn, nil
}

// TODO: Replace this function with pkg/downloader - no reason to have a (crappier) bespoke download file fn here, but get things working before that change.
func downloadFile(url string) (string, error) {
	// Get the data
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Create the file
	out, err := os.CreateTemp("", "image")
	if err != nil {
		return "", err
	}
	defer out.Close()

	// Write the body to file
	_, err = io.Copy(out, resp.Body)
	return out.Name(), err
}
@@ -11,17 +11,22 @@ import (

	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/schema"
	"github.com/rs/zerolog/log"

	"github.com/go-skynet/LocalAI/pkg/concurrency"
	"github.com/go-skynet/LocalAI/pkg/gallery"
	"github.com/go-skynet/LocalAI/pkg/grpc"
	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/utils"
)

type LLMResponse struct {
	Response string // should this be []byte?
	Usage TokenUsage
type LLMRequest struct {
	Id int // TODO Remove if not used.
	Text string
	Images []string
	RawMessages []schema.Message
	// TODO: Other Modalities?
}

type TokenUsage struct {
@@ -29,57 +34,94 @@ type TokenUsage struct {
	Completion int
}

func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
	modelFile := c.Model
	threads := c.Threads
	if *threads == 0 && o.Threads != 0 {
		threads = &o.Threads
type LLMResponse struct {
	Request *LLMRequest
	Response string // should this be []byte?
	Usage TokenUsage
}

// TODO: Does this belong here or in core/services/openai.go?
type LLMResponseBundle struct {
	Request *schema.OpenAIRequest
	Response []schema.Choice
	Usage TokenUsage
}

type LLMBackendService struct {
	bcl *config.BackendConfigLoader
	ml *model.ModelLoader
	appConfig *config.ApplicationConfig
	ftMutex sync.Mutex
	cutstrings map[string]*regexp.Regexp
}

func NewLLMBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *LLMBackendService {
	return &LLMBackendService{
		bcl: bcl,
		ml: ml,
		appConfig: appConfig,
		ftMutex: sync.Mutex{},
		cutstrings: make(map[string]*regexp.Regexp),
	}
	grpcOpts := gRPCModelOpts(c)
}

// TODO: Should ctx param be removed and replaced with hardcoded req.Context?
func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest, bc *config.BackendConfig, enableTokenChannel bool) (
	resultChannel <-chan concurrency.ErrorOr[*LLMResponse], tokenChannel <-chan concurrency.ErrorOr[*LLMResponse], err error) {

	threads := bc.Threads
	if (threads == nil || *threads == 0) && llmbs.appConfig.Threads != 0 {
		threads = &llmbs.appConfig.Threads
	}

	grpcOpts := gRPCModelOpts(bc)

	var inferenceModel grpc.Backend
	var err error

	opts := modelOpts(c, o, []model.Option{
	opts := modelOpts(bc, llmbs.appConfig, []model.Option{
		model.WithLoadGRPCLoadModelOpts(grpcOpts),
		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
		model.WithAssetDir(o.AssetsDestination),
		model.WithModel(modelFile),
		model.WithContext(o.Context),
		model.WithAssetDir(llmbs.appConfig.AssetsDestination),
		model.WithModel(bc.Model),
		model.WithContext(llmbs.appConfig.Context),
	})

	if c.Backend != "" {
		opts = append(opts, model.WithBackendString(c.Backend))
	if bc.Backend != "" {
		opts = append(opts, model.WithBackendString(bc.Backend))
	}

	// Check if the modelFile exists, if it doesn't try to load it from the gallery
	if o.AutoloadGalleries { // experimental
		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
	// Check if bc.Model exists, if it doesn't try to load it from the gallery
	if llmbs.appConfig.AutoloadGalleries { // experimental
		if _, err := os.Stat(bc.Model); os.IsNotExist(err) {
			utils.ResetDownloadTimers()
			// if we failed to load the model, we try to download it
			err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
			err := gallery.InstallModelFromGalleryByName(llmbs.appConfig.Galleries, bc.Model, llmbs.appConfig.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
			if err != nil {
				return nil, err
				return nil, nil, err
			}
		}
	}

	if c.Backend == "" {
		inferenceModel, err = loader.GreedyLoader(opts...)
	if bc.Backend == "" {
		log.Debug().Msgf("backend not known for %q, falling back to greedy loader to find it", bc.Model)
		inferenceModel, err = llmbs.ml.GreedyLoader(opts...)
	} else {
		inferenceModel, err = loader.BackendLoader(opts...)
		inferenceModel, err = llmbs.ml.BackendLoader(opts...)
	}

	if err != nil {
		return nil, err
		log.Error().Err(err).Msg("[llmbs.Inference] failed to load a backend")
		return
	}

	var protoMessages []*proto.Message
	// if we are using the tokenizer template, we need to convert the messages to proto messages
	// unless the prompt has already been tokenized (non-chat endpoints + functions)
	if c.TemplateConfig.UseTokenizerTemplate && s == "" {
		protoMessages = make([]*proto.Message, len(messages), len(messages))
		for i, message := range messages {
	grpcPredOpts := gRPCPredictOpts(bc, llmbs.appConfig.ModelPath)
	grpcPredOpts.Prompt = req.Text
	grpcPredOpts.Images = req.Images

	if bc.TemplateConfig.UseTokenizerTemplate && req.Text == "" {
		grpcPredOpts.UseTokenizerTemplate = true
		protoMessages := make([]*proto.Message, len(req.RawMessages), len(req.RawMessages))
		for i, message := range req.RawMessages {
			protoMessages[i] = &proto.Message{
				Role: message.Role,
			}
@@ -87,47 +129,32 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
			case string:
				protoMessages[i].Content = ct
			default:
				return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
				err = fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
				return
			}
		}
	}

	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
	fn := func() (LLMResponse, error) {
		opts := gRPCPredictOpts(c, loader.ModelPath)
		opts.Prompt = s
		opts.Messages = protoMessages
		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
		opts.Images = images
	tokenUsage := TokenUsage{}

		tokenUsage := TokenUsage{}
	promptInfo, pErr := inferenceModel.TokenizeString(ctx, grpcPredOpts)
	if pErr == nil && promptInfo.Length > 0 {
		tokenUsage.Prompt = int(promptInfo.Length)
	}

		// check the per-model feature flag for usage, since tokenCallback may have a cost.
		// Defaults to off as for now it is still experimental
		if c.FeatureFlag.Enabled("usage") {
			userTokenCallback := tokenCallback
			if userTokenCallback == nil {
				userTokenCallback = func(token string, usage TokenUsage) bool {
					return true
				}
			}
	rawResultChannel := make(chan concurrency.ErrorOr[*LLMResponse])
	// TODO this next line is the biggest argument for taking named return values _back_ out!!!
	var rawTokenChannel chan concurrency.ErrorOr[*LLMResponse]

			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
			if pErr == nil && promptInfo.Length > 0 {
				tokenUsage.Prompt = int(promptInfo.Length)
			}
	if enableTokenChannel {
		rawTokenChannel = make(chan concurrency.ErrorOr[*LLMResponse])

			tokenCallback = func(token string, usage TokenUsage) bool {
				tokenUsage.Completion++
				return userTokenCallback(token, tokenUsage)
			}
		}

		if tokenCallback != nil {
			ss := ""
		// TODO Needs better name
		ss := ""

		go func() {
			var partialRune []byte
			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
			err := inferenceModel.PredictStream(ctx, grpcPredOpts, func(chars []byte) {
				partialRune = append(partialRune, chars...)

				for len(partialRune) > 0 {
@@ -137,54 +164,126 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
						break
					}

					tokenCallback(string(r), tokenUsage)
					tokenUsage.Completion++
					rawTokenChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
						Response: string(r),
						Usage: tokenUsage,
					}}

					ss += string(r)

					partialRune = partialRune[size:]
				}
			})
			return LLMResponse{
				Response: ss,
				Usage: tokenUsage,
			}, err
		} else {
			// TODO: Is the chicken bit the only way to get here? is that acceptable?
			reply, err := inferenceModel.Predict(ctx, opts)
			close(rawTokenChannel)
			if err != nil {
				return LLMResponse{}, err
				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
			} else {
				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
					Response: ss,
					Usage: tokenUsage,
				}}
			}
			close(rawResultChannel)
		}()
	} else {
		go func() {
			reply, err := inferenceModel.Predict(ctx, grpcPredOpts)
			if tokenUsage.Prompt == 0 {
				tokenUsage.Prompt = int(reply.PromptTokens)
			}
			if tokenUsage.Completion == 0 {
				tokenUsage.Completion = int(reply.Tokens)
			}
			return LLMResponse{
				Response: string(reply.Message),
				Usage: tokenUsage,
			}, err
		}
			if err != nil {
				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
				close(rawResultChannel)
			} else {
				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
					Response: string(reply.Message),
					Usage: tokenUsage,
				}}
				close(rawResultChannel)
			}
		}()
	}

	return fn, nil
	resultChannel = rawResultChannel
	tokenChannel = rawTokenChannel
	return
}

var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
var mu sync.Mutex = sync.Mutex{}
// TODO: Should predInput be a seperate param still, or should this fn handle extracting it from request??
func (llmbs *LLMBackendService) GenerateText(predInput string, request *schema.OpenAIRequest, bc *config.BackendConfig,
	mappingFn func(*LLMResponse) schema.Choice, enableCompletionChannels bool, enableTokenChannels bool) (
	// Returns:
	resultChannel <-chan concurrency.ErrorOr[*LLMResponseBundle], completionChannels []<-chan concurrency.ErrorOr[*LLMResponse], tokenChannels []<-chan concurrency.ErrorOr[*LLMResponse], err error) {

func Finetune(config config.BackendConfig, input, prediction string) string {
	rawChannel := make(chan concurrency.ErrorOr[*LLMResponseBundle])
	resultChannel = rawChannel

	if request.N == 0 { // number of completions to return
		request.N = 1
	}
	images := []string{}
	for _, m := range request.Messages {
		images = append(images, m.StringImages...)
	}

	for i := 0; i < request.N; i++ {

		individualResultChannel, tokenChannel, infErr := llmbs.Inference(request.Context, &LLMRequest{
			Text: predInput,
			Images: images,
			RawMessages: request.Messages,
		}, bc, enableTokenChannels)
		if infErr != nil {
			err = infErr // Avoids complaints about redeclaring err but looks dumb
			return
		}
		completionChannels = append(completionChannels, individualResultChannel)
		tokenChannels = append(tokenChannels, tokenChannel)
	}

	go func() {
		initialBundle := LLMResponseBundle{
			Request: request,
			Response: []schema.Choice{},
			Usage: TokenUsage{},
		}

		wg := concurrency.SliceOfChannelsReducer(completionChannels, rawChannel, func(iv concurrency.ErrorOr[*LLMResponse], ov concurrency.ErrorOr[*LLMResponseBundle]) concurrency.ErrorOr[*LLMResponseBundle] {
			if iv.Error != nil {
				ov.Error = iv.Error
				// TODO: Decide if we should wipe partials or not?
				return ov
			}
			ov.Value.Usage.Prompt += iv.Value.Usage.Prompt
			ov.Value.Usage.Completion += iv.Value.Usage.Completion

			ov.Value.Response = append(ov.Value.Response, mappingFn(iv.Value))
			return ov
		}, concurrency.ErrorOr[*LLMResponseBundle]{Value: &initialBundle}, true)
		wg.Wait()

	}()

	return
}

func (llmbs *LLMBackendService) Finetune(config config.BackendConfig, input, prediction string) string {
	if config.Echo {
		prediction = input + prediction
	}

	for _, c := range config.Cutstrings {
		mu.Lock()
		reg, ok := cutstrings[c]
		llmbs.ftMutex.Lock()
		reg, ok := llmbs.cutstrings[c]
		if !ok {
			cutstrings[c] = regexp.MustCompile(c)
			reg = cutstrings[c]
			llmbs.cutstrings[c] = regexp.MustCompile(c)
			reg = llmbs.cutstrings[c]
		}
		mu.Unlock()
		llmbs.ftMutex.Unlock()
		prediction = reg.ReplaceAllString(prediction, "")
	}
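GenerateText above fans out one Inference call per requested completion and reduces the per-completion channels into a single LLMResponseBundle channel. The rough sketch below (not part of the diff; field and function names are taken from the hunks above and are not verified against the final code) shows one way a caller might collect just the aggregated bundle while ignoring the per-completion and token channels.

```go
// generateOnce is an illustrative sketch, assuming the LLMBackendService API
// shown in the diff above: it maps each LLMResponse to a schema.Choice,
// waits for the reduced bundle, and returns its choices.
func generateOnce(llmbs *backend.LLMBackendService, prompt string, req *schema.OpenAIRequest, bc *config.BackendConfig) ([]schema.Choice, error) {
	results, _, _, err := llmbs.GenerateText(prompt, req, bc,
		func(r *backend.LLMResponse) schema.Choice {
			return schema.Choice{Text: r.Response} // hypothetical mapping
		}, true, false)
	if err != nil {
		return nil, err
	}
	bundle := <-results // closed after the reducer finishes
	if bundle.Error != nil {
		return nil, bundle.Error
	}
	return bundle.Value.Response, nil
}
```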
@@ -7,11 +7,10 @@ import (

	"github.com/go-skynet/LocalAI/core/config"
	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/rs/zerolog/log"
	model "github.com/go-skynet/LocalAI/pkg/model"
)

func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
func modelOpts(bc *config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
	if so.SingleBackend {
		opts = append(opts, model.WithSingleActiveBackend())
	}
@@ -20,12 +19,12 @@ func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []mode
		opts = append(opts, model.EnableParallelRequests)
	}

	if c.GRPC.Attempts != 0 {
		opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
	if bc.GRPC.Attempts != 0 {
		opts = append(opts, model.WithGRPCAttempts(bc.GRPC.Attempts))
	}

	if c.GRPC.AttemptsSleepTime != 0 {
		opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
	if bc.GRPC.AttemptsSleepTime != 0 {
		opts = append(opts, model.WithGRPCAttemptsDelay(bc.GRPC.AttemptsSleepTime))
	}

	for k, v := range so.ExternalGRPCBackends {
@@ -35,7 +34,7 @@ func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []mode
	return opts
}

func getSeed(c config.BackendConfig) int32 {
func getSeed(c *config.BackendConfig) int32 {
	seed := int32(*c.Seed)
	if seed == config.RAND_SEED {
		seed = rand.Int31()
@@ -44,7 +43,7 @@ func getSeed(c config.BackendConfig) int32 {
	return seed
}

func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
func gRPCModelOpts(c *config.BackendConfig) *pb.ModelOptions {
	b := 512
	if c.Batch != 0 {
		b = c.Batch
@@ -75,7 +74,6 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
		EnforceEager: c.EnforceEager,
		SwapSpace: int32(c.SwapSpace),
		MaxModelLen: int32(c.MaxModelLen),
		TensorParallelSize: int32(c.TensorParallelSize),
		MMProj: c.MMProj,
		YarnExtFactor: c.YarnExtFactor,
		YarnAttnFactor: c.YarnAttnFactor,
@@ -106,51 +104,47 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
	}
}

func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOptions {
func gRPCPredictOpts(bc *config.BackendConfig, modelPath string) *pb.PredictOptions {
	promptCachePath := ""
	if c.PromptCachePath != "" {
		p := filepath.Join(modelPath, c.PromptCachePath)
		err := os.MkdirAll(filepath.Dir(p), 0750)
		if err == nil {
			promptCachePath = p
		} else {
			log.Error().Err(err).Str("promptCachePath", promptCachePath).Msg("error creating prompt cache folder")
		}
	if bc.PromptCachePath != "" {
		p := filepath.Join(modelPath, bc.PromptCachePath)
		os.MkdirAll(filepath.Dir(p), 0755)
		promptCachePath = p
	}

	return &pb.PredictOptions{
		Temperature: float32(*c.Temperature),
		TopP: float32(*c.TopP),
		NDraft: c.NDraft,
		TopK: int32(*c.TopK),
		Tokens: int32(*c.Maxtokens),
		Threads: int32(*c.Threads),
		PromptCacheAll: c.PromptCacheAll,
		PromptCacheRO: c.PromptCacheRO,
		Temperature: float32(*bc.Temperature),
		TopP: float32(*bc.TopP),
		NDraft: bc.NDraft,
		TopK: int32(*bc.TopK),
		Tokens: int32(*bc.Maxtokens),
		Threads: int32(*bc.Threads),
		PromptCacheAll: bc.PromptCacheAll,
		PromptCacheRO: bc.PromptCacheRO,
		PromptCachePath: promptCachePath,
		F16KV: *c.F16,
		DebugMode: *c.Debug,
		Grammar: c.Grammar,
		NegativePromptScale: c.NegativePromptScale,
		RopeFreqBase: c.RopeFreqBase,
		RopeFreqScale: c.RopeFreqScale,
		NegativePrompt: c.NegativePrompt,
		Mirostat: int32(*c.LLMConfig.Mirostat),
		MirostatETA: float32(*c.LLMConfig.MirostatETA),
		MirostatTAU: float32(*c.LLMConfig.MirostatTAU),
		Debug: *c.Debug,
		StopPrompts: c.StopWords,
		Repeat: int32(c.RepeatPenalty),
		NKeep: int32(c.Keep),
		Batch: int32(c.Batch),
		IgnoreEOS: c.IgnoreEOS,
		Seed: getSeed(c),
		FrequencyPenalty: float32(c.FrequencyPenalty),
		MLock: *c.MMlock,
		MMap: *c.MMap,
		MainGPU: c.MainGPU,
		TensorSplit: c.TensorSplit,
		TailFreeSamplingZ: float32(*c.TFZ),
		TypicalP: float32(*c.TypicalP),
		F16KV: *bc.F16,
		DebugMode: *bc.Debug,
		Grammar: bc.Grammar,
		NegativePromptScale: bc.NegativePromptScale,
		RopeFreqBase: bc.RopeFreqBase,
		RopeFreqScale: bc.RopeFreqScale,
		NegativePrompt: bc.NegativePrompt,
		Mirostat: int32(*bc.LLMConfig.Mirostat),
		MirostatETA: float32(*bc.LLMConfig.MirostatETA),
		MirostatTAU: float32(*bc.LLMConfig.MirostatTAU),
		Debug: *bc.Debug,
		StopPrompts: bc.StopWords,
		Repeat: int32(bc.RepeatPenalty),
		NKeep: int32(bc.Keep),
		Batch: int32(bc.Batch),
		IgnoreEOS: bc.IgnoreEOS,
		Seed: getSeed(bc),
		FrequencyPenalty: float32(bc.FrequencyPenalty),
		MLock: *bc.MMlock,
		MMap: *bc.MMap,
		MainGPU: bc.MainGPU,
		TensorSplit: bc.TensorSplit,
		TailFreeSamplingZ: float32(*bc.TFZ),
		TypicalP: float32(*bc.TypicalP),
	}
}
@@ -1,39 +0,0 @@
package backend

import (
	"context"
	"fmt"

	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
	model "github.com/go-skynet/LocalAI/pkg/model"
)

func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
	bb := backend
	if bb == "" {
		return nil, fmt.Errorf("backend is required")
	}

	grpcOpts := gRPCModelOpts(backendConfig)

	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
		model.WithBackendString(bb),
		model.WithModel(modelFile),
		model.WithContext(appConfig.Context),
		model.WithAssetDir(appConfig.AssetsDestination),
		model.WithLoadGRPCLoadModelOpts(grpcOpts),
	})
	rerankModel, err := loader.BackendLoader(opts...)
	if err != nil {
		return nil, err
	}

	if rerankModel == nil {
		return nil, fmt.Errorf("could not load rerank model")
	}

	res, err := rerankModel.Rerank(context.Background(), request)

	return res, err
}
@@ -7,11 +7,48 @@ import (
	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/schema"

	"github.com/go-skynet/LocalAI/pkg/concurrency"
	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/model"
)

func ModelTranscription(audio, language string, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.Result, error) {
type TranscriptionBackendService struct {
	ml *model.ModelLoader
	bcl *config.BackendConfigLoader
	appConfig *config.ApplicationConfig
}

func NewTranscriptionBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *TranscriptionBackendService {
	return &TranscriptionBackendService{
		ml: ml,
		bcl: bcl,
		appConfig: appConfig,
	}
}

func (tbs *TranscriptionBackendService) Transcribe(request *schema.OpenAIRequest) <-chan concurrency.ErrorOr[*schema.TranscriptionResult] {
	responseChannel := make(chan concurrency.ErrorOr[*schema.TranscriptionResult])
	go func(request *schema.OpenAIRequest) {
		bc, request, err := tbs.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, tbs.appConfig)
		if err != nil {
			responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Error: fmt.Errorf("failed reading parameters from request:%w", err)}
			close(responseChannel)
			return
		}

		tr, err := modelTranscription(request.File, request.Language, tbs.ml, bc, tbs.appConfig)
		if err != nil {
			responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Error: err}
			close(responseChannel)
			return
		}
		responseChannel <- concurrency.ErrorOr[*schema.TranscriptionResult]{Value: tr}
		close(responseChannel)
	}(request)
	return responseChannel
}

func modelTranscription(audio, language string, ml *model.ModelLoader, backendConfig *config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {

	opts := modelOpts(backendConfig, appConfig, []model.Option{
		model.WithBackendString(model.WhisperBackend),
@@ -7,29 +7,60 @@ import (
	"path/filepath"

	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/schema"

	"github.com/go-skynet/LocalAI/pkg/concurrency"
	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/go-skynet/LocalAI/pkg/utils"
)

func generateUniqueFileName(dir, baseName, ext string) string {
	counter := 1
	fileName := baseName + ext
type TextToSpeechBackendService struct {
	ml *model.ModelLoader
	bcl *config.BackendConfigLoader
	appConfig *config.ApplicationConfig
}

	for {
		filePath := filepath.Join(dir, fileName)
		_, err := os.Stat(filePath)
		if os.IsNotExist(err) {
			return fileName
		}

		counter++
		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
func NewTextToSpeechBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *TextToSpeechBackendService {
	return &TextToSpeechBackendService{
		ml: ml,
		bcl: bcl,
		appConfig: appConfig,
	}
}

func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
func (ttsbs *TextToSpeechBackendService) TextToAudioFile(request *schema.TTSRequest) <-chan concurrency.ErrorOr[*string] {
	responseChannel := make(chan concurrency.ErrorOr[*string])
	go func(request *schema.TTSRequest) {
		cfg, err := ttsbs.bcl.LoadBackendConfigFileByName(request.Model, ttsbs.appConfig.ModelPath,
			config.LoadOptionDebug(ttsbs.appConfig.Debug),
			config.LoadOptionThreads(ttsbs.appConfig.Threads),
			config.LoadOptionContextSize(ttsbs.appConfig.ContextSize),
			config.LoadOptionF16(ttsbs.appConfig.F16),
		)
		if err != nil {
			responseChannel <- concurrency.ErrorOr[*string]{Error: err}
			close(responseChannel)
			return
		}

		if request.Backend != "" {
			cfg.Backend = request.Backend
		}

		outFile, _, err := modelTTS(cfg.Backend, request.Input, cfg.Model, request.Voice, ttsbs.ml, ttsbs.appConfig, cfg)
		if err != nil {
			responseChannel <- concurrency.ErrorOr[*string]{Error: err}
			close(responseChannel)
			return
		}
		responseChannel <- concurrency.ErrorOr[*string]{Value: &outFile}
		close(responseChannel)
	}(request)
	return responseChannel
}

func modelTTS(backend, text, modelFile string, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig *config.BackendConfig) (string, *proto.Result, error) {
	bb := backend
	if bb == "" {
		bb = model.PiperBackend
@@ -37,7 +68,7 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,

	grpcOpts := gRPCModelOpts(backendConfig)

	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
	opts := modelOpts(&config.BackendConfig{}, appConfig, []model.Option{
		model.WithBackendString(bb),
		model.WithModel(modelFile),
		model.WithContext(appConfig.Context),
@@ -53,7 +84,7 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
		return "", nil, fmt.Errorf("could not load piper model")
	}

	if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
	if err := os.MkdirAll(appConfig.AudioDir, 0755); err != nil {
		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
	}

@@ -87,3 +118,19 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,

	return filePath, res, err
}

func generateUniqueFileName(dir, baseName, ext string) string {
	counter := 1
	fileName := baseName + ext

	for {
		filePath := filepath.Join(dir, fileName)
		_, err := os.Stat(filePath)
		if os.IsNotExist(err) {
			return fileName
		}

		counter++
		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
	}
}
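TextToAudioFile follows the same one-shot channel convention as the other new services in this changeset: the value carried on success is a pointer to the path of the generated audio file. A minimal sketch of a caller (not part of the diff; the helper name is hypothetical, and the TTSRequest fields are taken from the hunk above):

```go
// ttsToFile sends one TTS request and returns the path of the generated file.
func ttsToFile(ttsbs *backend.TextToSpeechBackendService, modelName, text string) (string, error) {
	r := <-ttsbs.TextToAudioFile(&schema.TTSRequest{Model: modelName, Input: text})
	if r.Error != nil {
		return "", r.Error
	}
	return *r.Value, nil // path to the generated audio file
}
```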
@@ -4,7 +4,7 @@ import "embed"

type Context struct {
	Debug bool `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
	LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`
	LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug" help:"Set the level of logs to output [${enum}]"`

	// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
	BackendAssets embed.FS `kong:"-"`

@@ -25,7 +25,7 @@ type ModelsInstall struct {
}

type ModelsCMD struct {
	List ModelsList `cmd:"" help:"List the models available in your galleries" default:"withargs"`
	List ModelsList `cmd:"" help:"List the models avaiable in your galleries" default:"withargs"`
	Install ModelsInstall `cmd:"" help:"Install a model from the gallery"`
}
@@ -2,31 +2,30 @@ package cli
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/http"
|
||||
"github.com/go-skynet/LocalAI/core/startup"
|
||||
"github.com/rs/zerolog"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
type RunCMD struct {
|
||||
ModelArgs []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
|
||||
|
||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
|
||||
ImagePath string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
|
||||
AudioPath string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
|
||||
UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
|
||||
ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
|
||||
LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
|
||||
LocalaiConfigDirPollInterval time.Duration `env:"LOCALAI_CONFIG_DIR_POLL_INTERVAL" help:"Typically the config path picks up changes automatically, but if your system has broken fsnotify events, set this to an interval to poll the LocalAI Config Dir (example: 1m)" group:"storage"`
|
||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
|
||||
ImagePath string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
|
||||
AudioPath string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
|
||||
UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
|
||||
ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
|
||||
LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
|
||||
// The alias on this option is there to preserve functionality with the old `--config-file` parameter
|
||||
ModelsConfigFile string `env:"LOCALAI_MODELS_CONFIG_FILE,CONFIG_FILE" aliases:"config-file" help:"YAML file containing a list of model backend configs" group:"storage"`
|
||||
|
||||
Galleries string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
|
||||
Galleries string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models"`
|
||||
AutoloadGalleries bool `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
|
||||
RemoteLibrary string `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
|
||||
PreloadModels string `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
|
||||
@@ -61,16 +60,15 @@ func (r *RunCMD) Run(ctx *Context) error {
config.WithYAMLConfigPreload(r.PreloadModelsConfig),
config.WithModelPath(r.ModelsPath),
config.WithContextSize(r.ContextSize),
config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
config.WithDebug(*ctx.LogLevel == "debug"),
config.WithImageDir(r.ImagePath),
config.WithAudioDir(r.AudioPath),
config.WithUploadDir(r.UploadPath),
config.WithConfigsDir(r.ConfigPath),
config.WithDynamicConfigDir(r.LocalaiConfigDir),
config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
config.WithF16(r.F16),
config.WithStringGalleries(r.Galleries),
config.WithModelLibraryURL(r.RemoteLibrary),
config.WithDisableMessage(false),
config.WithCors(r.CORS),
config.WithCorsAllowOrigins(r.CORSAllowOrigins),
config.WithThreads(r.Threads),
@@ -126,16 +124,28 @@ func (r *RunCMD) Run(ctx *Context) error {
}

if r.PreloadBackendOnly {
_, _, _, err := startup.Startup(opts...)
_, err := startup.Startup(opts...)
return err
}

cl, ml, options, err := startup.Startup(opts...)
application, err := startup.Startup(opts...)

if err != nil {
return fmt.Errorf("failed basic startup tasks with error %s", err.Error())
}

appHTTP, err := http.App(cl, ml, options)
// Watch the configuration directory
// If the directory does not exist, we don't watch it
if _, err := os.Stat(r.LocalaiConfigDir); err == nil {
closeConfigWatcherFn, err := startup.WatchConfigDirectory(r.LocalaiConfigDir, application.ApplicationConfig)
defer closeConfigWatcherFn()

if err != nil {
return fmt.Errorf("failed while watching configuration directory %s", r.LocalaiConfigDir)
}
}

appHTTP, err := http.App(application)
if err != nil {
log.Error().Err(err).Msg("error during HTTP App construction")
return err
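The run command above composes its entire runtime configuration from config.With... functional options before handing them to startup. As a minimal, self-contained sketch of that pattern (the type and option names below are illustrative stand-ins, not the actual LocalAI API):

package main

import "fmt"

// appConfig is a stand-in for the application configuration struct.
type appConfig struct {
	ModelPath string
	Debug     bool
	Threads   int
}

// appOption mutates an appConfig; each with* helper returns one.
type appOption func(*appConfig)

func withModelPath(p string) appOption { return func(c *appConfig) { c.ModelPath = p } }
func withDebug(d bool) appOption       { return func(c *appConfig) { c.Debug = d } }
func withThreads(n int) appOption      { return func(c *appConfig) { c.Threads = n } }

// newAppConfig applies options over sensible defaults, mirroring how the
// startup code folds CLI flags into a single configuration value.
func newAppConfig(opts ...appOption) *appConfig {
	c := &appConfig{Threads: 1}
	for _, o := range opts {
		o(c)
	}
	return c
}

func main() {
	cfg := newAppConfig(withModelPath("./models"), withDebug(true), withThreads(4))
	fmt.Printf("%+v\n", cfg)
}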
@@ -7,6 +7,7 @@ import (

"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/model"
)

@@ -43,11 +44,21 @@ func (t *TranscriptCMD) Run(ctx *Context) error {

defer ml.StopAllGRPC()

tr, err := backend.ModelTranscription(t.Filename, t.Language, ml, c, opts)
if err != nil {
return err
tbs := backend.NewTranscriptionBackendService(ml, cl, opts)

resultChannel := tbs.Transcribe(&schema.OpenAIRequest{
PredictionOptions: schema.PredictionOptions{
Language: t.Language,
},
File: t.Filename,
})

r := <-resultChannel

if r.Error != nil {
return r.Error
}
for _, segment := range tr.Segments {
for _, segment := range r.Value.Segments {
fmt.Println(segment.Start.String(), "-", segment.Text)
}
return nil

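The transcription CLI now consumes a backend service that delivers its result over a channel carrying a value/error pair. A rough stand-alone sketch of that pattern (the types here are invented for illustration, not the real backend package):

package main

import (
	"errors"
	"fmt"
)

// result pairs a value with an error so both travel over one channel.
type result[T any] struct {
	Value T
	Error error
}

// transcribe pretends to run a transcription asynchronously and delivers
// exactly one result on the returned channel.
func transcribe(filename string) <-chan result[string] {
	out := make(chan result[string], 1)
	go func() {
		defer close(out)
		if filename == "" {
			out <- result[string]{Error: errors.New("no input file")}
			return
		}
		out <- result[string]{Value: "transcribed text for " + filename}
	}()
	return out
}

func main() {
	r := <-transcribe("audio.wav") // block until the backend answers
	if r.Error != nil {
		fmt.Println("error:", r.Error)
		return
	}
	fmt.Println(r.Value)
}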
@@ -9,6 +9,7 @@ import (

"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/model"
)

@@ -42,20 +43,29 @@ func (t *TTSCMD) Run(ctx *Context) error {

defer ml.StopAllGRPC()

options := config.BackendConfig{}
options.SetDefaults()
ttsbs := backend.NewTextToSpeechBackendService(ml, config.NewBackendConfigLoader(), opts)

filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
if err != nil {
return err
request := &schema.TTSRequest{
Model: t.Model,
Input: text,
Backend: t.Backend,
Voice: t.Voice,
}

resultsChannel := ttsbs.TextToAudioFile(request)

rawResult := <-resultsChannel

if rawResult.Error != nil {
return rawResult.Error
}
if outputFile != "" {
if err := os.Rename(filePath, outputFile); err != nil {
if err := os.Rename(*rawResult.Value, outputFile); err != nil {
return err
}
fmt.Printf("Generate file %s\n", outputFile)
fmt.Printf("Generated file %q\n", outputFile)
} else {
fmt.Printf("Generate file %s\n", filePath)
fmt.Printf("Generated file %q\n", *rawResult.Value)
}
return nil
}

@@ -17,13 +17,11 @@ type ApplicationConfig struct {
UploadLimitMB, Threads, ContextSize int
DisableWelcomePage bool
F16 bool
Debug bool
Debug, DisableMessage bool
ImageDir string
AudioDir string
UploadDir string
ConfigsDir string
DynamicConfigsDir string
DynamicConfigsDirPollInterval time.Duration
CORS bool
PreloadJSONModels string
PreloadModelsFromPath string
@@ -57,11 +55,12 @@ type AppOption func(*ApplicationConfig)

func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
opt := &ApplicationConfig{
Context: context.Background(),
UploadLimitMB: 15,
Threads: 1,
ContextSize: 512,
Debug: true,
Context: context.Background(),
UploadLimitMB: 15,
Threads: 1,
ContextSize: 512,
Debug: true,
DisableMessage: true,
}
for _, oo := range o {
oo(opt)
@@ -235,6 +234,12 @@ func WithDebug(debug bool) AppOption {
}
}

func WithDisableMessage(disableMessage bool) AppOption {
return func(o *ApplicationConfig) {
o.DisableMessage = disableMessage
}
}

func WithAudioDir(audioDir string) AppOption {
return func(o *ApplicationConfig) {
o.AudioDir = audioDir
@@ -259,18 +264,6 @@ func WithConfigsDir(configsDir string) AppOption {
}
}

func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
return func(o *ApplicationConfig) {
o.DynamicConfigsDir = dynamicConfigsDir
}
}

func WithDynamicConfigDirPollInterval(interval time.Duration) AppOption {
return func(o *ApplicationConfig) {
o.DynamicConfigsDirPollInterval = interval
}
}

func WithApiKeys(apiKeys []string) AppOption {
return func(o *ApplicationConfig) {
o.ApiKeys = apiKeys

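WithDynamicConfigDirPollInterval exists for systems where fsnotify events are unreliable: the watcher can then fall back to periodic polling of the config directory. A minimal sketch of such a poller, written independently of the actual LocalAI watcher implementation:

package main

import (
	"fmt"
	"os"
	"time"
)

// pollDir re-reads a directory on a fixed interval and reports the entry
// count; a real watcher would diff contents and reload changed files.
func pollDir(dir string, interval time.Duration, stop <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-stop:
			return
		case <-ticker.C:
			entries, err := os.ReadDir(dir)
			if err != nil {
				fmt.Println("poll error:", err)
				continue
			}
			fmt.Printf("%s: %d entries\n", dir, len(entries))
		}
	}
}

func main() {
	stop := make(chan struct{})
	go pollDir(os.TempDir(), time.Second, stop)
	time.Sleep(3 * time.Second)
	close(stop)
}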
@@ -1,23 +1,7 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/downloader"
|
||||
"github.com/go-skynet/LocalAI/pkg/functions"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
"github.com/rs/zerolog/log"
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
"github.com/charmbracelet/glamour"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -40,7 +24,7 @@ type BackendConfig struct {
|
||||
InputToken [][]int `yaml:"-"`
|
||||
functionCallString, functionCallNameString string `yaml:"-"`
|
||||
|
||||
FunctionsConfig functions.FunctionsConfig `yaml:"function"`
|
||||
FunctionsConfig Functions `yaml:"function"`
|
||||
|
||||
FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
|
||||
// LLM configs (GPT4ALL, Llama.cpp, ...)
|
||||
@@ -140,7 +124,6 @@ type LLMConfig struct {
|
||||
EnforceEager bool `yaml:"enforce_eager"` // vLLM
|
||||
SwapSpace int `yaml:"swap_space"` // vLLM
|
||||
MaxModelLen int `yaml:"max_model_len"` // vLLM
|
||||
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
|
||||
MMProj string `yaml:"mmproj"`
|
||||
|
||||
RopeScaling string `yaml:"rope_scaling"`
|
||||
@@ -159,6 +142,13 @@ type AutoGPTQ struct {
|
||||
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
|
||||
}
|
||||
|
||||
type Functions struct {
|
||||
DisableNoAction bool `yaml:"disable_no_action"`
|
||||
NoActionFunctionName string `yaml:"no_action_function_name"`
|
||||
NoActionDescriptionName string `yaml:"no_action_description_name"`
|
||||
ParallelCalls bool `yaml:"parallel_calls"`
|
||||
}
|
||||
|
||||
type TemplateConfig struct {
|
||||
Chat string `yaml:"chat"`
|
||||
ChatMessage string `yaml:"chat_message"`
|
||||
@@ -194,7 +184,7 @@ func (c *BackendConfig) FunctionToCall() string {
|
||||
}
|
||||
|
||||
func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
lo := &LoadOptions{}
|
||||
lo := &ConfigLoaderOptions{}
|
||||
lo.Apply(opts...)
|
||||
|
||||
ctx := lo.ctxSize
|
||||
@@ -205,15 +195,15 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
defaultTopP := 0.95
|
||||
defaultTopK := 40
|
||||
defaultTemp := 0.9
|
||||
defaultMaxTokens := 2048
|
||||
defaultMirostat := 2
|
||||
defaultMirostatTAU := 5.0
|
||||
defaultMirostatETA := 0.1
|
||||
defaultTypicalP := 1.0
|
||||
defaultTFZ := 1.0
|
||||
defaultZero := 0
|
||||
|
||||
// Try to offload all GPU layers (if GPU is found)
|
||||
defaultHigh := 99999999
|
||||
defaultNGPULayers := 99999999
|
||||
|
||||
trueV := true
|
||||
falseV := false
|
||||
@@ -254,7 +244,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
}
|
||||
|
||||
if cfg.Maxtokens == nil {
|
||||
cfg.Maxtokens = &defaultZero
|
||||
cfg.Maxtokens = &defaultMaxTokens
|
||||
}
|
||||
|
||||
if cfg.Mirostat == nil {
|
||||
@@ -269,7 +259,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
cfg.MirostatTAU = &defaultMirostatTAU
|
||||
}
|
||||
if cfg.NGPULayers == nil {
|
||||
cfg.NGPULayers = &defaultHigh
|
||||
cfg.NGPULayers = &defaultNGPULayers
|
||||
}
|
||||
|
||||
if cfg.LowVRAM == nil {
|
||||
@@ -307,287 +297,3 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
cfg.Debug = &trueV
|
||||
}
|
||||
}
|
||||
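SetDefaults above only assigns a default when a field is still nil, which is why most tunables are pointers: an absent YAML key stays nil and receives the default, while an explicit zero from the user survives. A small standalone illustration of that convention (this is not the real BackendConfig, just the nil-means-unset idea):

package main

import "fmt"

// modelConfig uses pointers so "unset" (nil) is distinguishable from an
// explicit zero provided by the user.
type modelConfig struct {
	TopK      *int
	Maxtokens *int
}

func (c *modelConfig) setDefaults() {
	defaultTopK := 40
	defaultMaxTokens := 2048
	if c.TopK == nil {
		c.TopK = &defaultTopK
	}
	if c.Maxtokens == nil {
		c.Maxtokens = &defaultMaxTokens
	}
}

func main() {
	zero := 0
	explicit := modelConfig{Maxtokens: &zero} // the user really wants 0
	explicit.setDefaults()
	fmt.Println(*explicit.TopK, *explicit.Maxtokens) // 40 0

	var implicit modelConfig // nothing set: both defaults apply
	implicit.setDefaults()
	fmt.Println(*implicit.TopK, *implicit.Maxtokens) // 40 2048
}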
|
||||
////// Config Loader ////////
|
||||
|
||||
type BackendConfigLoader struct {
|
||||
configs map[string]BackendConfig
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
type LoadOptions struct {
|
||||
debug bool
|
||||
threads, ctxSize int
|
||||
f16 bool
|
||||
}
|
||||
|
||||
func LoadOptionDebug(debug bool) ConfigLoaderOption {
|
||||
return func(o *LoadOptions) {
|
||||
o.debug = debug
|
||||
}
|
||||
}
|
||||
|
||||
func LoadOptionThreads(threads int) ConfigLoaderOption {
|
||||
return func(o *LoadOptions) {
|
||||
o.threads = threads
|
||||
}
|
||||
}
|
||||
|
||||
func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
|
||||
return func(o *LoadOptions) {
|
||||
o.ctxSize = ctxSize
|
||||
}
|
||||
}
|
||||
|
||||
func LoadOptionF16(f16 bool) ConfigLoaderOption {
|
||||
return func(o *LoadOptions) {
|
||||
o.f16 = f16
|
||||
}
|
||||
}
|
||||
|
||||
type ConfigLoaderOption func(*LoadOptions)
|
||||
|
||||
func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
|
||||
for _, l := range options {
|
||||
l(lo)
|
||||
}
|
||||
}
|
||||
|
||||
// Load a config file for a model
|
||||
func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
|
||||
|
||||
// Load a config file if present after the model name
|
||||
cfg := &BackendConfig{
|
||||
PredictionOptions: schema.PredictionOptions{
|
||||
Model: modelName,
|
||||
},
|
||||
}
|
||||
|
||||
cfgExisting, exists := cl.GetBackendConfig(modelName)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
} else {
|
||||
// Try loading a model config file
|
||||
modelConfig := filepath.Join(modelPath, modelName+".yaml")
|
||||
if _, err := os.Stat(modelConfig); err == nil {
|
||||
if err := cl.LoadBackendConfig(
|
||||
modelConfig, opts...,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
|
||||
}
|
||||
cfgExisting, exists = cl.GetBackendConfig(modelName)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg.SetDefaults(opts...)
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func NewBackendConfigLoader() *BackendConfigLoader {
|
||||
return &BackendConfigLoader{
|
||||
configs: make(map[string]BackendConfig),
|
||||
}
|
||||
}
|
||||
func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
|
||||
c := &[]*BackendConfig{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
if err := yaml.Unmarshal(f, c); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
|
||||
}
|
||||
|
||||
for _, cc := range *c {
|
||||
cc.SetDefaults(opts...)
|
||||
}
|
||||
|
||||
return *c, nil
|
||||
}
|
||||
|
||||
func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
|
||||
lo := &LoadOptions{}
|
||||
lo.Apply(opts...)
|
||||
|
||||
c := &BackendConfig{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
if err := yaml.Unmarshal(f, c); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
|
||||
}
|
||||
|
||||
c.SetDefaults(opts...)
|
||||
return c, nil
|
||||
}
|
||||
|
||||
func (cm *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
c, err := ReadBackendConfigFile(file, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot load config file: %w", err)
|
||||
}
|
||||
|
||||
for _, cc := range c {
|
||||
cm.configs[cc.Name] = *cc
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
|
||||
cl.Lock()
|
||||
defer cl.Unlock()
|
||||
c, err := ReadBackendConfig(file, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
|
||||
cl.configs[c.Name] = *c
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
|
||||
cl.Lock()
|
||||
defer cl.Unlock()
|
||||
v, exists := cl.configs[m]
|
||||
return v, exists
|
||||
}
|
||||
|
||||
func (cl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
|
||||
cl.Lock()
|
||||
defer cl.Unlock()
|
||||
var res []BackendConfig
|
||||
for _, v := range cl.configs {
|
||||
res = append(res, v)
|
||||
}
|
||||
|
||||
sort.SliceStable(res, func(i, j int) bool {
|
||||
return res[i].Name < res[j].Name
|
||||
})
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (cl *BackendConfigLoader) ListBackendConfigs() []string {
|
||||
cl.Lock()
|
||||
defer cl.Unlock()
|
||||
var res []string
|
||||
for k := range cl.configs {
|
||||
res = append(res, k)
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
// Preload prepare models if they are not local but url or huggingface repositories
|
||||
func (cl *BackendConfigLoader) Preload(modelPath string) error {
|
||||
cl.Lock()
|
||||
defer cl.Unlock()
|
||||
|
||||
status := func(fileName, current, total string, percent float64) {
|
||||
utils.DisplayDownloadFunction(fileName, current, total, percent)
|
||||
}
|
||||
|
||||
log.Info().Msgf("Preloading models from %s", modelPath)
|
||||
|
||||
renderMode := "dark"
|
||||
if os.Getenv("COLOR") != "" {
|
||||
renderMode = os.Getenv("COLOR")
|
||||
}
|
||||
|
||||
glamText := func(t string) {
|
||||
out, err := glamour.Render(t, renderMode)
|
||||
if err == nil && os.Getenv("NO_COLOR") == "" {
|
||||
fmt.Println(out)
|
||||
} else {
|
||||
fmt.Println(t)
|
||||
}
|
||||
}
|
||||
|
||||
for i, config := range cl.configs {
|
||||
|
||||
// Download files and verify their SHA
|
||||
for i, file := range config.DownloadFiles {
|
||||
log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)
|
||||
|
||||
if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
|
||||
return err
|
||||
}
|
||||
// Create file path
|
||||
filePath := filepath.Join(modelPath, file.Filename)
|
||||
|
||||
if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, i, len(config.DownloadFiles), status); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
modelURL := config.PredictionOptions.Model
|
||||
modelURL = downloader.ConvertURL(modelURL)
|
||||
|
||||
if downloader.LooksLikeURL(modelURL) {
|
||||
// md5 of model name
|
||||
md5Name := utils.MD5(modelURL)
|
||||
|
||||
// check if file exists
|
||||
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
|
||||
err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", 0, 0, status)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
cc := cl.configs[i]
|
||||
c := &cc
|
||||
c.PredictionOptions.Model = md5Name
|
||||
cl.configs[i] = *c
|
||||
}
|
||||
if cl.configs[i].Name != "" {
|
||||
glamText(fmt.Sprintf("**Model name**: _%s_", cl.configs[i].Name))
|
||||
}
|
||||
if cl.configs[i].Description != "" {
|
||||
//glamText("**Description**")
|
||||
glamText(cl.configs[i].Description)
|
||||
}
|
||||
if cl.configs[i].Usage != "" {
|
||||
//glamText("**Usage**")
|
||||
glamText(cl.configs[i].Usage)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadBackendConfigsFromPath reads all the configurations of the models from a path
|
||||
// (non-recursive)
|
||||
func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
entries, err := os.ReadDir(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files := make([]fs.FileInfo, 0, len(entries))
|
||||
for _, entry := range entries {
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files = append(files, info)
|
||||
}
|
||||
for _, file := range files {
|
||||
// Skip templates, YAML and .keep files
|
||||
if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
|
||||
continue
|
||||
}
|
||||
c, err := ReadBackendConfig(filepath.Join(path, file.Name()), opts...)
|
||||
if err == nil {
|
||||
cm.configs[c.Name] = *c
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
core/config/backend_config_loader.go (new file, 509 lines)
@@ -0,0 +1,509 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/charmbracelet/glamour"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/downloader"
|
||||
"github.com/go-skynet/LocalAI/pkg/grammar"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
"github.com/rs/zerolog/log"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
type BackendConfigLoader struct {
|
||||
configs map[string]BackendConfig
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
type ConfigLoaderOptions struct {
|
||||
debug bool
|
||||
threads, ctxSize int
|
||||
f16 bool
|
||||
}
|
||||
|
||||
func LoadOptionDebug(debug bool) ConfigLoaderOption {
|
||||
return func(o *ConfigLoaderOptions) {
|
||||
o.debug = debug
|
||||
}
|
||||
}
|
||||
|
||||
func LoadOptionThreads(threads int) ConfigLoaderOption {
|
||||
return func(o *ConfigLoaderOptions) {
|
||||
o.threads = threads
|
||||
}
|
||||
}
|
||||
|
||||
func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
|
||||
return func(o *ConfigLoaderOptions) {
|
||||
o.ctxSize = ctxSize
|
||||
}
|
||||
}
|
||||
|
||||
func LoadOptionF16(f16 bool) ConfigLoaderOption {
|
||||
return func(o *ConfigLoaderOptions) {
|
||||
o.f16 = f16
|
||||
}
|
||||
}
|
||||
|
||||
type ConfigLoaderOption func(*ConfigLoaderOptions)
|
||||
|
||||
func (lo *ConfigLoaderOptions) Apply(options ...ConfigLoaderOption) {
|
||||
for _, l := range options {
|
||||
l(lo)
|
||||
}
|
||||
}
|
||||
|
||||
func NewBackendConfigLoader() *BackendConfigLoader {
|
||||
return &BackendConfigLoader{
|
||||
configs: make(map[string]BackendConfig),
|
||||
}
|
||||
}
|
||||
|
||||
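BackendConfigLoader embeds a sync.Mutex around its config map, so every accessor locks before touching shared state and listings are returned in sorted order. The same shape in a tiny self-contained form (illustrative only, not the loader itself):

package main

import (
	"fmt"
	"sort"
	"sync"
)

// registry guards a name->value map with an embedded mutex, like the
// config loader does for backend configurations.
type registry struct {
	sync.Mutex
	items map[string]string
}

func newRegistry() *registry {
	return &registry{items: map[string]string{}}
}

func (r *registry) Set(name, value string) {
	r.Lock()
	defer r.Unlock()
	r.items[name] = value
}

// List returns the keys in stable (sorted) order, mirroring how the
// loader sorts configs by name before returning them.
func (r *registry) List() []string {
	r.Lock()
	defer r.Unlock()
	keys := make([]string, 0, len(r.items))
	for k := range r.items {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	return keys
}

func main() {
	r := newRegistry()
	r.Set("phi-2", "phi-2.yaml")
	r.Set("llama", "llama.yaml")
	fmt.Println(r.List())
}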
func (bcl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
c, err := readBackendConfig(file, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
|
||||
bcl.configs[c.Name] = *c
|
||||
return nil
|
||||
}
|
||||
|
||||
func (bcl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
v, exists := bcl.configs[m]
|
||||
return v, exists
|
||||
}
|
||||
|
||||
func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
var res []BackendConfig
|
||||
for _, v := range bcl.configs {
|
||||
res = append(res, v)
|
||||
}
|
||||
sort.SliceStable(res, func(i, j int) bool {
|
||||
return res[i].Name < res[j].Name
|
||||
})
|
||||
return res
|
||||
}
|
||||
|
||||
func (bcl *BackendConfigLoader) ListBackendConfigs() []string {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
var res []string
|
||||
for k := range bcl.configs {
|
||||
res = append(res, k)
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
// Preload prepare models if they are not local but url or huggingface repositories
|
||||
func (bcl *BackendConfigLoader) Preload(modelPath string) error {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
|
||||
status := func(fileName, current, total string, percent float64) {
|
||||
utils.DisplayDownloadFunction(fileName, current, total, percent)
|
||||
}
|
||||
|
||||
log.Info().Msgf("Preloading models from %s", modelPath)
|
||||
|
||||
renderMode := "dark"
|
||||
if os.Getenv("COLOR") != "" {
|
||||
renderMode = os.Getenv("COLOR")
|
||||
}
|
||||
|
||||
glamText := func(t string) {
|
||||
out, err := glamour.Render(t, renderMode)
|
||||
if err == nil && os.Getenv("NO_COLOR") == "" {
|
||||
fmt.Println(out)
|
||||
} else {
|
||||
fmt.Println(t)
|
||||
}
|
||||
}
|
||||
|
||||
for i, config := range bcl.configs {
|
||||
|
||||
// Download files and verify their SHA
|
||||
for _, file := range config.DownloadFiles {
|
||||
log.Debug().Msgf("Checking %q exists and matches SHA", file.Filename)
|
||||
|
||||
if err := utils.VerifyPath(file.Filename, modelPath); err != nil {
|
||||
return err
|
||||
}
|
||||
// Create file path
|
||||
filePath := filepath.Join(modelPath, file.Filename)
|
||||
|
||||
if err := downloader.DownloadFile(file.URI, filePath, file.SHA256, status); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
modelURL := config.PredictionOptions.Model
|
||||
modelURL = downloader.ConvertURL(modelURL)
|
||||
|
||||
if downloader.LooksLikeURL(modelURL) {
|
||||
// md5 of model name
|
||||
md5Name := utils.MD5(modelURL)
|
||||
|
||||
// check if file exists
|
||||
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
|
||||
err := downloader.DownloadFile(modelURL, filepath.Join(modelPath, md5Name), "", status)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
cc := bcl.configs[i]
|
||||
c := &cc
|
||||
c.PredictionOptions.Model = md5Name
|
||||
bcl.configs[i] = *c
|
||||
}
|
||||
if bcl.configs[i].Name != "" {
|
||||
glamText(fmt.Sprintf("**Model name**: _%s_", bcl.configs[i].Name))
|
||||
}
|
||||
if bcl.configs[i].Description != "" {
|
||||
//glamText("**Description**")
|
||||
glamText(bcl.configs[i].Description)
|
||||
}
|
||||
if bcl.configs[i].Usage != "" {
|
||||
//glamText("**Usage**")
|
||||
glamText(bcl.configs[i].Usage)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
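Preload downloads a remote model URL into the models directory under an md5-derived filename and then rewrites the config to point at the local copy. A hedged sketch of that naming and caching step (utils.MD5 and the downloader package are assumed; here they are replaced by standard-library equivalents, and the write is only a placeholder for a real download):

package main

import (
	"crypto/md5"
	"encoding/hex"
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

// md5Name derives a stable local filename from a model URL, so repeated
// preloads of the same URL reuse the same cached file.
func md5Name(url string) string {
	sum := md5.Sum([]byte(url))
	return hex.EncodeToString(sum[:])
}

// ensureLocal returns the cached path, "downloading" only when missing.
func ensureLocal(modelPath, url string) (string, error) {
	local := filepath.Join(modelPath, md5Name(url))
	if _, err := os.Stat(local); errors.Is(err, os.ErrNotExist) {
		// A real implementation would stream the URL to disk and verify
		// a checksum; writing a marker keeps the sketch self-contained.
		if err := os.WriteFile(local, []byte("downloaded from "+url), 0o644); err != nil {
			return "", err
		}
	}
	return local, nil
}

func main() {
	dir, _ := os.MkdirTemp("", "models")
	p, err := ensureLocal(dir, "https://example.com/model.gguf")
	fmt.Println(p, err)
}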
func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
entries, err := os.ReadDir(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files := make([]fs.FileInfo, 0, len(entries))
|
||||
for _, entry := range entries {
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files = append(files, info)
|
||||
}
|
||||
for _, file := range files {
|
||||
// Skip templates, YAML and .keep files
|
||||
if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
|
||||
continue
|
||||
}
|
||||
c, err := readBackendConfig(filepath.Join(path, file.Name()), opts...)
|
||||
if err == nil {
|
||||
bcl.configs[c.Name] = *c
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (bcl *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
c, err := readBackendConfigFile(file, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot load config file: %w", err)
|
||||
}
|
||||
|
||||
for _, cc := range c {
|
||||
bcl.configs[cc.Name] = *cc
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
//////////
|
||||
|
||||
// Load a config file for a model
|
||||
func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName string, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
|
||||
|
||||
// Load a config file if present after the model name
|
||||
cfg := &BackendConfig{
|
||||
PredictionOptions: schema.PredictionOptions{
|
||||
Model: modelName,
|
||||
},
|
||||
}
|
||||
|
||||
cfgExisting, exists := bcl.GetBackendConfig(modelName)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
} else {
|
||||
// Load a config file if present after the model name
|
||||
modelConfig := filepath.Join(modelPath, modelName+".yaml")
|
||||
if _, err := os.Stat(modelConfig); err == nil {
|
||||
if err := bcl.LoadBackendConfig(modelConfig); err != nil {
|
||||
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
|
||||
}
|
||||
cfgExisting, exists = bcl.GetBackendConfig(modelName)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg.SetDefaults(opts...)
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func readBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
|
||||
c := &[]*BackendConfig{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
if err := yaml.Unmarshal(f, c); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
|
||||
}
|
||||
|
||||
for _, cc := range *c {
|
||||
cc.SetDefaults(opts...)
|
||||
}
|
||||
|
||||
return *c, nil
|
||||
}
|
||||
|
||||
func readBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
|
||||
c := &BackendConfig{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
if err := yaml.Unmarshal(f, c); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
|
||||
}
|
||||
|
||||
c.SetDefaults(opts...)
|
||||
return c, nil
|
||||
}
|
||||
|
||||
func (bcl *BackendConfigLoader) LoadBackendConfigForModelAndOpenAIRequest(modelFile string, input *schema.OpenAIRequest, appConfig *ApplicationConfig) (*BackendConfig, *schema.OpenAIRequest, error) {
|
||||
cfg, err := bcl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
LoadOptionContextSize(appConfig.ContextSize),
|
||||
LoadOptionDebug(appConfig.Debug),
|
||||
LoadOptionF16(appConfig.F16),
|
||||
LoadOptionThreads(appConfig.Threads),
|
||||
)
|
||||
|
||||
// Set the parameters for the language model prediction
|
||||
updateBackendConfigFromOpenAIRequest(cfg, input)
|
||||
|
||||
return cfg, input, err
|
||||
}
|
||||
|
||||
func updateBackendConfigFromOpenAIRequest(bc *BackendConfig, request *schema.OpenAIRequest) {
|
||||
if request.Echo {
|
||||
bc.Echo = request.Echo
|
||||
}
|
||||
if request.TopK != nil && *request.TopK != 0 {
|
||||
bc.TopK = request.TopK
|
||||
}
|
||||
if request.TopP != nil && *request.TopP != 0 {
|
||||
bc.TopP = request.TopP
|
||||
}
|
||||
|
||||
if request.Backend != "" {
|
||||
bc.Backend = request.Backend
|
||||
}
|
||||
|
||||
if request.ClipSkip != 0 {
|
||||
bc.Diffusers.ClipSkip = request.ClipSkip
|
||||
}
|
||||
|
||||
if request.ModelBaseName != "" {
|
||||
bc.AutoGPTQ.ModelBaseName = request.ModelBaseName
|
||||
}
|
||||
|
||||
if request.NegativePromptScale != 0 {
|
||||
bc.NegativePromptScale = request.NegativePromptScale
|
||||
}
|
||||
|
||||
if request.UseFastTokenizer {
|
||||
bc.UseFastTokenizer = request.UseFastTokenizer
|
||||
}
|
||||
|
||||
if request.NegativePrompt != "" {
|
||||
bc.NegativePrompt = request.NegativePrompt
|
||||
}
|
||||
|
||||
if request.RopeFreqBase != 0 {
|
||||
bc.RopeFreqBase = request.RopeFreqBase
|
||||
}
|
||||
|
||||
if request.RopeFreqScale != 0 {
|
||||
bc.RopeFreqScale = request.RopeFreqScale
|
||||
}
|
||||
|
||||
if request.Grammar != "" {
|
||||
bc.Grammar = request.Grammar
|
||||
}
|
||||
|
||||
if request.Temperature != nil && *request.Temperature != 0 {
|
||||
bc.Temperature = request.Temperature
|
||||
}
|
||||
|
||||
if request.Maxtokens != nil && *request.Maxtokens != 0 {
|
||||
bc.Maxtokens = request.Maxtokens
|
||||
}
|
||||
|
||||
switch stop := request.Stop.(type) {
|
||||
case string:
|
||||
if stop != "" {
|
||||
bc.StopWords = append(bc.StopWords, stop)
|
||||
}
|
||||
case []interface{}:
|
||||
for _, pp := range stop {
|
||||
if s, ok := pp.(string); ok {
|
||||
bc.StopWords = append(bc.StopWords, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(request.Tools) > 0 {
|
||||
for _, tool := range request.Tools {
|
||||
request.Functions = append(request.Functions, tool.Function)
|
||||
}
|
||||
}
|
||||
|
||||
if request.ToolsChoice != nil {
|
||||
var toolChoice grammar.Tool
|
||||
switch content := request.ToolsChoice.(type) {
|
||||
case string:
|
||||
_ = json.Unmarshal([]byte(content), &toolChoice)
|
||||
case map[string]interface{}:
|
||||
dat, _ := json.Marshal(content)
|
||||
_ = json.Unmarshal(dat, &toolChoice)
|
||||
}
|
||||
request.FunctionCall = map[string]interface{}{
|
||||
"name": toolChoice.Function.Name,
|
||||
}
|
||||
}
|
||||
|
||||
// Decode each request's message content
|
||||
index := 0
|
||||
for i, m := range request.Messages {
|
||||
switch content := m.Content.(type) {
|
||||
case string:
|
||||
request.Messages[i].StringContent = content
|
||||
case []interface{}:
|
||||
dat, _ := json.Marshal(content)
|
||||
c := []schema.Content{}
|
||||
json.Unmarshal(dat, &c)
|
||||
for _, pp := range c {
|
||||
if pp.Type == "text" {
|
||||
request.Messages[i].StringContent = pp.Text
|
||||
} else if pp.Type == "image_url" {
|
||||
// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
|
||||
base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
|
||||
if err == nil {
|
||||
request.Messages[i].StringImages = append(request.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
|
||||
// set a placeholder for each image
|
||||
request.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + request.Messages[i].StringContent
|
||||
index++
|
||||
} else {
|
||||
fmt.Print("Failed encoding image", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if request.RepeatPenalty != 0 {
|
||||
bc.RepeatPenalty = request.RepeatPenalty
|
||||
}
|
||||
|
||||
if request.FrequencyPenalty != 0 {
|
||||
bc.FrequencyPenalty = request.FrequencyPenalty
|
||||
}
|
||||
|
||||
if request.PresencePenalty != 0 {
|
||||
bc.PresencePenalty = request.PresencePenalty
|
||||
}
|
||||
|
||||
if request.Keep != 0 {
|
||||
bc.Keep = request.Keep
|
||||
}
|
||||
|
||||
if request.Batch != 0 {
|
||||
bc.Batch = request.Batch
|
||||
}
|
||||
|
||||
if request.IgnoreEOS {
|
||||
bc.IgnoreEOS = request.IgnoreEOS
|
||||
}
|
||||
|
||||
if request.Seed != nil {
|
||||
bc.Seed = request.Seed
|
||||
}
|
||||
|
||||
if request.TypicalP != nil {
|
||||
bc.TypicalP = request.TypicalP
|
||||
}
|
||||
|
||||
switch inputs := request.Input.(type) {
|
||||
case string:
|
||||
if inputs != "" {
|
||||
bc.InputStrings = append(bc.InputStrings, inputs)
|
||||
}
|
||||
case []interface{}:
|
||||
for _, pp := range inputs {
|
||||
switch i := pp.(type) {
|
||||
case string:
|
||||
bc.InputStrings = append(bc.InputStrings, i)
|
||||
case []interface{}:
|
||||
tokens := []int{}
|
||||
for _, ii := range i {
|
||||
tokens = append(tokens, int(ii.(float64)))
|
||||
}
|
||||
bc.InputToken = append(bc.InputToken, tokens)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Can be either a string or an object
|
||||
switch fnc := request.FunctionCall.(type) {
|
||||
case string:
|
||||
if fnc != "" {
|
||||
bc.SetFunctionCallString(fnc)
|
||||
}
|
||||
case map[string]interface{}:
|
||||
var name string
|
||||
n, exists := fnc["name"]
|
||||
if exists {
|
||||
nn, e := n.(string)
|
||||
if e {
|
||||
name = nn
|
||||
}
|
||||
}
|
||||
bc.SetFunctionCallNameString(name)
|
||||
}
|
||||
|
||||
switch p := request.Prompt.(type) {
|
||||
case string:
|
||||
bc.PromptStrings = append(bc.PromptStrings, p)
|
||||
case []interface{}:
|
||||
for _, pp := range p {
|
||||
if s, ok := pp.(string); ok {
|
||||
bc.PromptStrings = append(bc.PromptStrings, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
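Several OpenAI request fields (stop, input, prompt, function_call) may arrive either as a string or as an array, which is why the updater above type-switches on interface{} values. A compact standalone illustration of decoding such a union field:

package main

import (
	"encoding/json"
	"fmt"
)

// stopWords accepts either `"stop": "###"` or `"stop": ["###", "\n"]`,
// mirroring how the request updater normalizes OpenAI-style fields.
func stopWords(raw json.RawMessage) []string {
	var anyVal interface{}
	if err := json.Unmarshal(raw, &anyVal); err != nil {
		return nil
	}
	switch v := anyVal.(type) {
	case string:
		if v != "" {
			return []string{v}
		}
	case []interface{}:
		var out []string
		for _, item := range v {
			if s, ok := item.(string); ok {
				out = append(out, s)
			}
		}
		return out
	}
	return nil
}

func main() {
	fmt.Println(stopWords(json.RawMessage(`"###"`)))
	fmt.Println(stopWords(json.RawMessage(`["###", "\n"]`)))
}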
core/config/exports_test.go (new file, 6 lines)
@@ -0,0 +1,6 @@
package config

// This file re-exports private functions to be used directly in unit tests.
// Since this file's name ends in _test.go, theoretically these should not be exposed past the tests.

var ReadBackendConfigFile = readBackendConfigFile
core/http/api.go (new file, 278 lines)
@@ -0,0 +1,278 @@
|
||||
package http
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/gofiber/swagger" // swagger handler
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/elevenlabs"
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/core/services"
|
||||
"github.com/go-skynet/LocalAI/internal"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/gofiber/fiber/v2/middleware/cors"
|
||||
"github.com/gofiber/fiber/v2/middleware/logger"
|
||||
"github.com/gofiber/fiber/v2/middleware/recover"
|
||||
)
|
||||
|
||||
func readAuthHeader(c *fiber.Ctx) string {
|
||||
authHeader := c.Get("Authorization")
|
||||
|
||||
// elevenlabs
|
||||
xApiKey := c.Get("xi-api-key")
|
||||
if xApiKey != "" {
|
||||
authHeader = "Bearer " + xApiKey
|
||||
}
|
||||
|
||||
// anthropic
|
||||
xApiKey = c.Get("x-api-key")
|
||||
if xApiKey != "" {
|
||||
authHeader = "Bearer " + xApiKey
|
||||
}
|
||||
|
||||
return authHeader
|
||||
}
|
||||
|
||||
// @title LocalAI API
|
||||
// @version 2.0.0
|
||||
// @description The LocalAI Rest API.
|
||||
// @termsOfService
|
||||
// @contact.name LocalAI
|
||||
// @contact.url https://localai.io
|
||||
// @license.name MIT
|
||||
// @license.url https://raw.githubusercontent.com/mudler/LocalAI/master/LICENSE
|
||||
// @BasePath /
|
||||
// @securityDefinitions.apikey BearerAuth
|
||||
// @in header
|
||||
// @name Authorization
|
||||
func App(application *core.Application) (*fiber.App, error) {
|
||||
// Return errors as JSON responses
|
||||
app := fiber.New(fiber.Config{
|
||||
Views: renderEngine(),
|
||||
BodyLimit: application.ApplicationConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
|
||||
DisableStartupMessage: application.ApplicationConfig.DisableMessage,
|
||||
// Override default error handler
|
||||
ErrorHandler: func(ctx *fiber.Ctx, err error) error {
|
||||
// Status code defaults to 500
|
||||
code := fiber.StatusInternalServerError
|
||||
|
||||
// Retrieve the custom status code if it's a *fiber.Error
|
||||
var e *fiber.Error
|
||||
if errors.As(err, &e) {
|
||||
code = e.Code
|
||||
}
|
||||
|
||||
// Send custom error page
|
||||
return ctx.Status(code).JSON(
|
||||
schema.ErrorResponse{
|
||||
Error: &schema.APIError{Message: err.Error(), Code: code},
|
||||
},
|
||||
)
|
||||
},
|
||||
})
|
||||
|
||||
if application.ApplicationConfig.Debug {
|
||||
app.Use(logger.New(logger.Config{
|
||||
Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
|
||||
}))
|
||||
}
|
||||
|
||||
// Default middleware config
|
||||
|
||||
if !application.ApplicationConfig.Debug {
|
||||
app.Use(recover.New())
|
||||
}
|
||||
|
||||
metricsService, err := services.NewLocalAIMetricsService()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if metricsService != nil {
|
||||
app.Use(localai.LocalAIMetricsAPIMiddleware(metricsService))
|
||||
app.Hooks().OnShutdown(func() error {
|
||||
return metricsService.Shutdown()
|
||||
})
|
||||
}
|
||||
|
||||
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
|
||||
auth := func(c *fiber.Ctx) error {
|
||||
if len(application.ApplicationConfig.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
authHeader := readAuthHeader(c)
|
||||
if authHeader == "" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
|
||||
}
|
||||
|
||||
// If it's a bearer token
|
||||
authHeaderParts := strings.Split(authHeader, " ")
|
||||
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
|
||||
}
|
||||
|
||||
apiKey := authHeaderParts[1]
|
||||
for _, key := range application.ApplicationConfig.ApiKeys {
|
||||
if apiKey == key {
|
||||
return c.Next()
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
|
||||
}
|
||||
|
||||
if application.ApplicationConfig.CORS {
|
||||
var c func(ctx *fiber.Ctx) error
|
||||
if application.ApplicationConfig.CORSAllowOrigins == "" {
|
||||
c = cors.New()
|
||||
} else {
|
||||
c = cors.New(cors.Config{AllowOrigins: application.ApplicationConfig.CORSAllowOrigins})
|
||||
}
|
||||
|
||||
app.Use(c)
|
||||
}
|
||||
|
||||
fiberContextExtractor := fiberContext.NewFiberContextExtractor(application.ModelLoader, application.ApplicationConfig)
|
||||
|
||||
// LocalAI API endpoints
|
||||
galleryService := services.NewGalleryService(application.ApplicationConfig.ModelPath)
|
||||
galleryService.Start(application.ApplicationConfig.Context, application.BackendConfigLoader)
|
||||
|
||||
app.Get("/version", auth, func(c *fiber.Ctx) error {
|
||||
return c.JSON(struct {
|
||||
Version string `json:"version"`
|
||||
}{Version: internal.PrintableVersion()})
|
||||
})
|
||||
|
||||
app.Get("/swagger/*", swagger.HandlerDefault) // default
|
||||
|
||||
welcomeRoute(
|
||||
app,
|
||||
application.BackendConfigLoader,
|
||||
application.ModelLoader,
|
||||
application.ApplicationConfig,
|
||||
auth,
|
||||
)
|
||||
|
||||
modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(application.ApplicationConfig.Galleries, application.ApplicationConfig.ModelPath, galleryService)
|
||||
app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
|
||||
app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
|
||||
app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
|
||||
app.Post("/models/galleries", auth, modelGalleryEndpointService.AddModelGalleryEndpoint())
|
||||
app.Delete("/models/galleries", auth, modelGalleryEndpointService.RemoveModelGalleryEndpoint())
|
||||
app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
|
||||
app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())
|
||||
|
||||
// Stores
|
||||
storeLoader := model.NewModelLoader("") // TODO: Investigate if this should be migrated to application and reused. Should the path be configurable? Merging for now.
|
||||
app.Post("/stores/set", auth, localai.StoresSetEndpoint(storeLoader, application.ApplicationConfig))
|
||||
app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(storeLoader, application.ApplicationConfig))
|
||||
app.Post("/stores/get", auth, localai.StoresGetEndpoint(storeLoader, application.ApplicationConfig))
|
||||
app.Post("/stores/find", auth, localai.StoresFindEndpoint(storeLoader, application.ApplicationConfig))
|
||||
|
||||
// openAI compatible API endpoints
|
||||
|
||||
// chat
|
||||
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
app.Post("/chat/completions", auth, openai.ChatEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
|
||||
// edit
|
||||
app.Post("/v1/edits", auth, openai.EditEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
app.Post("/edits", auth, openai.EditEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
|
||||
// assistant
|
||||
// TODO: Refactor this to the new style eventually
|
||||
app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/assistants", auth, openai.ListAssistantsEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/assistants", auth, openai.CreateAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(application.BackendConfigLoader, application.ModelLoader, application.ApplicationConfig))
|
||||
|
||||
// files
|
||||
app.Post("/v1/files", auth, openai.UploadFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Post("/files", auth, openai.UploadFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/files", auth, openai.ListFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Get("/files", auth, openai.ListFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(application.BackendConfigLoader, application.ApplicationConfig))
|
||||
|
||||
// completion
|
||||
app.Post("/v1/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
app.Post("/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(fiberContextExtractor, application.OpenAIService))
|
||||
|
||||
// embeddings
|
||||
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))
|
||||
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))
|
||||
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(fiberContextExtractor, application.EmbeddingsBackendService))
|
||||
|
||||
// audio
|
||||
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(fiberContextExtractor, application.TranscriptionBackendService))
|
||||
app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))
|
||||
|
||||
// images
|
||||
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(fiberContextExtractor, application.ImageGenerationBackendService))
|
||||
|
||||
// Elevenlabs
|
||||
app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))
|
||||
|
||||
// LocalAI TTS?
|
||||
app.Post("/tts", auth, localai.TTSEndpoint(fiberContextExtractor, application.TextToSpeechBackendService))
|
||||
|
||||
if application.ApplicationConfig.ImageDir != "" {
|
||||
app.Static("/generated-images", application.ApplicationConfig.ImageDir)
|
||||
}
|
||||
|
||||
if application.ApplicationConfig.AudioDir != "" {
|
||||
app.Static("/generated-audio", application.ApplicationConfig.AudioDir)
|
||||
}
|
||||
|
||||
ok := func(c *fiber.Ctx) error {
|
||||
return c.SendStatus(200)
|
||||
}
|
||||
|
||||
// Kubernetes health checks
|
||||
app.Get("/healthz", ok)
|
||||
app.Get("/readyz", ok)
|
||||
|
||||
// Experimental Backend Statistics Module
|
||||
app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(application.BackendMonitorService))
|
||||
app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(application.BackendMonitorService))
|
||||
|
||||
// models
|
||||
app.Get("/v1/models", auth, openai.ListModelsEndpoint(application.ListModelsService))
|
||||
app.Get("/models", auth, openai.ListModelsEndpoint(application.ListModelsService))
|
||||
|
||||
app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())
|
||||
|
||||
// Define a custom 404 handler
|
||||
// Note: keep this at the bottom!
|
||||
app.Use(notFoundHandler)
|
||||
|
||||
return app, nil
|
||||
}
|
||||
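api.go above registers every route behind the same bearer-token middleware and exposes /healthz and /readyz without it for probes. A small hedged client sketch showing how a caller would authenticate; the address and API key are placeholders, only the paths and the "Authorization: Bearer <key>" format come from the listing:

package main

import (
	"fmt"
	"io"
	"net/http"
)

// get performs a GET with an optional Bearer token, the header format the
// auth middleware above expects.
func get(url, apiKey string) (string, error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	if apiKey != "" {
		req.Header.Set("Authorization", "Bearer "+apiKey)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	return string(body), err
}

func main() {
	// /readyz is registered without the auth middleware; /v1/models goes
	// through it when API keys are configured.
	if out, err := get("http://127.0.0.1:8080/readyz", ""); err == nil {
		fmt.Println("readyz:", out)
	}
	if out, err := get("http://127.0.0.1:8080/v1/models", "sk-local-example"); err == nil {
		fmt.Println("models:", out)
	}
}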
@@ -12,7 +12,9 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
. "github.com/go-skynet/LocalAI/core/http"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
@@ -205,12 +207,11 @@ var _ = Describe("API test", func() {
|
||||
var cancel context.CancelFunc
|
||||
var tmpdir string
|
||||
var modelDir string
|
||||
var bcl *config.BackendConfigLoader
|
||||
var ml *model.ModelLoader
|
||||
var applicationConfig *config.ApplicationConfig
|
||||
var application *core.Application
|
||||
|
||||
commonOpts := []config.AppOption{
|
||||
config.WithDebug(true),
|
||||
config.WithDisableMessage(true),
|
||||
}
|
||||
|
||||
Context("API with ephemeral models", func() {
|
||||
@@ -222,7 +223,7 @@ var _ = Describe("API test", func() {
|
||||
|
||||
modelDir = filepath.Join(tmpdir, "models")
|
||||
backendAssetsDir := filepath.Join(tmpdir, "backend-assets")
|
||||
err = os.Mkdir(backendAssetsDir, 0750)
|
||||
err = os.Mkdir(backendAssetsDir, 0755)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
c, cancel = context.WithCancel(context.Background())
|
||||
@@ -241,7 +242,7 @@ var _ = Describe("API test", func() {
|
||||
}
|
||||
out, err := yaml.Marshal(g)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0600)
|
||||
err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0644)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
galleries := []gallery.Gallery{
|
||||
@@ -251,7 +252,7 @@ var _ = Describe("API test", func() {
|
||||
},
|
||||
}
|
||||
|
||||
bcl, ml, applicationConfig, err = startup.Startup(
|
||||
application, err = startup.Startup(
|
||||
append(commonOpts,
|
||||
config.WithContext(c),
|
||||
config.WithGalleries(galleries),
|
||||
@@ -260,7 +261,7 @@ var _ = Describe("API test", func() {
|
||||
config.WithBackendAssetsOutput(backendAssetsDir))...)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
app, err = App(bcl, ml, applicationConfig)
|
||||
app, err = App(application)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
@@ -473,11 +474,11 @@ var _ = Describe("API test", func() {
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp2.Choices)).To(Equal(1))
|
||||
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
|
||||
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
|
||||
Expect(resp2.Choices[0].Message.ToolCalls[0].Function).ToNot(BeNil())
|
||||
Expect(resp2.Choices[0].Message.ToolCalls[0].Function.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.ToolCalls[0].Function.Name)
|
||||
|
||||
var res map[string]string
|
||||
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
|
||||
err = json.Unmarshal([]byte(resp2.Choices[0].Message.ToolCalls[0].Function.Arguments), &res)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
|
||||
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
|
||||
@@ -486,13 +487,14 @@ var _ = Describe("API test", func() {
|
||||
})
|
||||
|
||||
It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
|
||||
modelName := "hermes-2-pro-mistral"
|
||||
// if runtime.GOOS != "linux" {
|
||||
// Skip("test supported only on linux")
|
||||
// }
|
||||
modelName := "codellama"
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
|
||||
URL: "github:go-skynet/model-gallery/codellama-7b-instruct.yaml",
|
||||
Name: modelName,
|
||||
Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
@@ -502,7 +504,7 @@ var _ = Describe("API test", func() {
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
return response["processed"].(bool)
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
}, "480s", "10s").Should(Equal(true))
|
||||
|
||||
By("testing chat")
|
||||
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: modelName, Messages: []openai.ChatCompletionMessage{
|
||||
@@ -549,13 +551,15 @@ var _ = Describe("API test", func() {
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp2.Choices)).To(Equal(1))
|
||||
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
|
||||
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
|
||||
fmt.Printf("\n--- %+v\n\n", resp2.Choices[0].Message)
|
||||
Expect(resp2.Choices[0].Message.ToolCalls).ToNot(BeNil())
|
||||
Expect(resp2.Choices[0].Message.ToolCalls[0]).ToNot(BeNil())
|
||||
Expect(resp2.Choices[0].Message.ToolCalls[0].Function.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.ToolCalls[0].Function.Name)
|
||||
|
||||
var res map[string]string
|
||||
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
|
||||
err = json.Unmarshal([]byte(resp2.Choices[0].Message.ToolCalls[0].Function.Arguments), &res)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
|
||||
Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
|
||||
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
|
||||
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
|
||||
})
|
||||
@@ -595,7 +599,7 @@ var _ = Describe("API test", func() {
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
modelDir = filepath.Join(tmpdir, "models")
|
||||
backendAssetsDir := filepath.Join(tmpdir, "backend-assets")
|
||||
err = os.Mkdir(backendAssetsDir, 0750)
|
||||
err = os.Mkdir(backendAssetsDir, 0755)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
c, cancel = context.WithCancel(context.Background())
|
||||
@@ -607,7 +611,7 @@ var _ = Describe("API test", func() {
|
||||
},
|
||||
}
|
||||
|
||||
bcl, ml, applicationConfig, err = startup.Startup(
|
||||
application, err = startup.Startup(
|
||||
append(commonOpts,
|
||||
config.WithContext(c),
|
||||
config.WithAudioDir(tmpdir),
|
||||
@@ -618,7 +622,7 @@ var _ = Describe("API test", func() {
|
||||
config.WithBackendAssetsOutput(tmpdir))...,
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
app, err = App(bcl, ml, applicationConfig)
|
||||
app, err = App(application)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
@@ -722,14 +726,14 @@ var _ = Describe("API test", func() {
|
||||
|
||||
var err error
|
||||
|
||||
bcl, ml, applicationConfig, err = startup.Startup(
|
||||
application, err = startup.Startup(
|
||||
append(commonOpts,
|
||||
config.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
|
||||
config.WithContext(c),
|
||||
config.WithModelPath(modelPath),
|
||||
)...)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
app, err = App(bcl, ml, applicationConfig)
|
||||
app, err = App(application)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
|
||||
@@ -759,6 +763,11 @@ var _ = Describe("API test", func() {
Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
})
It("can generate completions via ggml", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}

resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel.ggml", Prompt: testPrompt})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
@@ -766,6 +775,11 @@ var _ = Describe("API test", func() {
})

It("can generate chat completions via ggml", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}

resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel.ggml", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
@@ -773,6 +787,11 @@ var _ = Describe("API test", func() {
})

It("can generate completions from model configs", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}

resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "gpt4all", Prompt: testPrompt})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
@@ -780,6 +799,11 @@ var _ = Describe("API test", func() {
})

It("can generate chat completions from model configs", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}

resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
@@ -866,9 +890,9 @@ var _ = Describe("API test", func() {

Context("backends", func() {
It("runs rwkv completion", func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
// if runtime.GOOS != "linux" {
// Skip("test supported only on linux")
// }
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices) > 0).To(BeTrue())
@@ -889,17 +913,20 @@ var _ = Describe("API test", func() {
}

Expect(err).ToNot(HaveOccurred())
text += response.Choices[0].Text
tokens++

if len(response.Choices) > 0 {
text += response.Choices[0].Text
tokens++
}
}
Expect(text).ToNot(BeEmpty())
Expect(text).To(ContainSubstring("five"))
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
})
It("runs rwkv chat completion", func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
// if runtime.GOOS != "linux" {
// Skip("test supported only on linux")
// }
resp, err := client.CreateChatCompletion(context.TODO(),
openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
Expect(err).ToNot(HaveOccurred())
@@ -1008,14 +1035,14 @@ var _ = Describe("API test", func() {
c, cancel = context.WithCancel(context.Background())

var err error
bcl, ml, applicationConfig, err = startup.Startup(
application, err = startup.Startup(
append(commonOpts,
config.WithContext(c),
config.WithModelPath(modelPath),
config.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
)
Expect(err).ToNot(HaveOccurred())
app, err = App(bcl, ml, applicationConfig)
app, err = App(application)
Expect(err).ToNot(HaveOccurred())

go app.Listen("127.0.0.1:9090")
@@ -1039,18 +1066,33 @@ var _ = Describe("API test", func() {
}
})
It("can generate chat completions from config file (list1)", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}

resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
})
It("can generate chat completions from config file (list2)", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}

resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
})
It("can generate edit completions from config file", func() {
bt, ok := os.LookupEnv("BUILD_TYPE")
if ok && strings.ToLower(bt) == "metal" {
Skip("GGML + Metal is known flaky, skip test temporarily")
}

request := openaigo.EditCreateRequestBody{
Model: "list2",
Instruction: "foo",
204
core/http/app.go
@@ -1,204 +0,0 @@
|
||||
package http
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
|
||||
"github.com/go-skynet/LocalAI/core/http/routes"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/core/services"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
|
||||
"github.com/gofiber/contrib/fiberzerolog"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/gofiber/fiber/v2/middleware/cors"
|
||||
"github.com/gofiber/fiber/v2/middleware/recover"
|
||||
|
||||
// swagger handler
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func readAuthHeader(c *fiber.Ctx) string {
|
||||
authHeader := c.Get("Authorization")
|
||||
|
||||
// elevenlabs
|
||||
xApiKey := c.Get("xi-api-key")
|
||||
if xApiKey != "" {
|
||||
authHeader = "Bearer " + xApiKey
|
||||
}
|
||||
|
||||
// anthropic
|
||||
xApiKey = c.Get("x-api-key")
|
||||
if xApiKey != "" {
|
||||
authHeader = "Bearer " + xApiKey
|
||||
}
|
||||
|
||||
return authHeader
|
||||
}
|
||||
|
||||
// @title LocalAI API
|
||||
// @version 2.0.0
|
||||
// @description The LocalAI Rest API.
|
||||
// @termsOfService
|
||||
// @contact.name LocalAI
|
||||
// @contact.url https://localai.io
|
||||
// @license.name MIT
|
||||
// @license.url https://raw.githubusercontent.com/mudler/LocalAI/master/LICENSE
|
||||
// @BasePath /
|
||||
// @securityDefinitions.apikey BearerAuth
|
||||
// @in header
|
||||
// @name Authorization
|
||||
|
||||
func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*fiber.App, error) {
|
||||
// Return errors as JSON responses
|
||||
app := fiber.New(fiber.Config{
|
||||
Views: renderEngine(),
|
||||
BodyLimit: appConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
|
||||
// We disable the Fiber startup message as it does not conform to structured logging.
|
||||
// We register a startup log line with connection information in the OnListen hook to keep things user friendly though
|
||||
DisableStartupMessage: true,
|
||||
// Override default error handler
|
||||
ErrorHandler: func(ctx *fiber.Ctx, err error) error {
|
||||
// Status code defaults to 500
|
||||
code := fiber.StatusInternalServerError
|
||||
|
||||
// Retrieve the custom status code if it's a *fiber.Error
|
||||
var e *fiber.Error
|
||||
if errors.As(err, &e) {
|
||||
code = e.Code
|
||||
}
|
||||
|
||||
// Send custom error page
|
||||
return ctx.Status(code).JSON(
|
||||
schema.ErrorResponse{
|
||||
Error: &schema.APIError{Message: err.Error(), Code: code},
|
||||
},
|
||||
)
|
||||
},
|
||||
})
|
||||
|
||||
app.Hooks().OnListen(func(listenData fiber.ListenData) error {
|
||||
scheme := "http"
|
||||
if listenData.TLS {
|
||||
scheme = "https"
|
||||
}
|
||||
log.Info().Str("endpoint", scheme+"://"+listenData.Host+":"+listenData.Port).Msg("LocalAI API is listening! Please connect to the endpoint for API documentation.")
|
||||
return nil
|
||||
})
|
||||
|
||||
// Have Fiber use zerolog like the rest of the application rather than it's built-in logger
|
||||
logger := log.Logger
|
||||
app.Use(fiberzerolog.New(fiberzerolog.Config{
|
||||
Logger: &logger,
|
||||
}))
|
||||
|
||||
// Default middleware config
|
||||
|
||||
if !appConfig.Debug {
|
||||
app.Use(recover.New())
|
||||
}
|
||||
|
||||
metricsService, err := services.NewLocalAIMetricsService()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if metricsService != nil {
|
||||
app.Use(localai.LocalAIMetricsAPIMiddleware(metricsService))
|
||||
app.Hooks().OnShutdown(func() error {
|
||||
return metricsService.Shutdown()
|
||||
})
|
||||
}
|
||||
|
||||
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
|
||||
auth := func(c *fiber.Ctx) error {
|
||||
if len(appConfig.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
// Check for api_keys.json file
|
||||
fileContent, err := os.ReadFile("api_keys.json")
|
||||
if err == nil {
|
||||
// Parse JSON content from the file
|
||||
var fileKeys []string
|
||||
err := json.Unmarshal(fileContent, &fileKeys)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusInternalServerError).JSON(fiber.Map{"message": "Error parsing api_keys.json"})
|
||||
}
|
||||
|
||||
// Add file keys to options.ApiKeys
|
||||
appConfig.ApiKeys = append(appConfig.ApiKeys, fileKeys...)
|
||||
}
|
||||
|
||||
if len(appConfig.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
authHeader := readAuthHeader(c)
|
||||
if authHeader == "" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
|
||||
}
|
||||
|
||||
// If it's a bearer token
|
||||
authHeaderParts := strings.Split(authHeader, " ")
|
||||
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
|
||||
}
|
||||
|
||||
apiKey := authHeaderParts[1]
|
||||
for _, key := range appConfig.ApiKeys {
|
||||
if apiKey == key {
|
||||
return c.Next()
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
|
||||
}
|
||||
|
||||
if appConfig.CORS {
|
||||
var c func(ctx *fiber.Ctx) error
|
||||
if appConfig.CORSAllowOrigins == "" {
|
||||
c = cors.New()
|
||||
} else {
|
||||
c = cors.New(cors.Config{AllowOrigins: appConfig.CORSAllowOrigins})
|
||||
}
|
||||
|
||||
app.Use(c)
|
||||
}
|
||||
|
||||
// Make sure directories exists
|
||||
os.MkdirAll(appConfig.ImageDir, 0750)
|
||||
os.MkdirAll(appConfig.AudioDir, 0750)
|
||||
os.MkdirAll(appConfig.UploadDir, 0750)
|
||||
os.MkdirAll(appConfig.ConfigsDir, 0750)
|
||||
os.MkdirAll(appConfig.ModelPath, 0750)
|
||||
|
||||
// Load config jsons
|
||||
utils.LoadConfig(appConfig.UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles)
|
||||
utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants)
|
||||
utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles)
|
||||
|
||||
galleryService := services.NewGalleryService(appConfig.ModelPath)
|
||||
galleryService.Start(appConfig.Context, cl)
|
||||
|
||||
routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig, auth)
|
||||
routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService, auth)
|
||||
routes.RegisterOpenAIRoutes(app, cl, ml, appConfig, auth)
|
||||
routes.RegisterPagesRoutes(app, cl, ml, appConfig, auth)
|
||||
routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService, auth)
|
||||
routes.RegisterJINARoutes(app, cl, ml, appConfig, auth)
|
||||
|
||||
// Define a custom 404 handler
|
||||
// Note: keep this at the bottom!
|
||||
app.Use(notFoundHandler)
|
||||
|
||||
return app, nil
|
||||
}
|
||||
@@ -1,43 +1,88 @@
package fiberContext

import (
"context"
"encoding/json"
"fmt"
"strings"

"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
)

type FiberContextExtractor struct {
ml *model.ModelLoader
appConfig *config.ApplicationConfig
}

func NewFiberContextExtractor(ml *model.ModelLoader, appConfig *config.ApplicationConfig) *FiberContextExtractor {
return &FiberContextExtractor{
ml: ml,
appConfig: appConfig,
}
}

// ModelFromContext returns the model from the context
// If no model is specified, it will take the first available
// Takes a model string as input which should be the one received from the user request.
// It returns the model name resolved from the context and an error if any.
func ModelFromContext(ctx *fiber.Ctx, loader *model.ModelLoader, modelInput string, firstModel bool) (string, error) {
if ctx.Params("model") != "" {
modelInput = ctx.Params("model")
func (fce *FiberContextExtractor) ModelFromContext(ctx *fiber.Ctx, modelInput string, firstModel bool) (string, error) {
ctxPM := ctx.Params("model")
if ctxPM != "" {
log.Debug().Msgf("[FCE] Overriding param modelInput %q with ctx.Params value %q", modelInput, ctxPM)
modelInput = ctxPM
}

// Set model from bearer token, if available
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
bearer := strings.TrimPrefix(ctx.Get("authorization"), "Bearer ")
bearerExists := bearer != "" && fce.ml.ExistsInModelPath(bearer)

// If no model was specified, take the first available
if modelInput == "" && !bearerExists && firstModel {
models, _ := loader.ListModels()
models, _ := fce.ml.ListModels()
if len(models) > 0 {
modelInput = models[0]
log.Debug().Msgf("No model specified, using: %s", modelInput)
log.Debug().Msgf("[FCE] No model specified, using first available: %s", modelInput)
} else {
log.Debug().Msgf("No model specified, returning error")
return "", fmt.Errorf("no model specified")
log.Warn().Msgf("[FCE] No model specified, none available")
return "", fmt.Errorf("[fce] no model specified, none available")
}
}

// If a model is found in bearer token takes precedence
if bearerExists {
log.Debug().Msgf("Using model from bearer token: %s", bearer)
log.Debug().Msgf("[FCE] Using model from bearer token: %s", bearer)
modelInput = bearer
}

if modelInput == "" {
log.Warn().Msg("[FCE] modelInput is empty")
}
return modelInput, nil
}

// TODO: Do we still need the first return value?
func (fce *FiberContextExtractor) OpenAIRequestFromContext(c *fiber.Ctx, firstModel bool) (string, *schema.OpenAIRequest, error) {
input := new(schema.OpenAIRequest)

// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return "", nil, fmt.Errorf("failed parsing request body: %w", err)
}

received, _ := json.Marshal(input)

ctx, cancel := context.WithCancel(fce.appConfig.Context)
input.Context = ctx
input.Cancel = cancel

log.Debug().Msgf("Request received: %s", string(received))

var err error
input.Model, err = fce.ModelFromContext(c, input.Model, firstModel)

return input.Model, input, err
}

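A brief sketch of how a handler might consume the FiberContextExtractor introduced above; the route path and handler shape are illustrative assumptions, only the constructor and method signatures come from this diff. The extractor is built once from the model loader and application config, then each request goes through OpenAIRequestFromContext to resolve the model and attach a cancellable context.

	fce := fiberContext.NewFiberContextExtractor(ml, appConfig)
	app.Post("/v1/example", func(c *fiber.Ctx) error {
		// firstModel=true falls back to the first model on disk when none is named in the request.
		modelName, req, err := fce.OpenAIRequestFromContext(c, true)
		if err != nil {
			return err
		}
		defer req.Cancel() // release the per-request context created by the extractor
		return c.JSON(fiber.Map{"model": modelName})
	})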
@@ -1,247 +0,0 @@
|
||||
package elements
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/chasefleming/elem-go"
|
||||
"github.com/chasefleming/elem-go/attrs"
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
)
|
||||
|
||||
const (
|
||||
NoImage = "https://upload.wikimedia.org/wikipedia/commons/6/65/No-Image-Placeholder.svg"
|
||||
)
|
||||
|
||||
func DoneProgress(uid string) string {
|
||||
return elem.Div(
|
||||
attrs.Props{},
|
||||
elem.H3(
|
||||
attrs.Props{
|
||||
"role": "status",
|
||||
"id": "pblabel",
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text("Installation completed"),
|
||||
),
|
||||
).Render()
|
||||
}
|
||||
|
||||
func ErrorProgress(err string) string {
|
||||
return elem.Div(
|
||||
attrs.Props{},
|
||||
elem.H3(
|
||||
attrs.Props{
|
||||
"role": "status",
|
||||
"id": "pblabel",
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text("Error"+err),
|
||||
),
|
||||
).Render()
|
||||
}
|
||||
|
||||
func ProgressBar(progress string) string {
|
||||
return elem.Div(attrs.Props{
|
||||
"class": "progress",
|
||||
"role": "progressbar",
|
||||
"aria-valuemin": "0",
|
||||
"aria-valuemax": "100",
|
||||
"aria-valuenow": "0",
|
||||
"aria-labelledby": "pblabel",
|
||||
},
|
||||
elem.Div(attrs.Props{
|
||||
"id": "pb",
|
||||
"class": "progress-bar",
|
||||
"style": "width:" + progress + "%",
|
||||
}),
|
||||
).Render()
|
||||
}
|
||||
|
||||
func StartProgressBar(uid, progress string) string {
|
||||
if progress == "" {
|
||||
progress = "0"
|
||||
}
|
||||
return elem.Div(attrs.Props{
|
||||
"hx-trigger": "done",
|
||||
"hx-get": "/browse/job/" + uid,
|
||||
"hx-swap": "outerHTML",
|
||||
"hx-target": "this",
|
||||
},
|
||||
elem.H3(
|
||||
attrs.Props{
|
||||
"role": "status",
|
||||
"id": "pblabel",
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text("Installing"),
|
||||
// This is a simple example of how to use the HTMLX library to create a progress bar that updates every 600ms.
|
||||
elem.Div(attrs.Props{
|
||||
"hx-get": "/browse/job/progress/" + uid,
|
||||
"hx-trigger": "every 600ms",
|
||||
"hx-target": "this",
|
||||
"hx-swap": "innerHTML",
|
||||
},
|
||||
elem.Raw(ProgressBar(progress)),
|
||||
),
|
||||
),
|
||||
).Render()
|
||||
}
|
||||
|
||||
func cardSpan(text, icon string) elem.Node {
|
||||
return elem.Span(
|
||||
attrs.Props{
|
||||
"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2",
|
||||
},
|
||||
elem.I(attrs.Props{
|
||||
"class": icon + " pr-2",
|
||||
}),
|
||||
elem.Text(text),
|
||||
)
|
||||
}
|
||||
|
||||
func ListModels(models []*gallery.GalleryModel) string {
|
||||
modelsElements := []elem.Node{}
|
||||
span := func(s string) elem.Node {
|
||||
return elem.Span(
|
||||
attrs.Props{
|
||||
"class": "float-right inline-block bg-green-500 text-white py-1 px-3 rounded-full text-xs",
|
||||
},
|
||||
elem.Text(s),
|
||||
)
|
||||
}
|
||||
installButton := func(m *gallery.GalleryModel) elem.Node {
|
||||
return elem.Button(
|
||||
attrs.Props{
|
||||
"data-twe-ripple-init": "",
|
||||
"data-twe-ripple-color": "light",
|
||||
"class": "float-right inline-block rounded bg-primary px-6 pb-2.5 mb-3 pt-2.5 text-xs font-medium uppercase leading-normal text-white shadow-primary-3 transition duration-150 ease-in-out hover:bg-primary-accent-300 hover:shadow-primary-2 focus:bg-primary-accent-300 focus:shadow-primary-2 focus:outline-none focus:ring-0 active:bg-primary-600 active:shadow-primary-2 dark:shadow-black/30 dark:hover:shadow-dark-strong dark:focus:shadow-dark-strong dark:active:shadow-dark-strong",
|
||||
// post the Model ID as param
|
||||
"hx-post": "/browse/install/model/" + fmt.Sprintf("%s@%s", m.Gallery.Name, m.Name),
|
||||
},
|
||||
elem.I(
|
||||
attrs.Props{
|
||||
"class": "fa-solid fa-download pr-2",
|
||||
},
|
||||
),
|
||||
elem.Text("Install"),
|
||||
)
|
||||
}
|
||||
|
||||
descriptionDiv := func(m *gallery.GalleryModel) elem.Node {
|
||||
|
||||
return elem.Div(
|
||||
attrs.Props{
|
||||
"class": "p-6 text-surface dark:text-white",
|
||||
},
|
||||
elem.H5(
|
||||
attrs.Props{
|
||||
"class": "mb-2 text-xl font-medium leading-tight",
|
||||
},
|
||||
elem.Text(m.Name),
|
||||
),
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "mb-4 text-base",
|
||||
},
|
||||
elem.Text(m.Description),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
actionDiv := func(m *gallery.GalleryModel) elem.Node {
|
||||
nodes := []elem.Node{
|
||||
cardSpan("Repository: "+m.Gallery.Name, "fa-brands fa-git-alt"),
|
||||
}
|
||||
|
||||
if m.License != "" {
|
||||
nodes = append(nodes,
|
||||
cardSpan("License: "+m.License, "fas fa-book"),
|
||||
)
|
||||
}
|
||||
|
||||
for _, tag := range m.Tags {
|
||||
nodes = append(nodes,
|
||||
cardSpan(tag, "fas fa-tag"),
|
||||
)
|
||||
}
|
||||
|
||||
for i, url := range m.URLs {
|
||||
nodes = append(nodes,
|
||||
elem.A(
|
||||
attrs.Props{
|
||||
"class": "inline-block bg-gray-200 rounded-full px-3 py-1 text-sm font-semibold text-gray-700 mr-2 mb-2",
|
||||
"href": url,
|
||||
"target": "_blank",
|
||||
},
|
||||
elem.I(attrs.Props{
|
||||
"class": "fas fa-link pr-2",
|
||||
}),
|
||||
elem.Text("Link #"+fmt.Sprintf("%d", i+1)),
|
||||
))
|
||||
}
|
||||
|
||||
return elem.Div(
|
||||
attrs.Props{
|
||||
"class": "px-6 pt-4 pb-2",
|
||||
},
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "mb-4 text-base",
|
||||
},
|
||||
nodes...,
|
||||
),
|
||||
elem.If(m.Installed, span("Installed"), installButton(m)),
|
||||
)
|
||||
}
|
||||
|
||||
for _, m := range models {
|
||||
|
||||
elems := []elem.Node{}
|
||||
|
||||
if m.Icon == "" {
|
||||
m.Icon = NoImage
|
||||
}
|
||||
|
||||
elems = append(elems,
|
||||
|
||||
elem.Div(attrs.Props{
|
||||
"class": "flex justify-center items-center",
|
||||
},
|
||||
elem.A(attrs.Props{
|
||||
"href": "#!",
|
||||
// "class": "justify-center items-center",
|
||||
},
|
||||
elem.Img(attrs.Props{
|
||||
// "class": "rounded-t-lg object-fit object-center h-96",
|
||||
"class": "rounded-t-lg max-h-48 max-w-96 object-cover mt-3",
|
||||
"src": m.Icon,
|
||||
}),
|
||||
),
|
||||
))
|
||||
|
||||
elems = append(elems, descriptionDiv(m), actionDiv(m))
|
||||
modelsElements = append(modelsElements,
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
"class": " me-4 mb-2 block rounded-lg bg-white shadow-secondary-1 dark:bg-gray-800 dark:bg-surface-dark dark:text-white text-surface pb-2",
|
||||
},
|
||||
elem.Div(
|
||||
attrs.Props{
|
||||
// "class": "p-6",
|
||||
},
|
||||
elems...,
|
||||
),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
wrapper := elem.Div(attrs.Props{
|
||||
"class": "dark grid grid-cols-1 grid-rows-1 md:grid-cols-3 block rounded-lg shadow-secondary-1 dark:bg-surface-dark",
|
||||
//"class": "block rounded-lg bg-white shadow-secondary-1 dark:bg-surface-dark",
|
||||
}, modelsElements...)
|
||||
|
||||
return wrapper.Render()
|
||||
}
|
||||
@@ -2,9 +2,7 @@ package elevenlabs
|
||||
|
||||
import (
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
@@ -17,7 +15,7 @@ import (
|
||||
// @Param request body schema.TTSRequest true "query params"
|
||||
// @Success 200 {string} binary "Response"
|
||||
// @Router /v1/text-to-speech/{voice-id} [post]
|
||||
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
func TTSEndpoint(fce *fiberContext.FiberContextExtractor, ttsbs *backend.TextToSpeechBackendService) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
input := new(schema.ElevenLabsTTSRequest)
|
||||
@@ -28,34 +26,21 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
||||
return err
|
||||
}
|
||||
|
||||
modelFile, err := fiberContext.ModelFromContext(c, ml, input.ModelID, false)
|
||||
var err error
|
||||
input.ModelID, err = fce.ModelFromContext(c, input.ModelID, false)
|
||||
if err != nil {
|
||||
modelFile = input.ModelID
|
||||
log.Warn().Msgf("Model not found in context: %s", input.ModelID)
|
||||
}
|
||||
|
||||
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
config.LoadOptionDebug(appConfig.Debug),
|
||||
config.LoadOptionThreads(appConfig.Threads),
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
if err != nil {
|
||||
modelFile = input.ModelID
|
||||
log.Warn().Msgf("Model not found in context: %s", input.ModelID)
|
||||
} else {
|
||||
if input.ModelID != "" {
|
||||
modelFile = input.ModelID
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
responseChannel := ttsbs.TextToAudioFile(&schema.TTSRequest{
|
||||
Model: input.ModelID,
|
||||
Voice: voiceID,
|
||||
Input: input.Text,
|
||||
})
|
||||
rawValue := <-responseChannel
|
||||
if rawValue.Error != nil {
|
||||
return rawValue.Error
|
||||
}
|
||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||
|
||||
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.Download(filePath)
|
||||
return c.Download(*rawValue.Value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
package jina
|
||||
|
||||
import (
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
req := new(schema.JINARerankRequest)
|
||||
if err := c.BodyParser(req); err != nil {
|
||||
return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{
|
||||
"error": "Cannot parse JSON",
|
||||
})
|
||||
}
|
||||
|
||||
input := new(schema.TTSRequest)
|
||||
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelFile, err := fiberContext.ModelFromContext(c, ml, input.Model, false)
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
}
|
||||
|
||||
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
config.LoadOptionDebug(appConfig.Debug),
|
||||
config.LoadOptionThreads(appConfig.Threads),
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||
|
||||
if input.Backend != "" {
|
||||
cfg.Backend = input.Backend
|
||||
}
|
||||
|
||||
request := &proto.RerankRequest{
|
||||
Query: req.Query,
|
||||
TopN: int32(req.TopN),
|
||||
Documents: req.Documents,
|
||||
}
|
||||
|
||||
results, err := backend.Rerank(cfg.Backend, modelFile, request, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
response := &schema.JINARerankResponse{
|
||||
Model: req.Model,
|
||||
}
|
||||
|
||||
for _, r := range results.Results {
|
||||
response.Results = append(response.Results, schema.JINADocumentResult{
|
||||
Index: int(r.Index),
|
||||
Document: schema.JINAText{Text: r.Text},
|
||||
RelevanceScore: float64(r.RelevanceScore),
|
||||
})
|
||||
}
|
||||
|
||||
response.Usage.TotalTokens = int(results.Usage.TotalTokens)
|
||||
response.Usage.PromptTokens = int(results.Usage.PromptTokens)
|
||||
|
||||
return c.Status(fiber.StatusOK).JSON(response)
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,7 @@ import (
"github.com/gofiber/fiber/v2"
)

func BackendMonitorEndpoint(bm services.BackendMonitor) func(c *fiber.Ctx) error {
func BackendMonitorEndpoint(bm *services.BackendMonitorService) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {

input := new(schema.BackendMonitorRequest)
@@ -23,7 +23,7 @@ func BackendMonitorEndpoint(bm services.BackendMonitor) func(c *fiber.Ctx) error
}
}

func BackendShutdownEndpoint(bm services.BackendMonitor) func(c *fiber.Ctx) error {
func BackendShutdownEndpoint(bm *services.BackendMonitorService) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(schema.BackendMonitorRequest)
// Get input data from the request body
||||
|
||||
@@ -2,9 +2,7 @@ package localai
|
||||
|
||||
import (
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
@@ -16,45 +14,26 @@ import (
|
||||
// @Param request body schema.TTSRequest true "query params"
|
||||
// @Success 200 {string} binary "Response"
|
||||
// @Router /v1/audio/speech [post]
|
||||
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
func TTSEndpoint(fce *fiberContext.FiberContextExtractor, ttsbs *backend.TextToSpeechBackendService) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
var err error
|
||||
input := new(schema.TTSRequest)
|
||||
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
if err = c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelFile, err := fiberContext.ModelFromContext(c, ml, input.Model, false)
|
||||
input.Model, err = fce.ModelFromContext(c, input.Model, false)
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
}
|
||||
|
||||
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
config.LoadOptionDebug(appConfig.Debug),
|
||||
config.LoadOptionThreads(appConfig.Threads),
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
responseChannel := ttsbs.TextToAudioFile(input)
|
||||
rawValue := <-responseChannel
|
||||
if rawValue.Error != nil {
|
||||
return rawValue.Error
|
||||
}
|
||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||
|
||||
if input.Backend != "" {
|
||||
cfg.Backend = input.Backend
|
||||
}
|
||||
|
||||
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.Download(filePath)
|
||||
return c.Download(*rawValue.Value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
package localai
|
||||
|
||||
import (
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/internal"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
)
|
||||
|
||||
func WelcomeEndpoint(appConfig *config.ApplicationConfig,
|
||||
cl *config.BackendConfigLoader, ml *model.ModelLoader) func(*fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
models, _ := ml.ListModels()
|
||||
backendConfigs := cl.GetAllBackendConfigs()
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI API - " + internal.PrintableVersion(),
|
||||
"Version": internal.PrintableVersion(),
|
||||
"Models": models,
|
||||
"ModelsConfig": backendConfigs,
|
||||
"ApplicationConfig": appConfig,
|
||||
}
|
||||
|
||||
if string(c.Context().Request.Header.ContentType()) == "application/json" || len(c.Accepts("html")) == 0 {
|
||||
// The client expects a JSON response
|
||||
return c.Status(fiber.StatusOK).JSON(summary)
|
||||
} else {
|
||||
// Render index
|
||||
return c.Render("views/index", summary)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -339,7 +339,7 @@ func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
}
}

return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find "))
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistantID %q", assistantID))
}
}
||||
|
||||
@@ -455,19 +455,21 @@ func DeleteAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
|
||||
for i, assistant := range Assistants {
|
||||
if assistant.ID == assistantID {
|
||||
for j, fileId := range assistant.FileIDs {
|
||||
Assistants[i].FileIDs = append(Assistants[i].FileIDs[:j], Assistants[i].FileIDs[j+1:]...)
|
||||
if fileId == fileId {
|
||||
Assistants[i].FileIDs = append(Assistants[i].FileIDs[:j], Assistants[i].FileIDs[j+1:]...)
|
||||
|
||||
// Check if the file exists in the assistantFiles slice
|
||||
for i, assistantFile := range AssistantFiles {
|
||||
if assistantFile.ID == fileId {
|
||||
// Remove the file from the assistantFiles slice
|
||||
AssistantFiles = append(AssistantFiles[:i], AssistantFiles[i+1:]...)
|
||||
utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
|
||||
return c.Status(fiber.StatusOK).JSON(DeleteAssistantFileResponse{
|
||||
ID: fileId,
|
||||
Object: "assistant.file.deleted",
|
||||
Deleted: true,
|
||||
})
|
||||
// Check if the file exists in the assistantFiles slice
|
||||
for i, assistantFile := range AssistantFiles {
|
||||
if assistantFile.ID == fileId {
|
||||
// Remove the file from the assistantFiles slice
|
||||
AssistantFiles = append(AssistantFiles[:i], AssistantFiles[i+1:]...)
|
||||
utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
|
||||
return c.Status(fiber.StatusOK).JSON(DeleteAssistantFileResponse{
|
||||
ID: fileId,
|
||||
Object: "assistant.file.deleted",
|
||||
Deleted: true,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,10 @@ package openai
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
@@ -12,11 +16,6 @@ import (
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
var configsDir string = "/tmp/localai/configs"
|
||||
@@ -50,8 +49,8 @@ func TestAssistantEndpoints(t *testing.T) {
|
||||
}
|
||||
|
||||
_ = os.RemoveAll(appConfig.ConfigsDir)
|
||||
_ = os.MkdirAll(appConfig.ConfigsDir, 0750)
|
||||
_ = os.MkdirAll(modelPath, 0750)
|
||||
_ = os.MkdirAll(appConfig.ConfigsDir, 0755)
|
||||
_ = os.MkdirAll(modelPath, 0755)
|
||||
os.Create(filepath.Join(modelPath, "ggml-gpt4all-j"))
|
||||
|
||||
app := fiber.New(fiber.Config{
|
||||
|
||||
@@ -5,16 +5,11 @@ import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/functions"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/core/services"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/google/uuid"
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/valyala/fasthttp"
|
||||
)
|
||||
@@ -24,418 +19,82 @@ import (
|
||||
// @Param request body schema.OpenAIRequest true "query params"
|
||||
// @Success 200 {object} schema.OpenAIResponse "Response"
|
||||
// @Router /v1/chat/completions [post]
|
||||
func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
emptyMessage := ""
|
||||
id := uuid.New().String()
|
||||
created := int(time.Now().Unix())
|
||||
|
||||
process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||||
initialMessage := schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
responses <- initialMessage
|
||||
|
||||
ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||
resp := schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: usage.Prompt,
|
||||
CompletionTokens: usage.Completion,
|
||||
TotalTokens: usage.Prompt + usage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
responses <- resp
|
||||
return true
|
||||
})
|
||||
close(responses)
|
||||
}
|
||||
processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||||
result := ""
|
||||
_, tokenUsage, _ := ComputeChoices(req, prompt, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||
result += s
|
||||
// TODO: Change generated BNF grammar to be compliant with the schema so we can
|
||||
// stream the result token by token here.
|
||||
return true
|
||||
})
|
||||
|
||||
results := functions.ParseFunctionCall(result, config.FunctionsConfig)
|
||||
noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0
|
||||
|
||||
switch {
|
||||
case noActionToRun:
|
||||
initialMessage := schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
responses <- initialMessage
|
||||
|
||||
result, err := handleQuestion(config, req, ml, startupOptions, results, prompt)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("error handling question")
|
||||
return
|
||||
}
|
||||
|
||||
resp := schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Content: &result}, Index: 0}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: tokenUsage.Prompt,
|
||||
CompletionTokens: tokenUsage.Completion,
|
||||
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
responses <- resp
|
||||
|
||||
default:
|
||||
for i, ss := range results {
|
||||
name, args := ss.Name, ss.Arguments
|
||||
|
||||
initialMessage := schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{
|
||||
Delta: &schema.Message{
|
||||
Role: "assistant",
|
||||
ToolCalls: []schema.ToolCall{
|
||||
{
|
||||
Index: i,
|
||||
ID: id,
|
||||
Type: "function",
|
||||
FunctionCall: schema.FunctionCall{
|
||||
Name: name,
|
||||
},
|
||||
},
|
||||
},
|
||||
}}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
responses <- initialMessage
|
||||
|
||||
responses <- schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{
|
||||
Delta: &schema.Message{
|
||||
Role: "assistant",
|
||||
ToolCalls: []schema.ToolCall{
|
||||
{
|
||||
Index: i,
|
||||
ID: id,
|
||||
Type: "function",
|
||||
FunctionCall: schema.FunctionCall{
|
||||
Arguments: args,
|
||||
},
|
||||
},
|
||||
},
|
||||
}}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
close(responses)
|
||||
}
|
||||
|
||||
func ChatEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.OpenAIService) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
modelFile, input, err := readRequest(c, ml, startupOptions, true)
|
||||
_, request, err := fce.OpenAIRequestFromContext(c, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
return fmt.Errorf("failed reading parameters from request: %w", err)
|
||||
}
|
||||
|
||||
config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, startupOptions.Debug, startupOptions.Threads, startupOptions.ContextSize, startupOptions.F16)
|
||||
traceID, finalResultChannel, _, tokenChannel, err := oais.Chat(request, false, request.Stream)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
log.Debug().Msgf("Configuration read: %+v", config)
|
||||
|
||||
funcs := input.Functions
|
||||
shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
|
||||
|
||||
// Allow the user to set custom actions via config file
|
||||
// to be "embedded" in each model
|
||||
noActionName := "answer"
|
||||
noActionDescription := "use this action to answer without performing any action"
|
||||
|
||||
if config.FunctionsConfig.NoActionFunctionName != "" {
|
||||
noActionName = config.FunctionsConfig.NoActionFunctionName
|
||||
}
|
||||
if config.FunctionsConfig.NoActionDescriptionName != "" {
|
||||
noActionDescription = config.FunctionsConfig.NoActionDescriptionName
|
||||
return err
|
||||
}
|
||||
|
||||
if input.ResponseFormat.Type == "json_object" {
|
||||
input.Grammar = functions.JSONBNF
|
||||
}
|
||||
if request.Stream {
|
||||
|
||||
config.Grammar = input.Grammar
|
||||
log.Debug().Msgf("Chat Stream request received")
|
||||
|
||||
if shouldUseFn {
|
||||
log.Debug().Msgf("Response needs to process functions")
|
||||
}
|
||||
|
||||
switch {
|
||||
case !config.FunctionsConfig.NoGrammar && shouldUseFn:
|
||||
noActionGrammar := functions.Function{
|
||||
Name: noActionName,
|
||||
Description: noActionDescription,
|
||||
Parameters: map[string]interface{}{
|
||||
"properties": map[string]interface{}{
|
||||
"message": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "The message to reply the user with",
|
||||
}},
|
||||
},
|
||||
}
|
||||
|
||||
// Append the no action function
|
||||
if !config.FunctionsConfig.DisableNoAction {
|
||||
funcs = append(funcs, noActionGrammar)
|
||||
}
|
||||
|
||||
// Force picking one of the functions by the request
|
||||
if config.FunctionToCall() != "" {
|
||||
funcs = funcs.Select(config.FunctionToCall())
|
||||
}
|
||||
|
||||
// Update input grammar
|
||||
jsStruct := funcs.ToJSONStructure()
|
||||
config.Grammar = jsStruct.Grammar("", config.FunctionsConfig.ParallelCalls)
|
||||
case input.JSONFunctionGrammarObject != nil:
|
||||
config.Grammar = input.JSONFunctionGrammarObject.Grammar("", config.FunctionsConfig.ParallelCalls)
|
||||
default:
|
||||
// Force picking one of the functions by the request
|
||||
if config.FunctionToCall() != "" {
|
||||
funcs = funcs.Select(config.FunctionToCall())
|
||||
}
|
||||
}
|
||||
|
||||
// process functions if we have any defined or if we have a function call string
|
||||
|
||||
// functions are not supported in stream mode (yet?)
|
||||
toStream := input.Stream
|
||||
|
||||
log.Debug().Msgf("Parameters: %+v", config)
|
||||
|
||||
var predInput string
|
||||
|
||||
// If we are using the tokenizer template, we don't need to process the messages
|
||||
// unless we are processing functions
|
||||
if !config.TemplateConfig.UseTokenizerTemplate || shouldUseFn {
|
||||
suppressConfigSystemPrompt := false
|
||||
mess := []string{}
|
||||
for messageIndex, i := range input.Messages {
|
||||
var content string
|
||||
role := i.Role
|
||||
|
||||
// if function call, we might want to customize the role so we can display better that the "assistant called a json action"
|
||||
// if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request
|
||||
if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" {
|
||||
roleFn := "assistant_function_call"
|
||||
r := config.Roles[roleFn]
|
||||
if r != "" {
|
||||
role = roleFn
|
||||
}
|
||||
}
|
||||
r := config.Roles[role]
|
||||
contentExists := i.Content != nil && i.StringContent != ""
|
||||
|
||||
fcall := i.FunctionCall
|
||||
if len(i.ToolCalls) > 0 {
|
||||
fcall = i.ToolCalls
|
||||
}
|
||||
|
||||
// First attempt to populate content via a chat message specific template
|
||||
if config.TemplateConfig.ChatMessage != "" {
|
||||
chatMessageData := model.ChatMessageTemplateData{
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
Role: r,
|
||||
RoleName: role,
|
||||
Content: i.StringContent,
|
||||
FunctionCall: fcall,
|
||||
FunctionName: i.Name,
|
||||
LastMessage: messageIndex == (len(input.Messages) - 1),
|
||||
Function: config.Grammar != "" && (messageIndex == (len(input.Messages) - 1)),
|
||||
MessageIndex: messageIndex,
|
||||
}
|
||||
templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Interface("message", chatMessageData).Str("template", config.TemplateConfig.ChatMessage).Msg("error processing message with template, skipping")
|
||||
} else {
|
||||
if templatedChatMessage == "" {
|
||||
log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
|
||||
continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
|
||||
}
|
||||
log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
|
||||
content = templatedChatMessage
|
||||
}
|
||||
}
|
||||
|
||||
marshalAnyRole := func(f any) {
|
||||
j, err := json.Marshal(f)
|
||||
if err == nil {
|
||||
if contentExists {
|
||||
content += "\n" + fmt.Sprint(r, " ", string(j))
|
||||
} else {
|
||||
content = fmt.Sprint(r, " ", string(j))
|
||||
}
|
||||
}
|
||||
}
|
||||
marshalAny := func(f any) {
|
||||
j, err := json.Marshal(f)
|
||||
if err == nil {
|
||||
if contentExists {
|
||||
content += "\n" + string(j)
|
||||
} else {
|
||||
content = string(j)
|
||||
}
|
||||
}
|
||||
}
|
||||
// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
|
||||
if content == "" {
|
||||
if r != "" {
|
||||
if contentExists {
|
||||
content = fmt.Sprint(r, i.StringContent)
|
||||
}
|
||||
|
||||
if i.FunctionCall != nil {
|
||||
marshalAnyRole(i.FunctionCall)
|
||||
}
|
||||
if i.ToolCalls != nil {
|
||||
marshalAnyRole(i.ToolCalls)
|
||||
}
|
||||
} else {
|
||||
if contentExists {
|
||||
content = fmt.Sprint(i.StringContent)
|
||||
}
|
||||
if i.FunctionCall != nil {
|
||||
marshalAny(i.FunctionCall)
|
||||
}
|
||||
if i.ToolCalls != nil {
|
||||
marshalAny(i.ToolCalls)
|
||||
}
|
||||
}
|
||||
// Special Handling: System. We care if it was printed at all, not the r branch, so check seperately
|
||||
if contentExists && role == "system" {
|
||||
suppressConfigSystemPrompt = true
|
||||
}
|
||||
}
|
||||
|
||||
mess = append(mess, content)
|
||||
}
|
||||
|
||||
predInput = strings.Join(mess, "\n")
|
||||
log.Debug().Msgf("Prompt (before templating): %s", predInput)
|
||||
|
||||
templateFile := ""
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
|
||||
templateFile = config.Model
|
||||
}
|
||||
|
||||
if config.TemplateConfig.Chat != "" && !shouldUseFn {
|
||||
templateFile = config.TemplateConfig.Chat
|
||||
}
|
||||
|
||||
if config.TemplateConfig.Functions != "" && shouldUseFn {
|
||||
templateFile = config.TemplateConfig.Functions
|
||||
}
|
||||
|
||||
if templateFile != "" {
|
||||
templatedInput, err := ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
SuppressSystemPrompt: suppressConfigSystemPrompt,
|
||||
Input: predInput,
|
||||
Functions: funcs,
|
||||
})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||||
} else {
|
||||
log.Debug().Msgf("Template failed loading: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Prompt (after templating): %s", predInput)
|
||||
if shouldUseFn && config.Grammar != "" {
|
||||
log.Debug().Msgf("Grammar: %+v", config.Grammar)
|
||||
}
|
||||
}
|
||||
|
||||
switch {
|
||||
case toStream:
|
||||
|
||||
log.Debug().Msgf("Stream request received")
|
||||
c.Context().SetContentType("text/event-stream")
|
||||
//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
|
||||
// c.Set("Content-Type", "text/event-stream")
|
||||
//
|
||||
c.Set("Cache-Control", "no-cache")
|
||||
c.Set("Connection", "keep-alive")
|
||||
c.Set("Transfer-Encoding", "chunked")
|
||||
|
||||
responses := make(chan schema.OpenAIResponse)
|
||||
|
||||
if !shouldUseFn {
|
||||
go process(predInput, input, config, ml, responses)
|
||||
} else {
|
||||
go processTools(noActionName, predInput, input, config, ml, responses)
|
||||
}
|
||||
|
||||
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
|
||||
usage := &schema.OpenAIUsage{}
|
||||
toolsCalled := false
|
||||
for ev := range responses {
|
||||
usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
|
||||
if len(ev.Choices[0].Delta.ToolCalls) > 0 {
|
||||
for ev := range tokenChannel {
|
||||
if ev.Error != nil {
|
||||
log.Debug().Err(ev.Error).Msg("chat streaming responseChannel error")
|
||||
request.Cancel()
|
||||
break
|
||||
}
|
||||
usage = &ev.Value.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
|
||||
|
||||
if len(ev.Value.Choices[0].Delta.ToolCalls) > 0 {
|
||||
toolsCalled = true
|
||||
}
|
||||
var buf bytes.Buffer
|
||||
enc := json.NewEncoder(&buf)
|
||||
enc.Encode(ev)
|
||||
log.Debug().Msgf("Sending chunk: %s", buf.String())
|
||||
if ev.Error != nil {
|
||||
log.Debug().Err(ev.Error).Msg("[ChatEndpoint] error to debug during tokenChannel handler")
|
||||
enc.Encode(ev.Error)
|
||||
} else {
|
||||
enc.Encode(ev.Value)
|
||||
}
|
||||
log.Debug().Msgf("chat streaming sending chunk: %s", buf.String())
|
||||
_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
|
||||
if err != nil {
|
||||
log.Debug().Msgf("Sending chunk failed: %v", err)
|
||||
input.Cancel()
|
||||
log.Debug().Err(err).Msgf("Sending chunk failed")
|
||||
request.Cancel()
|
||||
break
|
||||
}
|
||||
err = w.Flush()
|
||||
if err != nil {
|
||||
log.Debug().Msg("error while flushing, closing connection")
|
||||
request.Cancel()
|
||||
break
|
||||
}
|
||||
w.Flush()
|
||||
}
|
||||
|
||||
finishReason := "stop"
|
||||
if toolsCalled {
|
||||
finishReason = "tool_calls"
|
||||
} else if toolsCalled && len(input.Tools) == 0 {
|
||||
} else if toolsCalled && len(request.Tools) == 0 {
|
||||
finishReason = "function_call"
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
ID: traceID.ID,
|
||||
Created: traceID.Created,
|
||||
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{
|
||||
{
|
||||
FinishReason: finishReason,
|
||||
Index: 0,
|
||||
Delta: &schema.Message{Content: &emptyMessage},
|
||||
Delta: &schema.Message{Content: ""},
|
||||
}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: *usage,
|
||||
@@ -446,146 +105,21 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
|
||||
w.WriteString("data: [DONE]\n\n")
|
||||
w.Flush()
|
||||
}))
|
||||
|
||||
return nil
|
||||
|
||||
// no streaming mode
|
||||
default:
|
||||
result, tokenUsage, err := ComputeChoices(input, predInput, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
|
||||
if !shouldUseFn {
|
||||
// no function is called, just reply and use stop as finish reason
|
||||
*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
|
||||
return
|
||||
}
|
||||
|
||||
results := functions.ParseFunctionCall(s, config.FunctionsConfig)
|
||||
noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0
|
||||
|
||||
switch {
|
||||
case noActionsToRun:
|
||||
result, err := handleQuestion(config, input, ml, startupOptions, results, predInput)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("error handling question")
|
||||
return
|
||||
}
|
||||
*c = append(*c, schema.Choice{
|
||||
Message: &schema.Message{Role: "assistant", Content: &result}})
|
||||
default:
|
||||
toolChoice := schema.Choice{
|
||||
Message: &schema.Message{
|
||||
Role: "assistant",
|
||||
},
|
||||
}
|
||||
|
||||
if len(input.Tools) > 0 {
|
||||
toolChoice.FinishReason = "tool_calls"
|
||||
}
|
||||
|
||||
for _, ss := range results {
|
||||
name, args := ss.Name, ss.Arguments
|
||||
if len(input.Tools) > 0 {
|
||||
// If we are using tools, we condense the function calls into
|
||||
// a single response choice with all the tools
|
||||
toolChoice.Message.ToolCalls = append(toolChoice.Message.ToolCalls,
|
||||
schema.ToolCall{
|
||||
ID: id,
|
||||
Type: "function",
|
||||
FunctionCall: schema.FunctionCall{
|
||||
Name: name,
|
||||
Arguments: args,
|
||||
},
|
||||
},
|
||||
)
|
||||
} else {
|
||||
// otherwise we return more choices directly
|
||||
*c = append(*c, schema.Choice{
|
||||
FinishReason: "function_call",
|
||||
Message: &schema.Message{
|
||||
Role: "assistant",
|
||||
FunctionCall: map[string]interface{}{
|
||||
"name": name,
|
||||
"arguments": args,
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if len(input.Tools) > 0 {
|
||||
// we need to append our result if we are using tools
|
||||
*c = append(*c, toolChoice)
|
||||
}
|
||||
}
|
||||
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "chat.completion",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: tokenUsage.Prompt,
|
||||
CompletionTokens: tokenUsage.Completion,
|
||||
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
|
||||
},
|
||||
}
|
||||
respData, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", respData)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
|
||||
// TODO is this proper to have exclusive from Stream, or do we need to issue both responses?
|
||||
rawResponse := <-finalResultChannel
|
||||
|
||||
if rawResponse.Error != nil {
|
||||
return rawResponse.Error
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(rawResponse.Value)
|
||||
log.Debug().Str("jsonResult", string(jsonResult)).Msg("Chat Final Response")
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(rawResponse.Value)
|
||||
}
|
||||
}
|
||||
|
||||
func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, funcResults []functions.FuncCallResults, prompt string) (string, error) {
	log.Debug().Msgf("nothing to do, computing a reply")
	arg := ""
	if len(funcResults) > 0 {
		arg = funcResults[0].Arguments
	}
	// If there is a message that the LLM already sends as part of the JSON reply, use it
	arguments := map[string]interface{}{}
	if err := json.Unmarshal([]byte(arg), &arguments); err != nil {
		log.Debug().Msg("handleQuestion: function result did not contain a valid JSON object")
	}
	m, exists := arguments["message"]
	if exists {
		switch message := m.(type) {
		case string:
			if message != "" {
				log.Debug().Msgf("Reply received from LLM: %s", message)
				message = backend.Finetune(*config, prompt, message)
				log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)

				return message, nil
			}
		}
	}

	log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
	// Otherwise ask the LLM to understand the JSON output and the context, and return a message
	// Note: This costs (in term of CPU/GPU) another computation
	config.Grammar = ""
	images := []string{}
	for _, m := range input.Messages {
		images = append(images, m.StringImages...)
	}

	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
	if err != nil {
		log.Error().Err(err).Msg("model inference failed")
		return "", err
	}

	prediction, err := predFunc()
	if err != nil {
		log.Error().Err(err).Msg("prediction failed")
		return "", err
	}
	return backend.Finetune(*config, prompt, prediction.Response), nil
}

@@ -4,18 +4,13 @@ import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/go-skynet/LocalAI/core/services"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/functions"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/google/uuid"
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/valyala/fasthttp"
|
||||
)
|
||||
@@ -25,116 +20,50 @@ import (
|
||||
// @Param request body schema.OpenAIRequest true "query params"
|
||||
// @Success 200 {object} schema.OpenAIResponse "Response"
|
||||
// @Router /v1/completions [post]
|
||||
func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
id := uuid.New().String()
|
||||
created := int(time.Now().Unix())
|
||||
|
||||
process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||||
ComputeChoices(req, s, config, appConfig, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||
resp := schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{
|
||||
{
|
||||
Index: 0,
|
||||
Text: s,
|
||||
},
|
||||
},
|
||||
Object: "text_completion",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: usage.Prompt,
|
||||
CompletionTokens: usage.Completion,
|
||||
TotalTokens: usage.Prompt + usage.Completion,
|
||||
},
|
||||
}
|
||||
log.Debug().Msgf("Sending goroutine: %s", s)
|
||||
|
||||
responses <- resp
|
||||
return true
|
||||
})
|
||||
close(responses)
|
||||
}
|
||||
|
||||
func CompletionEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.OpenAIService) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
modelFile, input, err := readRequest(c, ml, appConfig, true)
|
||||
_, request, err := fce.OpenAIRequestFromContext(c, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("`input`: %+v", input)
|
||||
log.Debug().Msgf("`OpenAIRequest`: %+v", request)
|
||||
|
||||
config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
|
||||
traceID, finalResultChannel, _, _, tokenChannel, err := oais.Completion(request, false, request.Stream)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
return err
|
||||
}
|
||||
|
||||
if input.ResponseFormat.Type == "json_object" {
|
||||
input.Grammar = functions.JSONBNF
|
||||
}
|
||||
if request.Stream {
|
||||
log.Debug().Msgf("Completion Stream request received")
|
||||
|
||||
config.Grammar = input.Grammar
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
if input.Stream {
|
||||
log.Debug().Msgf("Stream request received")
|
||||
c.Context().SetContentType("text/event-stream")
|
||||
//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
|
||||
//c.Set("Content-Type", "text/event-stream")
|
||||
c.Set("Cache-Control", "no-cache")
|
||||
c.Set("Connection", "keep-alive")
|
||||
c.Set("Transfer-Encoding", "chunked")
|
||||
}
|
||||
|
||||
templateFile := ""
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
|
||||
templateFile = config.Model
|
||||
}
|
||||
|
||||
if config.TemplateConfig.Completion != "" {
|
||||
templateFile = config.TemplateConfig.Completion
|
||||
}
|
||||
|
||||
if input.Stream {
|
||||
if len(config.PromptStrings) > 1 {
|
||||
return errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
|
||||
}
|
||||
|
||||
predInput := config.PromptStrings[0]
|
||||
|
||||
if templateFile != "" {
|
||||
templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
Input: predInput,
|
||||
})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||||
}
|
||||
}
|
||||
|
||||
responses := make(chan schema.OpenAIResponse)
|
||||
|
||||
go process(predInput, input, config, ml, responses)
|
||||
|
||||
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
|
||||
|
||||
for ev := range responses {
|
||||
for ev := range tokenChannel {
|
||||
var buf bytes.Buffer
|
||||
enc := json.NewEncoder(&buf)
|
||||
enc.Encode(ev)
|
||||
if ev.Error != nil {
|
||||
log.Debug().Msgf("[CompletionEndpoint] error to debug during tokenChannel handler: %q", ev.Error)
|
||||
enc.Encode(ev.Error)
|
||||
} else {
|
||||
enc.Encode(ev.Value)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Sending chunk: %s", buf.String())
|
||||
log.Debug().Msgf("completion streaming sending chunk: %s", buf.String())
|
||||
fmt.Fprintf(w, "data: %v\n", buf.String())
|
||||
w.Flush()
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
ID: traceID.ID,
|
||||
Created: traceID.Created,
|
||||
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{
|
||||
{
|
||||
Index: 0,
|
||||
@@ -151,55 +80,15 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
||||
}))
|
||||
return nil
|
||||
}
|
||||
|
||||
var result []schema.Choice
|
||||
|
||||
totalTokenUsage := backend.TokenUsage{}
|
||||
|
||||
for k, i := range config.PromptStrings {
|
||||
if templateFile != "" {
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
Input: i,
|
||||
})
|
||||
if err == nil {
|
||||
i = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||||
}
|
||||
}
|
||||
|
||||
r, tokenUsage, err := ComputeChoices(
|
||||
input, i, config, appConfig, ml, func(s string, c *[]schema.Choice) {
|
||||
*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
totalTokenUsage.Prompt += tokenUsage.Prompt
|
||||
totalTokenUsage.Completion += tokenUsage.Completion
|
||||
|
||||
result = append(result, r...)
|
||||
// TODO is this proper to have exclusive from Stream, or do we need to issue both responses?
|
||||
rawResponse := <-finalResultChannel
|
||||
if rawResponse.Error != nil {
|
||||
return rawResponse.Error
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "text_completion",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: totalTokenUsage.Prompt,
|
||||
CompletionTokens: totalTokenUsage.Completion,
|
||||
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
jsonResult, _ := json.Marshal(rawResponse.Value)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
return c.JSON(rawResponse.Value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,92 +3,36 @@ package openai
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/go-skynet/LocalAI/core/services"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
func EditEndpoint(fce *fiberContext.FiberContextExtractor, oais *services.OpenAIService) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
modelFile, input, err := readRequest(c, ml, appConfig, true)
|
||||
_, request, err := fce.OpenAIRequestFromContext(c, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
|
||||
_, finalResultChannel, _, _, _, err := oais.Edit(request, false, request.Stream)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
return err
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
templateFile := ""
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
|
||||
templateFile = config.Model
|
||||
rawResponse := <-finalResultChannel
|
||||
if rawResponse.Error != nil {
|
||||
return rawResponse.Error
|
||||
}
|
||||
|
||||
if config.TemplateConfig.Edit != "" {
|
||||
templateFile = config.TemplateConfig.Edit
|
||||
}
|
||||
|
||||
var result []schema.Choice
|
||||
totalTokenUsage := backend.TokenUsage{}
|
||||
|
||||
for _, i := range config.InputStrings {
|
||||
if templateFile != "" {
|
||||
templatedInput, err := ml.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
Input: i,
|
||||
Instruction: input.Instruction,
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
})
|
||||
if err == nil {
|
||||
i = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||||
}
|
||||
}
|
||||
|
||||
r, tokenUsage, err := ComputeChoices(input, i, config, appConfig, ml, func(s string, c *[]schema.Choice) {
|
||||
*c = append(*c, schema.Choice{Text: s})
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
totalTokenUsage.Prompt += tokenUsage.Prompt
|
||||
totalTokenUsage.Completion += tokenUsage.Completion
|
||||
|
||||
result = append(result, r...)
|
||||
}
|
||||
|
||||
id := uuid.New().String()
|
||||
created := int(time.Now().Unix())
|
||||
resp := &schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "edit",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: totalTokenUsage.Prompt,
|
||||
CompletionTokens: totalTokenUsage.Completion,
|
||||
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
jsonResult, _ := json.Marshal(rawResponse.Value)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
return c.JSON(rawResponse.Value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,14 +3,9 @@ package openai
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/google/uuid"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
@@ -21,63 +16,25 @@ import (
|
||||
// @Param request body schema.OpenAIRequest true "query params"
|
||||
// @Success 200 {object} schema.OpenAIResponse "Response"
|
||||
// @Router /v1/embeddings [post]
|
||||
func EmbeddingsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
func EmbeddingsEndpoint(fce *fiberContext.FiberContextExtractor, ebs *backend.EmbeddingsBackendService) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
model, input, err := readRequest(c, ml, appConfig, true)
|
||||
_, input, err := fce.OpenAIRequestFromContext(c, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := mergeRequestWithConfig(model, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
responseChannel := ebs.Embeddings(input)
|
||||
|
||||
rawResponse := <-responseChannel
|
||||
|
||||
if rawResponse.Error != nil {
|
||||
return rawResponse.Error
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
items := []schema.Item{}
|
||||
|
||||
for i, s := range config.InputToken {
|
||||
// get the model function to call for the result
|
||||
embedFn, err := backend.ModelEmbedding("", s, ml, *config, appConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
embeddings, err := embedFn()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
|
||||
}
|
||||
|
||||
for i, s := range config.InputStrings {
|
||||
// get the model function to call for the result
|
||||
embedFn, err := backend.ModelEmbedding(s, []int{}, ml, *config, appConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
embeddings, err := embedFn()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
|
||||
}
|
||||
|
||||
id := uuid.New().String()
|
||||
created := int(time.Now().Unix())
|
||||
resp := &schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Data: items,
|
||||
Object: "list",
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
jsonResult, _ := json.Marshal(rawResponse.Value)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
return c.JSON(rawResponse.Value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,7 +251,7 @@ func newMultipartFile(filePath, tag, purpose string) (*strings.Reader, *multipar
|
||||
|
||||
// Helper to create test files
|
||||
func createTestFile(t *testing.T, name string, sizeMB int, option *config.ApplicationConfig) *os.File {
|
||||
err := os.MkdirAll(option.UploadDir, 0750)
|
||||
err := os.MkdirAll(option.UploadDir, 0755)
|
||||
if err != nil {
|
||||
|
||||
t.Fatalf("Error MKDIR: %v", err)
|
||||
|
||||
@@ -1,50 +1,18 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/google/uuid"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func downloadFile(url string) (string, error) {
|
||||
// Get the data
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Create the file
|
||||
out, err := os.CreateTemp("", "image")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
// Write the body to file
|
||||
_, err = io.Copy(out, resp.Body)
|
||||
return out.Name(), err
|
||||
}
|
||||
|
||||
//
|
||||
// https://platform.openai.com/docs/api-reference/images/create
|
||||
|
||||
/*
|
||||
*
|
||||
@@ -59,186 +27,36 @@ func downloadFile(url string) (string, error) {
|
||||
|
||||
*
|
||||
*/
|
||||
|
||||
// ImageEndpoint is the OpenAI Image generation API endpoint https://platform.openai.com/docs/api-reference/images/create
|
||||
// @Summary Creates an image given a prompt.
|
||||
// @Param request body schema.OpenAIRequest true "query params"
|
||||
// @Success 200 {object} schema.OpenAIResponse "Response"
|
||||
// @Router /v1/images/generations [post]
|
||||
func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
func ImageEndpoint(fce *fiberContext.FiberContextExtractor, igbs *backend.ImageGenerationBackendService) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
m, input, err := readRequest(c, ml, appConfig, false)
|
||||
// TODO: Somewhat a hack. Is there a better place to assign this?
|
||||
if igbs.BaseUrlForGeneratedImages == "" {
|
||||
igbs.BaseUrlForGeneratedImages = c.BaseURL() + "/generated-images/"
|
||||
}
|
||||
_, request, err := fce.OpenAIRequestFromContext(c, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
if m == "" {
|
||||
m = model.StableDiffusionBackend
|
||||
}
|
||||
log.Debug().Msgf("Loading model: %+v", m)
|
||||
responseChannel := igbs.GenerateImage(request)
|
||||
rawResponse := <-responseChannel
|
||||
|
||||
config, input, err := mergeRequestWithConfig(m, input, cl, ml, appConfig.Debug, 0, 0, false)
|
||||
if rawResponse.Error != nil {
|
||||
return rawResponse.Error
|
||||
}
|
||||
|
||||
jsonResult, err := json.Marshal(rawResponse.Value)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
return err
|
||||
}
|
||||
|
||||
src := ""
|
||||
if input.File != "" {
|
||||
|
||||
fileData := []byte{}
|
||||
// check if input.File is an URL, if so download it and save it
|
||||
// to a temporary file
|
||||
if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") {
|
||||
out, err := downloadFile(input.File)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed downloading file:%w", err)
|
||||
}
|
||||
defer os.RemoveAll(out)
|
||||
|
||||
fileData, err = os.ReadFile(out)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading file:%w", err)
|
||||
}
|
||||
|
||||
} else {
|
||||
// base 64 decode the file and write it somewhere
|
||||
// that we will cleanup
|
||||
fileData, err = base64.StdEncoding.DecodeString(input.File)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(appConfig.ImageDir, "b64")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write the base64 result
|
||||
writer := bufio.NewWriter(outputFile)
|
||||
_, err = writer.Write(fileData)
|
||||
if err != nil {
|
||||
outputFile.Close()
|
||||
return err
|
||||
}
|
||||
outputFile.Close()
|
||||
src = outputFile.Name()
|
||||
defer os.RemoveAll(src)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
switch config.Backend {
|
||||
case "stablediffusion":
|
||||
config.Backend = model.StableDiffusionBackend
|
||||
case "tinydream":
|
||||
config.Backend = model.TinyDreamBackend
|
||||
case "":
|
||||
config.Backend = model.StableDiffusionBackend
|
||||
}
|
||||
|
||||
sizeParts := strings.Split(input.Size, "x")
|
||||
if len(sizeParts) != 2 {
|
||||
return fmt.Errorf("invalid value for 'size'")
|
||||
}
|
||||
width, err := strconv.Atoi(sizeParts[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid value for 'size'")
|
||||
}
|
||||
height, err := strconv.Atoi(sizeParts[1])
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid value for 'size'")
|
||||
}
|
||||
|
||||
b64JSON := false
|
||||
if input.ResponseFormat.Type == "b64_json" {
|
||||
b64JSON = true
|
||||
}
|
||||
// src and clip_skip
|
||||
var result []schema.Item
|
||||
for _, i := range config.PromptStrings {
|
||||
n := input.N
|
||||
if input.N == 0 {
|
||||
n = 1
|
||||
}
|
||||
for j := 0; j < n; j++ {
|
||||
prompts := strings.Split(i, "|")
|
||||
positive_prompt := prompts[0]
|
||||
negative_prompt := ""
|
||||
if len(prompts) > 1 {
|
||||
negative_prompt = prompts[1]
|
||||
}
|
||||
|
||||
mode := 0
|
||||
step := config.Step
|
||||
if step == 0 {
|
||||
step = 15
|
||||
}
|
||||
|
||||
if input.Mode != 0 {
|
||||
mode = input.Mode
|
||||
}
|
||||
|
||||
if input.Step != 0 {
|
||||
step = input.Step
|
||||
}
|
||||
|
||||
tempDir := ""
|
||||
if !b64JSON {
|
||||
tempDir = appConfig.ImageDir
|
||||
}
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(tempDir, "b64")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
outputFile.Close()
|
||||
output := outputFile.Name() + ".png"
|
||||
// Rename the temporary file
|
||||
err = os.Rename(outputFile.Name(), output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
baseURL := c.BaseURL()
|
||||
|
||||
fn, err := backend.ImageGeneration(height, width, mode, step, *config.Seed, positive_prompt, negative_prompt, src, output, ml, *config, appConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := fn(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
item := &schema.Item{}
|
||||
|
||||
if b64JSON {
|
||||
defer os.RemoveAll(output)
|
||||
data, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
item.B64JSON = base64.StdEncoding.EncodeToString(data)
|
||||
} else {
|
||||
base := filepath.Base(output)
|
||||
item.URL = baseURL + "/generated-images/" + base
|
||||
}
|
||||
|
||||
result = append(result, *item)
|
||||
}
|
||||
}
|
||||
|
||||
id := uuid.New().String()
|
||||
created := int(time.Now().Unix())
|
||||
resp := &schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Data: result,
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
return c.JSON(rawResponse.Value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
package openai

import (
	"github.com/go-skynet/LocalAI/core/backend"
	"github.com/go-skynet/LocalAI/core/config"

	"github.com/go-skynet/LocalAI/core/schema"
	model "github.com/go-skynet/LocalAI/pkg/model"
)

func ComputeChoices(
	req *schema.OpenAIRequest,
	predInput string,
	config *config.BackendConfig,
	o *config.ApplicationConfig,
	loader *model.ModelLoader,
	cb func(string, *[]schema.Choice),
	tokenCallback func(string, backend.TokenUsage) bool) ([]schema.Choice, backend.TokenUsage, error) {
	n := req.N // number of completions to return
	result := []schema.Choice{}

	if n == 0 {
		n = 1
	}

	images := []string{}
	for _, m := range req.Messages {
		images = append(images, m.StringImages...)
	}

	// get the model function to call for the result
	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
	if err != nil {
		return result, backend.TokenUsage{}, err
	}

	tokenUsage := backend.TokenUsage{}

	for i := 0; i < n; i++ {
		prediction, err := predFunc()
		if err != nil {
			return result, backend.TokenUsage{}, err
		}

		tokenUsage.Prompt += prediction.Usage.Prompt
		tokenUsage.Completion += prediction.Usage.Completion

		finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
		cb(finetunedResponse, &result)

		//result = append(result, Choice{Text: prediction})

	}
	return result, tokenUsage, err
}
@@ -1,61 +1,21 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/core/services"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
)
|
||||
|
||||
func ListModelsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader) func(ctx *fiber.Ctx) error {
|
||||
func ListModelsEndpoint(lms *services.ListModelsService) func(ctx *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
models, err := ml.ListModels()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var mm map[string]interface{} = map[string]interface{}{}
|
||||
|
||||
dataModels := []schema.OpenAIModel{}
|
||||
|
||||
var filterFn func(name string) bool
|
||||
// If blank, no filter is applied.
|
||||
filter := c.Query("filter")
|
||||
|
||||
// If filter is not specified, do not filter the list by model name
|
||||
if filter == "" {
|
||||
filterFn = func(_ string) bool { return true }
|
||||
} else {
|
||||
// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
|
||||
rxp, err := regexp.Compile(filter)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
filterFn = func(name string) bool {
|
||||
return rxp.MatchString(name)
|
||||
}
|
||||
}
|
||||
|
||||
// By default, exclude any loose files that are already referenced by a configuration file.
|
||||
excludeConfigured := c.QueryBool("excludeConfigured", true)
|
||||
|
||||
// Start with the known configurations
|
||||
for _, c := range cl.GetAllBackendConfigs() {
|
||||
if excludeConfigured {
|
||||
mm[c.Model] = nil
|
||||
}
|
||||
|
||||
if filterFn(c.Name) {
|
||||
dataModels = append(dataModels, schema.OpenAIModel{ID: c.Name, Object: "model"})
|
||||
}
|
||||
}
|
||||
|
||||
// Then iterate through the loose files:
|
||||
for _, m := range models {
|
||||
// And only adds them if they shouldn't be skipped.
|
||||
if _, exists := mm[m]; !exists && filterFn(m) {
|
||||
dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
|
||||
}
|
||||
dataModels, err := lms.ListModels(filter, excludeConfigured)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return c.JSON(struct {
|
||||
|
||||
@@ -1,285 +0,0 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/functions"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func readRequest(c *fiber.Ctx, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
	input := new(schema.OpenAIRequest)

	// Get input data from the request body
	if err := c.BodyParser(input); err != nil {
		return "", nil, fmt.Errorf("failed parsing request body: %w", err)
	}

	received, _ := json.Marshal(input)

	ctx, cancel := context.WithCancel(o.Context)
	input.Context = ctx
	input.Cancel = cancel

	log.Debug().Msgf("Request received: %s", string(received))

	modelFile, err := fiberContext.ModelFromContext(c, ml, input.Model, firstModel)

	return modelFile, input, err
}

// this function check if the string is an URL, if it's an URL downloads the image in memory
// encodes it in base64 and returns the base64 string
func getBase64Image(s string) (string, error) {
	if strings.HasPrefix(s, "http") {
		// download the image
		resp, err := http.Get(s)
		if err != nil {
			return "", err
		}
		defer resp.Body.Close()

		// read the image data into memory
		data, err := io.ReadAll(resp.Body)
		if err != nil {
			return "", err
		}

		// encode the image data in base64
		encoded := base64.StdEncoding.EncodeToString(data)

		// return the base64 string
		return encoded, nil
	}

	// if the string instead is prefixed with "data:image/jpeg;base64,", drop it
	if strings.HasPrefix(s, "data:image/jpeg;base64,") {
		return strings.ReplaceAll(s, "data:image/jpeg;base64,", ""), nil
	}
	return "", fmt.Errorf("not valid string")
}

func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIRequest) {
|
||||
if input.Echo {
|
||||
config.Echo = input.Echo
|
||||
}
|
||||
if input.TopK != nil {
|
||||
config.TopK = input.TopK
|
||||
}
|
||||
if input.TopP != nil {
|
||||
config.TopP = input.TopP
|
||||
}
|
||||
|
||||
if input.Backend != "" {
|
||||
config.Backend = input.Backend
|
||||
}
|
||||
|
||||
if input.ClipSkip != 0 {
|
||||
config.Diffusers.ClipSkip = input.ClipSkip
|
||||
}
|
||||
|
||||
if input.ModelBaseName != "" {
|
||||
config.AutoGPTQ.ModelBaseName = input.ModelBaseName
|
||||
}
|
||||
|
||||
if input.NegativePromptScale != 0 {
|
||||
config.NegativePromptScale = input.NegativePromptScale
|
||||
}
|
||||
|
||||
if input.UseFastTokenizer {
|
||||
config.UseFastTokenizer = input.UseFastTokenizer
|
||||
}
|
||||
|
||||
if input.NegativePrompt != "" {
|
||||
config.NegativePrompt = input.NegativePrompt
|
||||
}
|
||||
|
||||
if input.RopeFreqBase != 0 {
|
||||
config.RopeFreqBase = input.RopeFreqBase
|
||||
}
|
||||
|
||||
if input.RopeFreqScale != 0 {
|
||||
config.RopeFreqScale = input.RopeFreqScale
|
||||
}
|
||||
|
||||
if input.Grammar != "" {
|
||||
config.Grammar = input.Grammar
|
||||
}
|
||||
|
||||
if input.Temperature != nil {
|
||||
config.Temperature = input.Temperature
|
||||
}
|
||||
|
||||
if input.Maxtokens != nil {
|
||||
config.Maxtokens = input.Maxtokens
|
||||
}
|
||||
|
||||
switch stop := input.Stop.(type) {
|
||||
case string:
|
||||
if stop != "" {
|
||||
config.StopWords = append(config.StopWords, stop)
|
||||
}
|
||||
case []interface{}:
|
||||
for _, pp := range stop {
|
||||
if s, ok := pp.(string); ok {
|
||||
config.StopWords = append(config.StopWords, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(input.Tools) > 0 {
|
||||
for _, tool := range input.Tools {
|
||||
input.Functions = append(input.Functions, tool.Function)
|
||||
}
|
||||
}
|
||||
|
||||
if input.ToolsChoice != nil {
|
||||
var toolChoice functions.Tool
|
||||
|
||||
switch content := input.ToolsChoice.(type) {
|
||||
case string:
|
||||
_ = json.Unmarshal([]byte(content), &toolChoice)
|
||||
case map[string]interface{}:
|
||||
dat, _ := json.Marshal(content)
|
||||
_ = json.Unmarshal(dat, &toolChoice)
|
||||
}
|
||||
input.FunctionCall = map[string]interface{}{
|
||||
"name": toolChoice.Function.Name,
|
||||
}
|
||||
}
|
||||
|
||||
// Decode each request's message content
|
||||
index := 0
|
||||
for i, m := range input.Messages {
|
||||
switch content := m.Content.(type) {
|
||||
case string:
|
||||
input.Messages[i].StringContent = content
|
||||
case []interface{}:
|
||||
dat, _ := json.Marshal(content)
|
||||
c := []schema.Content{}
|
||||
json.Unmarshal(dat, &c)
|
||||
for _, pp := range c {
|
||||
if pp.Type == "text" {
|
||||
input.Messages[i].StringContent = pp.Text
|
||||
} else if pp.Type == "image_url" {
|
||||
// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
|
||||
base64, err := getBase64Image(pp.ImageURL.URL)
|
||||
if err == nil {
|
||||
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
|
||||
// set a placeholder for each image
|
||||
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
|
||||
index++
|
||||
} else {
|
||||
fmt.Print("Failed encoding image", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if input.RepeatPenalty != 0 {
|
||||
config.RepeatPenalty = input.RepeatPenalty
|
||||
}
|
||||
|
||||
if input.FrequencyPenalty != 0 {
|
||||
config.FrequencyPenalty = input.FrequencyPenalty
|
||||
}
|
||||
|
||||
if input.PresencePenalty != 0 {
|
||||
config.PresencePenalty = input.PresencePenalty
|
||||
}
|
||||
|
||||
if input.Keep != 0 {
|
||||
config.Keep = input.Keep
|
||||
}
|
||||
|
||||
if input.Batch != 0 {
|
||||
config.Batch = input.Batch
|
||||
}
|
||||
|
||||
if input.IgnoreEOS {
|
||||
config.IgnoreEOS = input.IgnoreEOS
|
||||
}
|
||||
|
||||
if input.Seed != nil {
|
||||
config.Seed = input.Seed
|
||||
}
|
||||
|
||||
if input.TypicalP != nil {
|
||||
config.TypicalP = input.TypicalP
|
||||
}
|
||||
|
||||
switch inputs := input.Input.(type) {
|
||||
case string:
|
||||
if inputs != "" {
|
||||
config.InputStrings = append(config.InputStrings, inputs)
|
||||
}
|
||||
case []interface{}:
|
||||
for _, pp := range inputs {
|
||||
switch i := pp.(type) {
|
||||
case string:
|
||||
config.InputStrings = append(config.InputStrings, i)
|
||||
case []interface{}:
|
||||
tokens := []int{}
|
||||
for _, ii := range i {
|
||||
tokens = append(tokens, int(ii.(float64)))
|
||||
}
|
||||
config.InputToken = append(config.InputToken, tokens)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Can be either a string or an object
|
||||
switch fnc := input.FunctionCall.(type) {
|
||||
case string:
|
||||
if fnc != "" {
|
||||
config.SetFunctionCallString(fnc)
|
||||
}
|
||||
case map[string]interface{}:
|
||||
var name string
|
||||
n, exists := fnc["name"]
|
||||
if exists {
|
||||
nn, e := n.(string)
|
||||
if e {
|
||||
name = nn
|
||||
}
|
||||
}
|
||||
config.SetFunctionCallNameString(name)
|
||||
}
|
||||
|
||||
switch p := input.Prompt.(type) {
|
||||
case string:
|
||||
config.PromptStrings = append(config.PromptStrings, p)
|
||||
case []interface{}:
|
||||
for _, pp := range p {
|
||||
if s, ok := pp.(string); ok {
|
||||
config.PromptStrings = append(config.PromptStrings, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func mergeRequestWithConfig(modelFile string, input *schema.OpenAIRequest, cm *config.BackendConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.BackendConfig, *schema.OpenAIRequest, error) {
|
||||
cfg, err := cm.LoadBackendConfigFileByName(modelFile, loader.ModelPath,
|
||||
config.LoadOptionDebug(debug),
|
||||
config.LoadOptionThreads(threads),
|
||||
config.LoadOptionContextSize(ctx),
|
||||
config.LoadOptionF16(f16),
|
||||
)
|
||||
|
||||
// Set the parameters for the language model prediction
|
||||
updateRequestConfig(cfg, input)
|
||||
|
||||
return cfg, input, err
|
||||
}
|
||||
@@ -9,8 +9,7 @@ import (
|
||||
"path/filepath"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
@@ -23,17 +22,15 @@ import (
|
||||
// @Param file formData file true "file"
|
||||
// @Success 200 {object} map[string]string "Response"
|
||||
// @Router /v1/audio/transcriptions [post]
|
||||
func TranscriptEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
func TranscriptEndpoint(fce *fiberContext.FiberContextExtractor, tbs *backend.TranscriptionBackendService) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
m, input, err := readRequest(c, ml, appConfig, false)
|
||||
_, request, err := fce.OpenAIRequestFromContext(c, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := mergeRequestWithConfig(m, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
// TODO: Investigate this file copy stuff later - potentially belongs in service.
|
||||
|
||||
// retrieve the file data from the request
|
||||
file, err := c.FormFile("file")
|
||||
if err != nil {
|
||||
@@ -65,13 +62,16 @@ func TranscriptEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
||||
|
||||
log.Debug().Msgf("Audio file copied to: %+v", dst)
|
||||
|
||||
tr, err := backend.ModelTranscription(dst, input.Language, ml, *config, appConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
request.File = dst
|
||||
|
||||
log.Debug().Msgf("Trascribed: %+v", tr)
|
||||
responseChannel := tbs.Transcribe(request)
|
||||
rawResponse := <-responseChannel
|
||||
|
||||
if rawResponse.Error != nil {
|
||||
return rawResponse.Error
|
||||
}
|
||||
log.Debug().Msgf("Transcribed: %+v", rawResponse.Value)
|
||||
// TODO: handle different outputs here
|
||||
return c.Status(http.StatusOK).JSON(tr)
|
||||
return c.Status(http.StatusOK).JSON(rawResponse.Value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,10 +7,12 @@ import (
|
||||
"net/http"
|
||||
|
||||
"github.com/Masterminds/sprig/v3"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/internal"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
fiberhtml "github.com/gofiber/template/html/v2"
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"github.com/russross/blackfriday"
|
||||
)
|
||||
|
||||
@@ -31,6 +33,40 @@ func notFoundHandler(c *fiber.Ctx) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func welcomeRoute(
	app *fiber.App,
	cl *config.BackendConfigLoader,
	ml *model.ModelLoader,
	appConfig *config.ApplicationConfig,
	auth func(*fiber.Ctx) error,
) {
	if appConfig.DisableWelcomePage {
		return
	}

	models, _ := ml.ListModels()
	backendConfigs := cl.GetAllBackendConfigs()

	app.Get("/", auth, func(c *fiber.Ctx) error {
		summary := fiber.Map{
			"Title":             "LocalAI API - " + internal.PrintableVersion(),
			"Version":           internal.PrintableVersion(),
			"Models":            models,
			"ModelsConfig":      backendConfigs,
			"ApplicationConfig": appConfig,
		}

		if string(c.Context().Request.Header.ContentType()) == "application/json" || len(c.Accepts("html")) == 0 {
			// The client expects a JSON response
			return c.Status(fiber.StatusOK).JSON(summary)
		} else {
			// Render index
			return c.Render("views/index", summary)
		}
	})

}

func renderEngine() *fiberhtml.Engine {
|
||||
engine := fiberhtml.NewFileSystem(http.FS(viewsfs), ".html")
|
||||
engine.AddFuncMap(sprig.FuncMap())
|
||||
@@ -40,5 +76,5 @@ func renderEngine() *fiberhtml.Engine {
|
||||
|
||||
func markDowner(args ...interface{}) template.HTML {
|
||||
s := blackfriday.MarkdownCommon([]byte(fmt.Sprintf("%s", args...)))
|
||||
return template.HTML(bluemonday.UGCPolicy().Sanitize(string(s)))
|
||||
return template.HTML(s)
|
||||
}
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
package routes

import (
	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/http/endpoints/elevenlabs"
	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/gofiber/fiber/v2"
)

func RegisterElevenLabsRoutes(app *fiber.App,
	cl *config.BackendConfigLoader,
	ml *model.ModelLoader,
	appConfig *config.ApplicationConfig,
	auth func(*fiber.Ctx) error) {

	// Elevenlabs
	app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))

}
@@ -1,19 +0,0 @@
|
||||
package routes

import (
	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/http/endpoints/jina"

	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/gofiber/fiber/v2"
)

func RegisterJINARoutes(app *fiber.App,
	cl *config.BackendConfigLoader,
	ml *model.ModelLoader,
	appConfig *config.ApplicationConfig,
	auth func(*fiber.Ctx) error) {

	// POST endpoint to mimic the reranking
	app.Post("/v1/rerank", jina.JINARerankEndpoint(cl, ml, appConfig))
}
@@ -1,63 +0,0 @@
|
||||
package routes

import (
	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
	"github.com/go-skynet/LocalAI/core/services"
	"github.com/go-skynet/LocalAI/internal"
	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/gofiber/fiber/v2"
	"github.com/gofiber/swagger"
)

func RegisterLocalAIRoutes(app *fiber.App,
	cl *config.BackendConfigLoader,
	ml *model.ModelLoader,
	appConfig *config.ApplicationConfig,
	galleryService *services.GalleryService,
	auth func(*fiber.Ctx) error) {

	app.Get("/swagger/*", swagger.HandlerDefault) // default

	// LocalAI API endpoints

	modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
	app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
	app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
	app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
	app.Post("/models/galleries", auth, modelGalleryEndpointService.AddModelGalleryEndpoint())
	app.Delete("/models/galleries", auth, modelGalleryEndpointService.RemoveModelGalleryEndpoint())
	app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
	app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())

	app.Post("/tts", auth, localai.TTSEndpoint(cl, ml, appConfig))

	// Stores
	sl := model.NewModelLoader("")
	app.Post("/stores/set", auth, localai.StoresSetEndpoint(sl, appConfig))
	app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(sl, appConfig))
	app.Post("/stores/get", auth, localai.StoresGetEndpoint(sl, appConfig))
	app.Post("/stores/find", auth, localai.StoresFindEndpoint(sl, appConfig))

	// Kubernetes health checks
	ok := func(c *fiber.Ctx) error {
		return c.SendStatus(200)
	}

	app.Get("/healthz", ok)
	app.Get("/readyz", ok)

	app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())

	// Experimental Backend Statistics Module
	backendMonitor := services.NewBackendMonitor(cl, ml, appConfig) // Split out for now
	app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(backendMonitor))
	app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(backendMonitor))

	app.Get("/version", auth, func(c *fiber.Ctx) error {
		return c.JSON(struct {
			Version string `json:"version"`
		}{Version: internal.PrintableVersion()})
	})

}
@@ -1,86 +0,0 @@
|
||||
package routes
|
||||
|
||||
import (
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
|
||||
"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
)
|
||||
|
||||
func RegisterOpenAIRoutes(app *fiber.App,
|
||||
cl *config.BackendConfigLoader,
|
||||
ml *model.ModelLoader,
|
||||
appConfig *config.ApplicationConfig,
|
||||
auth func(*fiber.Ctx) error) {
|
||||
// openAI compatible API endpoint
|
||||
|
||||
// chat
|
||||
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
|
||||
app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
|
||||
|
||||
// edit
|
||||
app.Post("/v1/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
|
||||
app.Post("/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
|
||||
|
||||
// assistant
|
||||
app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
|
||||
app.Get("/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Post("/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
|
||||
app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
|
||||
app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
|
||||
app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
|
||||
|
||||
// files
|
||||
app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
|
||||
app.Post("/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
|
||||
app.Get("/v1/files", auth, openai.ListFilesEndpoint(cl, appConfig))
|
||||
app.Get("/files", auth, openai.ListFilesEndpoint(cl, appConfig))
|
||||
app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
|
||||
app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
|
||||
app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
|
||||
app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
|
||||
app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
|
||||
app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
|
||||
|
||||
// completion
|
||||
app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
|
||||
app.Post("/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
|
||||
|
||||
// embeddings
|
||||
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
|
||||
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
|
||||
|
||||
// audio
|
||||
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, ml, appConfig))
|
||||
app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(cl, ml, appConfig))
|
||||
|
||||
// images
|
||||
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, ml, appConfig))
|
||||
|
||||
if appConfig.ImageDir != "" {
|
||||
app.Static("/generated-images", appConfig.ImageDir)
|
||||
}
|
||||
|
||||
if appConfig.AudioDir != "" {
|
||||
app.Static("/generated-audio", appConfig.AudioDir)
|
||||
}
|
||||
|
||||
// models
|
||||
app.Get("/v1/models", auth, openai.ListModelsEndpoint(cl, ml))
|
||||
app.Get("/models", auth, openai.ListModelsEndpoint(cl, ml))
|
||||
}
|
||||
@@ -1,111 +0,0 @@
|
||||
package routes
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html/template"
|
||||
"strings"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/http/elements"
|
||||
"github.com/go-skynet/LocalAI/core/services"
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
func RegisterUIRoutes(app *fiber.App,
|
||||
cl *config.BackendConfigLoader,
|
||||
ml *model.ModelLoader,
|
||||
appConfig *config.ApplicationConfig,
|
||||
galleryService *services.GalleryService,
|
||||
auth func(*fiber.Ctx) error) {
|
||||
|
||||
// Show the Models page
|
||||
app.Get("/browse", auth, func(c *fiber.Ctx) error {
|
||||
models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI - Models",
|
||||
"Models": template.HTML(elements.ListModels(models)),
|
||||
"Repositories": appConfig.Galleries,
|
||||
// "ApplicationConfig": appConfig,
|
||||
}
|
||||
|
||||
// Render index
|
||||
return c.Render("views/models", summary)
|
||||
})
|
||||
|
||||
// HTMX: return the model details
|
||||
// https://htmx.org/examples/active-search/
|
||||
app.Post("/browse/search/models", auth, func(c *fiber.Ctx) error {
|
||||
form := struct {
|
||||
Search string `form:"search"`
|
||||
}{}
|
||||
if err := c.BodyParser(&form); err != nil {
|
||||
return c.Status(fiber.StatusBadRequest).SendString(err.Error())
|
||||
}
|
||||
|
||||
models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
|
||||
|
||||
filteredModels := []*gallery.GalleryModel{}
|
||||
for _, m := range models {
|
||||
if strings.Contains(m.Name, form.Search) ||
|
||||
strings.Contains(m.Description, form.Search) ||
|
||||
strings.Contains(m.Gallery.Name, form.Search) ||
|
||||
strings.Contains(strings.Join(m.Tags, ","), form.Search) {
|
||||
filteredModels = append(filteredModels, m)
|
||||
}
|
||||
}
|
||||
|
||||
return c.SendString(elements.ListModels(filteredModels))
|
||||
})
|
||||
|
||||
// https://htmx.org/examples/progress-bar/
|
||||
app.Post("/browse/install/model/:id", auth, func(c *fiber.Ctx) error {
|
||||
galleryID := strings.Clone(c.Params("id")) // strings.Clone is required!
|
||||
|
||||
id, err := uuid.NewUUID()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
uid := id.String()
|
||||
|
||||
op := gallery.GalleryOp{
|
||||
Id: uid,
|
||||
GalleryName: galleryID,
|
||||
Galleries: appConfig.Galleries,
|
||||
}
|
||||
go func() {
|
||||
galleryService.C <- op
|
||||
}()
|
||||
|
||||
return c.SendString(elements.StartProgressBar(uid, "0"))
|
||||
})
|
||||
|
||||
// https://htmx.org/examples/progress-bar/
|
||||
app.Get("/browse/job/progress/:uid", auth, func(c *fiber.Ctx) error {
|
||||
jobUID := c.Params("uid")
|
||||
|
||||
status := galleryService.GetStatus(jobUID)
|
||||
if status == nil {
|
||||
//fmt.Errorf("could not find any status for ID")
|
||||
return c.SendString(elements.ProgressBar("0"))
|
||||
}
|
||||
|
||||
if status.Progress == 100 {
|
||||
c.Set("HX-Trigger", "done")
|
||||
return c.SendString(elements.ProgressBar("100"))
|
||||
}
|
||||
if status.Error != nil {
|
||||
return c.SendString(elements.ErrorProgress(status.Error.Error()))
|
||||
}
|
||||
|
||||
return c.SendString(elements.ProgressBar(fmt.Sprint(status.Progress)))
|
||||
})
|
||||
|
||||
app.Get("/browse/job/:uid", auth, func(c *fiber.Ctx) error {
|
||||
return c.SendString(elements.DoneProgress(c.Params("uid")))
|
||||
})
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
package routes

import (
	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
	"github.com/go-skynet/LocalAI/pkg/model"
	"github.com/gofiber/fiber/v2"
)

func RegisterPagesRoutes(app *fiber.App,
	cl *config.BackendConfigLoader,
	ml *model.ModelLoader,
	appConfig *config.ApplicationConfig,
	auth func(*fiber.Ctx) error) {

	if !appConfig.DisableWelcomePage {
		app.Get("/", auth, localai.WelcomeEndpoint(appConfig, cl, ml))
	}
}
@@ -1,34 +0,0 @@
|
||||
<!DOCTYPE html>
<html lang="en">
{{template "views/partials/head" .}}

<body class="bg-gray-900 text-gray-200">
<div class="flex flex-col min-h-screen">

{{template "views/partials/navbar" .}}
<div class="container mx-auto px-4 flex-grow">

    <div class="models mt-12">
        <h2 class="text-center text-3xl font-semibold text-gray-100">
            🖼️ Available models from <i>{{ len .Repositories }}</i> repositories <a href="https://localai.io/models/" target="_blank" >
            <i class="fas fa-circle-info pr-2"></i>
        </a></h2>


        <span class="htmx-indicator loader"></span>
        <input class="form-control appearance-none block w-full px-3 py-2 text-base font-normal text-gray-300 pb-2 mb-5 bg-gray-800 bg-clip-padding border border-solid border-gray-600 rounded transition ease-in-out m-0 focus:text-gray-300 focus:bg-gray-900 focus:border-blue-500 focus:outline-none" type="search"
               name="search" placeholder="Begin Typing To Search models..."
               hx-post="/browse/search/models"
               hx-trigger="input changed delay:500ms, search"
               hx-target="#search-results"
               hx-indicator=".htmx-indicator">

        <div id="search-results">{{.Models}}</div>
    </div>
</div>

{{template "views/partials/footer" .}}
</div>

</body>
</html>
@@ -3,76 +3,11 @@
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>{{.Title}}</title>
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Roboto:wght@400;500&display=swap" rel="stylesheet">
|
||||
<link
|
||||
href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700,900&display=swap"
|
||||
rel="stylesheet" />
|
||||
<link
|
||||
rel="stylesheet"
|
||||
href="https://cdn.jsdelivr.net/npm/tw-elements/css/tw-elements.min.css" />
|
||||
<script src="https://cdn.tailwindcss.com/3.3.0"></script>
|
||||
<script>
|
||||
tailwind.config = {
|
||||
darkMode: "class",
|
||||
theme: {
|
||||
fontFamily: {
|
||||
sans: ["Roboto", "sans-serif"],
|
||||
body: ["Roboto", "sans-serif"],
|
||||
mono: ["ui-monospace", "monospace"],
|
||||
},
|
||||
},
|
||||
corePlugins: {
|
||||
preflight: false,
|
||||
},
|
||||
};
|
||||
</script>
|
||||
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
|
||||
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.1.1/css/all.min.css">
|
||||
<script src="https://unpkg.com/htmx.org@1.9.12" integrity="sha384-ujb1lZYygJmzgSwoxRggbCHcjc0rB2XoQrxeTUQyRjrOnlCoYta87iKBWq3EsdM2" crossorigin="anonymous"></script>
|
||||
<style>
|
||||
body {
|
||||
font-family: 'Inter', sans-serif;
|
||||
}
|
||||
/* Loader (https://cssloaders.github.io/) */
|
||||
.loader {
|
||||
width: 12px;
|
||||
height: 12px;
|
||||
border-radius: 50%;
|
||||
display: block;
|
||||
margin:15px auto;
|
||||
position: relative;
|
||||
color: #FFF;
|
||||
box-sizing: border-box;
|
||||
animation: animloader 2s linear infinite;
|
||||
}
|
||||
|
||||
@keyframes animloader {
|
||||
0% { box-shadow: 14px 0 0 -2px, 38px 0 0 -2px, -14px 0 0 -2px, -38px 0 0 -2px; }
|
||||
25% { box-shadow: 14px 0 0 -2px, 38px 0 0 -2px, -14px 0 0 -2px, -38px 0 0 2px; }
|
||||
50% { box-shadow: 14px 0 0 -2px, 38px 0 0 -2px, -14px 0 0 2px, -38px 0 0 -2px; }
|
||||
75% { box-shadow: 14px 0 0 2px, 38px 0 0 -2px, -14px 0 0 -2px, -38px 0 0 -2px; }
|
||||
100% { box-shadow: 14px 0 0 -2px, 38px 0 0 2px, -14px 0 0 -2px, -38px 0 0 -2px; }
|
||||
}
|
||||
.progress {
|
||||
height: 20px;
|
||||
margin-bottom: 20px;
|
||||
overflow: hidden;
|
||||
background-color: #f5f5f5;
|
||||
border-radius: 4px;
|
||||
box-shadow: inset 0 1px 2px rgba(0,0,0,.1);
|
||||
}
|
||||
.progress-bar {
|
||||
float: left;
|
||||
width: 0%;
|
||||
height: 100%;
|
||||
font-size: 12px;
|
||||
line-height: 20px;
|
||||
color: #fff;
|
||||
text-align: center;
|
||||
background-color: #337ab7;
|
||||
-webkit-box-shadow: inset 0 -1px 0 rgba(0,0,0,.15);
|
||||
box-shadow: inset 0 -1px 0 rgba(0,0,0,.15);
|
||||
-webkit-transition: width .6s ease;
|
||||
-o-transition: width .6s ease;
|
||||
transition: width .6s ease;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
@@ -9,7 +9,6 @@
|
||||
<div>
|
||||
<a href="/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
|
||||
<a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
|
||||
<a href="/browse/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-brain pr-2"></i> Models</a>
|
||||
<a href="/swagger/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-code pr-2"></i> API</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
package schema
|
||||
|
||||
// RerankRequest defines the structure of the request payload
|
||||
type JINARerankRequest struct {
|
||||
Model string `json:"model"`
|
||||
Query string `json:"query"`
|
||||
Documents []string `json:"documents"`
|
||||
TopN int `json:"top_n"`
|
||||
}
|
||||
|
||||
// DocumentResult represents a single document result
|
||||
type JINADocumentResult struct {
|
||||
Index int `json:"index"`
|
||||
Document JINAText `json:"document"`
|
||||
RelevanceScore float64 `json:"relevance_score"`
|
||||
}
|
||||
|
||||
// Text holds the text of the document
|
||||
type JINAText struct {
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
// RerankResponse defines the structure of the response payload
|
||||
type JINARerankResponse struct {
|
||||
Model string `json:"model"`
|
||||
Usage JINAUsageInfo `json:"usage"`
|
||||
Results []JINADocumentResult `json:"results"`
|
||||
}
|
||||
|
||||
// UsageInfo holds information about usage of tokens
|
||||
type JINAUsageInfo struct {
|
||||
TotalTokens int `json:"total_tokens"`
|
||||
PromptTokens int `json:"prompt_tokens"`
|
||||
}
|
||||
@@ -3,7 +3,7 @@ package schema
|
||||
import (
|
||||
"context"
|
||||
|
||||
functions "github.com/go-skynet/LocalAI/pkg/functions"
|
||||
"github.com/go-skynet/LocalAI/pkg/grammar"
|
||||
)
|
||||
|
||||
// APIError provides error information returned by the OpenAI API.
|
||||
@@ -108,7 +108,7 @@ type ChatCompletionResponseFormat struct {
|
||||
type OpenAIRequest struct {
|
||||
PredictionOptions
|
||||
|
||||
Context context.Context `json:"-"`
|
||||
Context context.Context `json:"-"`
|
||||
Cancel context.CancelFunc `json:"-"`
|
||||
|
||||
// whisper
|
||||
@@ -130,11 +130,11 @@ type OpenAIRequest struct {
|
||||
Messages []Message `json:"messages" yaml:"messages"`
|
||||
|
||||
// A list of available functions to call
|
||||
Functions functions.Functions `json:"functions" yaml:"functions"`
|
||||
FunctionCall interface{} `json:"function_call" yaml:"function_call"` // might be a string or an object
|
||||
Functions []grammar.Function `json:"functions" yaml:"functions"`
|
||||
FunctionCall interface{} `json:"function_call" yaml:"function_call"` // might be a string or an object
|
||||
|
||||
Tools []functions.Tool `json:"tools,omitempty" yaml:"tools"`
|
||||
ToolsChoice interface{} `json:"tool_choice,omitempty" yaml:"tool_choice"`
|
||||
Tools []grammar.Tool `json:"tools,omitempty" yaml:"tools"`
|
||||
ToolsChoice interface{} `json:"tool_choice,omitempty" yaml:"tool_choice"`
|
||||
|
||||
Stream bool `json:"stream"`
|
||||
|
||||
@@ -145,7 +145,7 @@ type OpenAIRequest struct {
|
||||
// A grammar to constrain the LLM output
|
||||
Grammar string `json:"grammar" yaml:"grammar"`
|
||||
|
||||
JSONFunctionGrammarObject *functions.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"`
|
||||
JSONFunctionGrammarObject *grammar.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"`
|
||||
|
||||
Backend string `json:"backend" yaml:"backend"`
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ type Segment struct {
|
||||
Tokens []int `json:"tokens"`
|
||||
}
|
||||
|
||||
type Result struct {
|
||||
type TranscriptionResult struct {
|
||||
Segments []Segment `json:"segments"`
|
||||
Text string `json:"text"`
|
||||
}
|
||||
@@ -15,22 +15,22 @@ import (
|
||||
gopsutil "github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
type BackendMonitor struct {
|
||||
type BackendMonitorService struct {
|
||||
configLoader *config.BackendConfigLoader
|
||||
modelLoader *model.ModelLoader
|
||||
options *config.ApplicationConfig // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
|
||||
}
|
||||
|
||||
func NewBackendMonitor(configLoader *config.BackendConfigLoader, modelLoader *model.ModelLoader, appConfig *config.ApplicationConfig) BackendMonitor {
|
||||
return BackendMonitor{
|
||||
func NewBackendMonitorService(modelLoader *model.ModelLoader, configLoader *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *BackendMonitorService {
|
||||
return &BackendMonitorService{
|
||||
configLoader: configLoader,
|
||||
modelLoader: modelLoader,
|
||||
options: appConfig,
|
||||
}
|
||||
}
|
||||
|
||||
func (bm BackendMonitor) getModelLoaderIDFromModelName(modelName string) (string, error) {
|
||||
config, exists := bm.configLoader.GetBackendConfig(modelName)
|
||||
func (bms BackendMonitorService) getModelLoaderIDFromModelName(modelName string) (string, error) {
|
||||
config, exists := bms.configLoader.GetBackendConfig(modelName)
|
||||
var backendId string
|
||||
if exists {
|
||||
backendId = config.Model
|
||||
@@ -46,8 +46,8 @@ func (bm BackendMonitor) getModelLoaderIDFromModelName(modelName string) (string
|
||||
return backendId, nil
|
||||
}
|
||||
|
||||
func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*schema.BackendMonitorResponse, error) {
|
||||
config, exists := bm.configLoader.GetBackendConfig(model)
|
||||
func (bms *BackendMonitorService) SampleLocalBackendProcess(model string) (*schema.BackendMonitorResponse, error) {
|
||||
config, exists := bms.configLoader.GetBackendConfig(model)
|
||||
var backend string
|
||||
if exists {
|
||||
backend = config.Model
|
||||
@@ -60,7 +60,7 @@ func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*schema.Backe
|
||||
backend = fmt.Sprintf("%s.bin", backend)
|
||||
}
|
||||
|
||||
pid, err := bm.modelLoader.GetGRPCPID(backend)
|
||||
pid, err := bms.modelLoader.GetGRPCPID(backend)
|
||||
|
||||
if err != nil {
|
||||
log.Error().Err(err).Str("model", model).Msg("failed to find GRPC pid")
|
||||
@@ -101,12 +101,12 @@ func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*schema.Backe
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (bm BackendMonitor) CheckAndSample(modelName string) (*proto.StatusResponse, error) {
|
||||
backendId, err := bm.getModelLoaderIDFromModelName(modelName)
|
||||
func (bms BackendMonitorService) CheckAndSample(modelName string) (*proto.StatusResponse, error) {
|
||||
backendId, err := bms.getModelLoaderIDFromModelName(modelName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
modelAddr := bm.modelLoader.CheckIsLoaded(backendId)
|
||||
modelAddr := bms.modelLoader.CheckIsLoaded(backendId)
|
||||
if modelAddr == "" {
|
||||
return nil, fmt.Errorf("backend %s is not currently loaded", backendId)
|
||||
}
|
||||
@@ -114,7 +114,7 @@ func (bm BackendMonitor) CheckAndSample(modelName string) (*proto.StatusResponse
|
||||
status, rpcErr := modelAddr.GRPC(false, nil).Status(context.TODO())
|
||||
if rpcErr != nil {
|
||||
log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
|
||||
val, slbErr := bm.SampleLocalBackendProcess(backendId)
|
||||
val, slbErr := bms.SampleLocalBackendProcess(backendId)
|
||||
if slbErr != nil {
|
||||
return nil, fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
|
||||
}
|
||||
@@ -131,10 +131,10 @@ func (bm BackendMonitor) CheckAndSample(modelName string) (*proto.StatusResponse
|
||||
return status, nil
|
||||
}
|
||||
|
||||
func (bm BackendMonitor) ShutdownModel(modelName string) error {
|
||||
backendId, err := bm.getModelLoaderIDFromModelName(modelName)
|
||||
func (bms BackendMonitorService) ShutdownModel(modelName string) error {
|
||||
backendId, err := bms.getModelLoaderIDFromModelName(modelName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return bm.modelLoader.ShutdownModel(backendId)
|
||||
return bms.modelLoader.ShutdownModel(backendId)
|
||||
}
|
||||
|
||||
@@ -3,14 +3,18 @@ package services
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/embedded"
|
||||
"github.com/go-skynet/LocalAI/pkg/downloader"
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
"github.com/go-skynet/LocalAI/pkg/startup"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
"github.com/rs/zerolog/log"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
@@ -29,18 +33,6 @@ func NewGalleryService(modelPath string) *GalleryService {
|
||||
}
|
||||
}
|
||||
|
||||
func prepareModel(modelPath string, req gallery.GalleryModel, cl *config.BackendConfigLoader, downloadStatus func(string, string, string, float64)) error {
|
||||
|
||||
config, err := gallery.GetGalleryConfigFromURL(req.URL)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
config.Files = append(config.Files, req.AdditionalFiles...)
|
||||
|
||||
return gallery.InstallModel(modelPath, req.Name, &config, req.Overrides, downloadStatus)
|
||||
}
|
||||
|
||||
func (g *GalleryService) UpdateStatus(s string, op *gallery.GalleryOpStatus) {
|
||||
g.Lock()
|
||||
defer g.Unlock()
|
||||
@@ -92,10 +84,10 @@ func (g *GalleryService) Start(c context.Context, cl *config.BackendConfigLoader
|
||||
err = gallery.InstallModelFromGalleryByName(op.Galleries, op.GalleryName, g.modelPath, op.Req, progressCallback)
|
||||
}
|
||||
} else if op.ConfigURL != "" {
|
||||
startup.PreloadModelsConfigurations(op.ConfigURL, g.modelPath, op.ConfigURL)
|
||||
PreloadModelsConfigurations(op.ConfigURL, g.modelPath, op.ConfigURL)
|
||||
err = cl.Preload(g.modelPath)
|
||||
} else {
|
||||
err = prepareModel(g.modelPath, op.Req, cl, progressCallback)
|
||||
err = prepareModel(g.modelPath, op.Req, progressCallback)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
@@ -127,13 +119,12 @@ type galleryModel struct {
|
||||
ID string `json:"id"`
|
||||
}
|
||||
|
||||
func processRequests(modelPath, s string, cm *config.BackendConfigLoader, galleries []gallery.Gallery, requests []galleryModel) error {
|
||||
func processRequests(modelPath string, galleries []gallery.Gallery, requests []galleryModel) error {
|
||||
var err error
|
||||
for _, r := range requests {
|
||||
utils.ResetDownloadTimers()
|
||||
if r.ID == "" {
|
||||
err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
|
||||
|
||||
err = prepareModel(modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
|
||||
} else {
|
||||
if strings.Contains(r.ID, "@") {
|
||||
err = gallery.InstallModelFromGallery(
|
||||
@@ -158,7 +149,7 @@ func ApplyGalleryFromFile(modelPath, s string, cl *config.BackendConfigLoader, g
|
||||
return err
|
||||
}
|
||||
|
||||
return processRequests(modelPath, s, cl, galleries, requests)
|
||||
return processRequests(modelPath, galleries, requests)
|
||||
}
|
||||
|
||||
func ApplyGalleryFromString(modelPath, s string, cl *config.BackendConfigLoader, galleries []gallery.Gallery) error {
|
||||
@@ -168,5 +159,90 @@ func ApplyGalleryFromString(modelPath, s string, cl *config.BackendConfigLoader,
|
||||
return err
|
||||
}
|
||||
|
||||
return processRequests(modelPath, s, cl, galleries, requests)
|
||||
return processRequests(modelPath, galleries, requests)
|
||||
}
|
||||
|
||||
// PreloadModelsConfigurations will preload models from the given list of URLs
|
||||
// It will download the model if it is not already present in the model path
|
||||
// It will also try to resolve if the model is an embedded model YAML configuration
|
||||
func PreloadModelsConfigurations(modelLibraryURL string, modelPath string, models ...string) {
|
||||
for _, url := range models {
|
||||
|
||||
// As a best effort, try to resolve the model from the remote library
|
||||
// if it's not resolved we try with the other method below
|
||||
if modelLibraryURL != "" {
|
||||
lib, err := embedded.GetRemoteLibraryShorteners(modelLibraryURL)
|
||||
if err == nil {
|
||||
if lib[url] != "" {
|
||||
log.Debug().Msgf("[startup] model configuration is defined remotely: %s (%s)", url, lib[url])
|
||||
url = lib[url]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
url = embedded.ModelShortURL(url)
|
||||
switch {
|
||||
case embedded.ExistsInModelsLibrary(url):
|
||||
modelYAML, err := embedded.ResolveContent(url)
|
||||
// If we resolve something, just save it to disk and continue
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("error resolving model content")
|
||||
continue
|
||||
}
|
||||
|
||||
log.Debug().Msgf("[startup] resolved embedded model: %s", url)
|
||||
md5Name := utils.MD5(url)
|
||||
modelDefinitionFilePath := filepath.Join(modelPath, md5Name) + ".yaml"
|
||||
if err := os.WriteFile(modelDefinitionFilePath, modelYAML, os.ModePerm); err != nil {
|
||||
log.Error().Err(err).Str("filepath", modelDefinitionFilePath).Msg("error writing model definition")
|
||||
}
|
||||
case downloader.LooksLikeURL(url):
|
||||
log.Debug().Msgf("[startup] resolved model to download: %s", url)
|
||||
|
||||
// md5 of model name
|
||||
md5Name := utils.MD5(url)
|
||||
|
||||
// check if file exists
|
||||
if _, err := os.Stat(filepath.Join(modelPath, md5Name)); errors.Is(err, os.ErrNotExist) {
|
||||
modelDefinitionFilePath := filepath.Join(modelPath, md5Name) + ".yaml"
|
||||
err := downloader.DownloadFile(url, modelDefinitionFilePath, "", func(fileName, current, total string, percent float64) {
|
||||
utils.DisplayDownloadFunction(fileName, current, total, percent)
|
||||
})
|
||||
if err != nil {
|
||||
log.Error().Err(err).Str("url", url).Str("filepath", modelDefinitionFilePath).Msg("error downloading model")
|
||||
}
|
||||
}
|
||||
default:
|
||||
if _, err := os.Stat(url); err == nil {
|
||||
log.Debug().Msgf("[startup] resolved local model: %s", url)
|
||||
// copy to modelPath
|
||||
md5Name := utils.MD5(url)
|
||||
|
||||
modelYAML, err := os.ReadFile(url)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Str("filepath", url).Msg("error reading model definition")
|
||||
continue
|
||||
}
|
||||
|
||||
modelDefinitionFilePath := filepath.Join(modelPath, md5Name) + ".yaml"
|
||||
if err := os.WriteFile(modelDefinitionFilePath, modelYAML, os.ModePerm); err != nil {
|
||||
log.Error().Err(err).Str("filepath", modelDefinitionFilePath).Msg("error loading model: %s")
|
||||
}
|
||||
} else {
|
||||
log.Warn().Msgf("[startup] failed resolving model '%s'", url)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func prepareModel(modelPath string, req gallery.GalleryModel, downloadStatus func(string, string, string, float64)) error {
|
||||
|
||||
config, err := gallery.GetGalleryConfigFromURL(req.URL)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
config.Files = append(config.Files, req.AdditionalFiles...)
|
||||
|
||||
return gallery.InstallModel(modelPath, req.Name, &config, req.Overrides, downloadStatus)
|
||||
}
|
||||
|
||||
72
core/services/list_models.go
Normal file
72
core/services/list_models.go
Normal file
@@ -0,0 +1,72 @@
|
||||
package services
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
type ListModelsService struct {
|
||||
bcl *config.BackendConfigLoader
|
||||
ml *model.ModelLoader
|
||||
appConfig *config.ApplicationConfig
|
||||
}
|
||||
|
||||
func NewListModelsService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *ListModelsService {
|
||||
return &ListModelsService{
|
||||
bcl: bcl,
|
||||
ml: ml,
|
||||
appConfig: appConfig,
|
||||
}
|
||||
}
|
||||
|
||||
func (lms *ListModelsService) ListModels(filter string, excludeConfigured bool) ([]schema.OpenAIModel, error) {
|
||||
|
||||
models, err := lms.ml.ListModels()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var mm map[string]interface{} = map[string]interface{}{}
|
||||
|
||||
dataModels := []schema.OpenAIModel{}
|
||||
|
||||
var filterFn func(name string) bool
|
||||
|
||||
// If filter is not specified, do not filter the list by model name
|
||||
if filter == "" {
|
||||
filterFn = func(_ string) bool { return true }
|
||||
} else {
|
||||
// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
|
||||
rxp, err := regexp.Compile(filter)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
filterFn = func(name string) bool {
|
||||
return rxp.MatchString(name)
|
||||
}
|
||||
}
|
||||
|
||||
// Start with the known configurations
|
||||
for _, c := range lms.bcl.GetAllBackendConfigs() {
|
||||
if excludeConfigured {
|
||||
mm[c.Model] = nil
|
||||
}
|
||||
|
||||
if filterFn(c.Name) {
|
||||
dataModels = append(dataModels, schema.OpenAIModel{ID: c.Name, Object: "model"})
|
||||
}
|
||||
}
|
||||
|
||||
// Then iterate through the loose files:
|
||||
for _, m := range models {
|
||||
// And only adds them if they shouldn't be skipped.
|
||||
if _, exists := mm[m]; !exists && filterFn(m) {
|
||||
dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
|
||||
}
|
||||
}
|
||||
|
||||
return dataModels, nil
|
||||
}
|
||||
@@ -1,13 +1,14 @@
|
||||
package startup_test
|
||||
package services_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
. "github.com/go-skynet/LocalAI/pkg/startup"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
|
||||
. "github.com/go-skynet/LocalAI/core/services"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
808
core/services/openai.go
Normal file
808
core/services/openai.go
Normal file
@@ -0,0 +1,808 @@
|
||||
package services
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/go-skynet/LocalAI/core/backend"
|
||||
"github.com/go-skynet/LocalAI/core/config"
|
||||
"github.com/go-skynet/LocalAI/core/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/concurrency"
|
||||
"github.com/go-skynet/LocalAI/pkg/grammar"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
"github.com/google/uuid"
|
||||
"github.com/imdario/mergo"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
type endpointGenerationConfigurationFn func(bc *config.BackendConfig, request *schema.OpenAIRequest) endpointConfiguration
|
||||
|
||||
type endpointConfiguration struct {
|
||||
SchemaObject string
|
||||
TemplatePath string
|
||||
TemplateData model.PromptTemplateData
|
||||
ResultMappingFn func(resp *backend.LLMResponse, index int) schema.Choice
|
||||
CompletionMappingFn func(resp concurrency.ErrorOr[*backend.LLMResponse]) concurrency.ErrorOr[*schema.OpenAIResponse]
|
||||
TokenMappingFn func(resp concurrency.ErrorOr[*backend.LLMResponse]) concurrency.ErrorOr[*schema.OpenAIResponse]
|
||||
}
|
||||
|
||||
// TODO: This is used for completion and edit. I am pretty sure I forgot parts, but fix it later.
|
||||
func simpleMapper(resp concurrency.ErrorOr[*backend.LLMResponse]) concurrency.ErrorOr[*schema.OpenAIResponse] {
|
||||
if resp.Error != nil || resp.Value == nil {
|
||||
return concurrency.ErrorOr[*schema.OpenAIResponse]{Error: resp.Error}
|
||||
}
|
||||
return concurrency.ErrorOr[*schema.OpenAIResponse]{
|
||||
Value: &schema.OpenAIResponse{
|
||||
Choices: []schema.Choice{
|
||||
{
|
||||
Text: resp.Value.Response,
|
||||
},
|
||||
},
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: resp.Value.Usage.Prompt,
|
||||
CompletionTokens: resp.Value.Usage.Completion,
|
||||
TotalTokens: resp.Value.Usage.Prompt + resp.Value.Usage.Completion,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Consider alternative names for this.
|
||||
// The purpose of this struct is to hold a reference to the OpenAI request context information
|
||||
// This keeps things simple within core/services/openai.go and allows consumers to "see" this information if they need it
|
||||
type OpenAIRequestTraceID struct {
|
||||
ID string
|
||||
Created int
|
||||
}
|
||||
|
||||
// This type split out from core/backend/llm.go - I'm still not _totally_ sure about this, but it seems to make sense to keep the generic LLM code from the OpenAI specific higher level functionality
|
||||
type OpenAIService struct {
|
||||
bcl *config.BackendConfigLoader
|
||||
ml *model.ModelLoader
|
||||
appConfig *config.ApplicationConfig
|
||||
llmbs *backend.LLMBackendService
|
||||
}
|
||||
|
||||
func NewOpenAIService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig, llmbs *backend.LLMBackendService) *OpenAIService {
|
||||
return &OpenAIService{
|
||||
bcl: bcl,
|
||||
ml: ml,
|
||||
appConfig: appConfig,
|
||||
llmbs: llmbs,
|
||||
}
|
||||
}
|
||||
|
||||
// Keeping in place as a reminder to POTENTIALLY ADD MORE VALIDATION HERE???
|
||||
func (oais *OpenAIService) getConfig(request *schema.OpenAIRequest) (*config.BackendConfig, *schema.OpenAIRequest, error) {
|
||||
return oais.bcl.LoadBackendConfigForModelAndOpenAIRequest(request.Model, request, oais.appConfig)
|
||||
}
|
||||
|
||||
// TODO: It would be a lot less messy to make a return struct that had references to each of these channels
|
||||
// INTENTIONALLY not doing that quite yet - I believe we need to let the references to unused channels die for the GC to automatically collect -- can we manually free()?
|
||||
// finalResultsChannel is the primary async return path: one result for the entire request.
|
||||
// promptResultsChannels is DUBIOUS. It's expected to be raw fan-out used within the function itself, but I am exposing for testing? One bundle of LLMResponseBundle per PromptString? Gets all N completions for a single prompt.
|
||||
// completionsChannel is a channel that emits one *LLMResponse per generated completion, be that different prompts or N. Seems the most useful other than "entire request" Request is available to attempt tracing???
|
||||
// tokensChannel is a channel that emits one *LLMResponse per generated token. Let's see what happens!
|
||||
func (oais *OpenAIService) Completion(request *schema.OpenAIRequest, notifyOnPromptResult bool, notifyOnToken bool) (
|
||||
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], promptResultsChannels []<-chan concurrency.ErrorOr[*backend.LLMResponseBundle],
|
||||
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
|
||||
|
||||
return oais.GenerateTextFromRequest(request, func(bc *config.BackendConfig, request *schema.OpenAIRequest) endpointConfiguration {
|
||||
return endpointConfiguration{
|
||||
SchemaObject: "text_completion",
|
||||
TemplatePath: bc.TemplateConfig.Completion,
|
||||
TemplateData: model.PromptTemplateData{
|
||||
SystemPrompt: bc.SystemPrompt,
|
||||
},
|
||||
ResultMappingFn: func(resp *backend.LLMResponse, promptIndex int) schema.Choice {
|
||||
return schema.Choice{
|
||||
Index: promptIndex,
|
||||
FinishReason: "stop",
|
||||
Text: resp.Response,
|
||||
}
|
||||
},
|
||||
CompletionMappingFn: simpleMapper,
|
||||
TokenMappingFn: simpleMapper,
|
||||
}
|
||||
}, notifyOnPromptResult, notifyOnToken, nil)
|
||||
}
|
||||
|
||||
func (oais *OpenAIService) Edit(request *schema.OpenAIRequest, notifyOnPromptResult bool, notifyOnToken bool) (
|
||||
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], promptResultsChannels []<-chan concurrency.ErrorOr[*backend.LLMResponseBundle],
|
||||
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
|
||||
|
||||
return oais.GenerateTextFromRequest(request, func(bc *config.BackendConfig, request *schema.OpenAIRequest) endpointConfiguration {
|
||||
|
||||
return endpointConfiguration{
|
||||
SchemaObject: "edit",
|
||||
TemplatePath: bc.TemplateConfig.Edit,
|
||||
TemplateData: model.PromptTemplateData{
|
||||
SystemPrompt: bc.SystemPrompt,
|
||||
Instruction: request.Instruction,
|
||||
},
|
||||
ResultMappingFn: func(resp *backend.LLMResponse, promptIndex int) schema.Choice {
|
||||
return schema.Choice{
|
||||
Index: promptIndex,
|
||||
FinishReason: "stop",
|
||||
Text: resp.Response,
|
||||
}
|
||||
},
|
||||
CompletionMappingFn: simpleMapper,
|
||||
TokenMappingFn: simpleMapper,
|
||||
}
|
||||
}, notifyOnPromptResult, notifyOnToken, nil)
|
||||
}
|
||||
|
||||
func (oais *OpenAIService) Chat(request *schema.OpenAIRequest, notifyOnPromptResult bool, notifyOnToken bool) (
|
||||
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse],
|
||||
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
|
||||
|
||||
return oais.GenerateFromMultipleMessagesChatRequest(request, notifyOnPromptResult, notifyOnToken, nil)
|
||||
}
|
||||
|
||||
func (oais *OpenAIService) GenerateTextFromRequest(request *schema.OpenAIRequest, endpointConfigFn endpointGenerationConfigurationFn, notifyOnPromptResult bool, notifyOnToken bool, initialTraceID *OpenAIRequestTraceID) (
|
||||
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], promptResultsChannels []<-chan concurrency.ErrorOr[*backend.LLMResponseBundle],
|
||||
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
|
||||
|
||||
if initialTraceID == nil {
|
||||
traceID = &OpenAIRequestTraceID{
|
||||
ID: uuid.New().String(),
|
||||
Created: int(time.Now().Unix()),
|
||||
}
|
||||
} else {
|
||||
traceID = initialTraceID
|
||||
}
|
||||
|
||||
bc, request, err := oais.getConfig(request)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msgf("[oais::GenerateTextFromRequest] error getting configuration")
|
||||
return
|
||||
}
|
||||
|
||||
if request.ResponseFormat.Type == "json_object" {
|
||||
request.Grammar = grammar.JSONBNF
|
||||
}
|
||||
|
||||
bc.Grammar = request.Grammar
|
||||
|
||||
if request.Stream && len(bc.PromptStrings) > 1 {
|
||||
log.Warn().Msg("potentially cannot handle more than 1 `PromptStrings` when Streaming?")
|
||||
}
|
||||
|
||||
rawFinalResultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
|
||||
finalResultChannel = rawFinalResultChannel
|
||||
promptResultsChannels = []<-chan concurrency.ErrorOr[*backend.LLMResponseBundle]{}
|
||||
var rawCompletionsChannel chan concurrency.ErrorOr[*schema.OpenAIResponse]
|
||||
var rawTokenChannel chan concurrency.ErrorOr[*schema.OpenAIResponse]
|
||||
if notifyOnPromptResult {
|
||||
rawCompletionsChannel = make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
|
||||
}
|
||||
if notifyOnToken {
|
||||
rawTokenChannel = make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
|
||||
}
|
||||
|
||||
promptResultsChannelLock := sync.Mutex{}
|
||||
|
||||
endpointConfig := endpointConfigFn(bc, request)
|
||||
|
||||
if len(endpointConfig.TemplatePath) == 0 {
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
if oais.ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", bc.Model)) {
|
||||
endpointConfig.TemplatePath = bc.Model
|
||||
} else {
|
||||
log.Warn().Msgf("failed to find any template for %+v", request)
|
||||
}
|
||||
}
|
||||
|
||||
setupWG := sync.WaitGroup{}
|
||||
var prompts []string
|
||||
if lPS := len(bc.PromptStrings); lPS > 0 {
|
||||
setupWG.Add(lPS)
|
||||
prompts = bc.PromptStrings
|
||||
} else {
|
||||
setupWG.Add(len(bc.InputStrings))
|
||||
prompts = bc.InputStrings
|
||||
}
|
||||
|
||||
var setupError error = nil
|
||||
|
||||
for pI, p := range prompts {
|
||||
|
||||
go func(promptIndex int, prompt string) {
|
||||
if endpointConfig.TemplatePath != "" {
|
||||
promptTemplateData := model.PromptTemplateData{
|
||||
Input: prompt,
|
||||
}
|
||||
err := mergo.Merge(promptTemplateData, endpointConfig.TemplateData, mergo.WithOverride)
|
||||
if err == nil {
|
||||
templatedInput, err := oais.ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, endpointConfig.TemplatePath, promptTemplateData)
|
||||
if err == nil {
|
||||
prompt = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", prompt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.Debug().Msgf("[OAIS GenerateTextFromRequest] Prompt: %q", prompt)
|
||||
promptResultsChannel, completionChannels, tokenChannels, err := oais.llmbs.GenerateText(prompt, request, bc,
|
||||
func(r *backend.LLMResponse) schema.Choice {
|
||||
return endpointConfig.ResultMappingFn(r, promptIndex)
|
||||
}, notifyOnPromptResult, notifyOnToken)
|
||||
if err != nil {
|
||||
log.Error().Msgf("Unable to generate text prompt: %q\nerr: %q", prompt, err)
|
||||
promptResultsChannelLock.Lock()
|
||||
setupError = errors.Join(setupError, err)
|
||||
promptResultsChannelLock.Unlock()
|
||||
setupWG.Done()
|
||||
return
|
||||
}
|
||||
if notifyOnPromptResult {
|
||||
concurrency.SliceOfChannelsRawMergerWithoutMapping(concurrency.SliceOfChannelsTransformer(completionChannels, endpointConfig.CompletionMappingFn), rawCompletionsChannel, true)
|
||||
}
|
||||
if notifyOnToken {
|
||||
concurrency.SliceOfChannelsRawMergerWithoutMapping(concurrency.SliceOfChannelsTransformer(tokenChannels, endpointConfig.TokenMappingFn), rawTokenChannel, true)
|
||||
}
|
||||
promptResultsChannelLock.Lock()
|
||||
promptResultsChannels = append(promptResultsChannels, promptResultsChannel)
|
||||
promptResultsChannelLock.Unlock()
|
||||
setupWG.Done()
|
||||
}(pI, p)
|
||||
|
||||
}
|
||||
setupWG.Wait()
|
||||
|
||||
// If any of the setup goroutines experienced an error, quit early here.
|
||||
if setupError != nil {
|
||||
go func() {
|
||||
log.Error().Err(setupError).Msgf("[OAIS GenerateTextFromRequest] caught an error during setup")
|
||||
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Error: setupError}
|
||||
close(rawFinalResultChannel)
|
||||
}()
|
||||
return
|
||||
}
|
||||
|
||||
initialResponse := &schema.OpenAIResponse{
|
||||
ID: traceID.ID,
|
||||
Created: traceID.Created,
|
||||
Model: request.Model,
|
||||
Object: endpointConfig.SchemaObject,
|
||||
Usage: schema.OpenAIUsage{},
|
||||
}
|
||||
|
||||
// utils.SliceOfChannelsRawMerger[[]schema.Choice](promptResultsChannels, rawFinalResultChannel, func(results []schema.Choice) (*schema.OpenAIResponse, error) {
|
||||
concurrency.SliceOfChannelsReducer(
|
||||
promptResultsChannels, rawFinalResultChannel,
|
||||
func(iv concurrency.ErrorOr[*backend.LLMResponseBundle], result concurrency.ErrorOr[*schema.OpenAIResponse]) concurrency.ErrorOr[*schema.OpenAIResponse] {
|
||||
|
||||
if iv.Error != nil {
|
||||
result.Error = iv.Error
|
||||
return result
|
||||
}
|
||||
result.Value.Usage.PromptTokens += iv.Value.Usage.Prompt
|
||||
result.Value.Usage.CompletionTokens += iv.Value.Usage.Completion
|
||||
result.Value.Usage.TotalTokens = result.Value.Usage.PromptTokens + result.Value.Usage.CompletionTokens
|
||||
|
||||
result.Value.Choices = append(result.Value.Choices, iv.Value.Response...)
|
||||
|
||||
return result
|
||||
}, concurrency.ErrorOr[*schema.OpenAIResponse]{Value: initialResponse}, true)
|
||||
|
||||
completionsChannel = rawCompletionsChannel
|
||||
tokenChannel = rawTokenChannel
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// TODO: For porting sanity, this is distinct from GenerateTextFromRequest and is _currently_ specific to Chat purposes
|
||||
// this is not a final decision -- just a reality of moving a lot of parts at once
|
||||
// / This has _become_ Chat which wasn't the goal... More cleanup in the future once it's stable?
|
||||
func (oais *OpenAIService) GenerateFromMultipleMessagesChatRequest(request *schema.OpenAIRequest, notifyOnPromptResult bool, notifyOnToken bool, initialTraceID *OpenAIRequestTraceID) (
|
||||
traceID *OpenAIRequestTraceID, finalResultChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse],
|
||||
completionsChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], tokenChannel <-chan concurrency.ErrorOr[*schema.OpenAIResponse], err error) {
|
||||
|
||||
if initialTraceID == nil {
|
||||
traceID = &OpenAIRequestTraceID{
|
||||
ID: uuid.New().String(),
|
||||
Created: int(time.Now().Unix()),
|
||||
}
|
||||
} else {
|
||||
traceID = initialTraceID
|
||||
}
|
||||
|
||||
bc, request, err := oais.getConfig(request)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Allow the user to set custom actions via config file
|
||||
// to be "embedded" in each model
|
||||
noActionName := "answer"
|
||||
noActionDescription := "use this action to answer without performing any action"
|
||||
|
||||
if bc.FunctionsConfig.NoActionFunctionName != "" {
|
||||
noActionName = bc.FunctionsConfig.NoActionFunctionName
|
||||
}
|
||||
if bc.FunctionsConfig.NoActionDescriptionName != "" {
|
||||
noActionDescription = bc.FunctionsConfig.NoActionDescriptionName
|
||||
}
|
||||
|
||||
if request.ResponseFormat.Type == "json_object" {
|
||||
request.Grammar = grammar.JSONBNF
|
||||
}
|
||||
|
||||
bc.Grammar = request.Grammar
|
||||
|
||||
processFunctions := false
|
||||
funcs := grammar.Functions{}
|
||||
// process functions if we have any defined or if we have a function call string
|
||||
if len(request.Functions) > 0 && bc.ShouldUseFunctions() {
|
||||
log.Debug().Msgf("Response needs to process functions")
|
||||
|
||||
processFunctions = true
|
||||
|
||||
noActionGrammar := grammar.Function{
|
||||
Name: noActionName,
|
||||
Description: noActionDescription,
|
||||
Parameters: map[string]interface{}{
|
||||
"properties": map[string]interface{}{
|
||||
"message": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "The message to reply the user with",
|
||||
}},
|
||||
},
|
||||
}
|
||||
|
||||
// Append the no action function
|
||||
funcs = append(funcs, request.Functions...)
|
||||
if !bc.FunctionsConfig.DisableNoAction {
|
||||
funcs = append(funcs, noActionGrammar)
|
||||
}
|
||||
|
||||
// Force picking one of the functions by the request
|
||||
if bc.FunctionToCall() != "" {
|
||||
funcs = funcs.Select(bc.FunctionToCall())
|
||||
}
|
||||
|
||||
// Update input grammar
|
||||
jsStruct := funcs.ToJSONStructure()
|
||||
bc.Grammar = jsStruct.Grammar("", bc.FunctionsConfig.ParallelCalls)
|
||||
} else if request.JSONFunctionGrammarObject != nil {
|
||||
bc.Grammar = request.JSONFunctionGrammarObject.Grammar("", bc.FunctionsConfig.ParallelCalls)
|
||||
}
|
||||
|
||||
if request.Stream && processFunctions {
|
||||
log.Warn().Msg("Streaming + Functions is highly experimental in this version")
|
||||
}
|
||||
|
||||
var predInput string
|
||||
|
||||
if !bc.TemplateConfig.UseTokenizerTemplate || processFunctions {
|
||||
|
||||
suppressConfigSystemPrompt := false
|
||||
mess := []string{}
|
||||
for messageIndex, i := range request.Messages {
|
||||
var content string
|
||||
role := i.Role
|
||||
|
||||
// if function call, we might want to customize the role so we can display better that the "assistant called a json action"
|
||||
// if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request
|
||||
if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" {
|
||||
roleFn := "assistant_function_call"
|
||||
r := bc.Roles[roleFn]
|
||||
if r != "" {
|
||||
role = roleFn
|
||||
}
|
||||
}
|
||||
r := bc.Roles[role]
|
||||
contentExists := i.Content != nil && i.StringContent != ""
|
||||
|
||||
fcall := i.FunctionCall
|
||||
if len(i.ToolCalls) > 0 {
|
||||
fcall = i.ToolCalls
|
||||
}
|
||||
|
||||
// First attempt to populate content via a chat message specific template
|
||||
if bc.TemplateConfig.ChatMessage != "" {
|
||||
chatMessageData := model.ChatMessageTemplateData{
|
||||
SystemPrompt: bc.SystemPrompt,
|
||||
Role: r,
|
||||
RoleName: role,
|
||||
Content: i.StringContent,
|
||||
FunctionCall: fcall,
|
||||
FunctionName: i.Name,
|
||||
LastMessage: messageIndex == (len(request.Messages) - 1),
|
||||
Function: bc.Grammar != "" && (messageIndex == (len(request.Messages) - 1)),
|
||||
MessageIndex: messageIndex,
|
||||
}
|
||||
templatedChatMessage, err := oais.ml.EvaluateTemplateForChatMessage(bc.TemplateConfig.ChatMessage, chatMessageData)
|
||||
if err != nil {
|
||||
log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, bc.TemplateConfig.ChatMessage, err)
|
||||
} else {
|
||||
if templatedChatMessage == "" {
|
||||
log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", bc.TemplateConfig.ChatMessage, chatMessageData)
|
||||
continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
|
||||
}
|
||||
log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
|
||||
content = templatedChatMessage
|
||||
}
|
||||
}
|
||||
marshalAnyRole := func(f any) {
|
||||
j, err := json.Marshal(f)
|
||||
if err == nil {
|
||||
if contentExists {
|
||||
content += "\n" + fmt.Sprint(r, " ", string(j))
|
||||
} else {
|
||||
content = fmt.Sprint(r, " ", string(j))
|
||||
}
|
||||
}
|
||||
}
|
||||
marshalAny := func(f any) {
|
||||
j, err := json.Marshal(f)
|
||||
if err == nil {
|
||||
if contentExists {
|
||||
content += "\n" + string(j)
|
||||
} else {
|
||||
content = string(j)
|
||||
}
|
||||
}
|
||||
}
|
||||
// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
|
||||
if content == "" {
|
||||
if r != "" {
|
||||
if contentExists {
|
||||
content = fmt.Sprint(r, i.StringContent)
|
||||
}
|
||||
|
||||
if i.FunctionCall != nil {
|
||||
marshalAnyRole(i.FunctionCall)
|
||||
}
|
||||
} else {
|
||||
if contentExists {
|
||||
content = fmt.Sprint(i.StringContent)
|
||||
}
|
||||
|
||||
if i.FunctionCall != nil {
|
||||
marshalAny(i.FunctionCall)
|
||||
}
|
||||
|
||||
if i.ToolCalls != nil {
|
||||
marshalAny(i.ToolCalls)
|
||||
}
|
||||
}
|
||||
// Special Handling: System. We care if it was printed at all, not the r branch, so check seperately
|
||||
if contentExists && role == "system" {
|
||||
suppressConfigSystemPrompt = true
|
||||
}
|
||||
}
|
||||
|
||||
mess = append(mess, content)
|
||||
}
|
||||
|
||||
predInput = strings.Join(mess, "\n")
|
||||
|
||||
log.Debug().Msgf("Prompt (before templating): %s", predInput)
|
||||
|
||||
templateFile := ""
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
if oais.ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", bc.Model)) {
|
||||
templateFile = bc.Model
|
||||
}
|
||||
|
||||
if bc.TemplateConfig.Chat != "" && !processFunctions {
|
||||
templateFile = bc.TemplateConfig.Chat
|
||||
}
|
||||
|
||||
if bc.TemplateConfig.Functions != "" && processFunctions {
|
||||
templateFile = bc.TemplateConfig.Functions
|
||||
}
|
||||
|
||||
if templateFile != "" {
|
||||
templatedInput, err := oais.ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
SystemPrompt: bc.SystemPrompt,
|
||||
SuppressSystemPrompt: suppressConfigSystemPrompt,
|
||||
Input: predInput,
|
||||
Functions: funcs,
|
||||
})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||||
} else {
|
||||
log.Debug().Msgf("Template failed loading: %s", err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
log.Debug().Msgf("Prompt (after templating): %s", predInput)
|
||||
if processFunctions {
|
||||
log.Debug().Msgf("Grammar: %+v", bc.Grammar)
|
||||
}
|
||||
|
||||
rawFinalResultChannel := make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
|
||||
var rawCompletionsChannel chan concurrency.ErrorOr[*schema.OpenAIResponse]
|
||||
var rawTokenChannel chan concurrency.ErrorOr[*schema.OpenAIResponse]
|
||||
if notifyOnPromptResult {
|
||||
rawCompletionsChannel = make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
|
||||
}
|
||||
if notifyOnToken {
|
||||
rawTokenChannel = make(chan concurrency.ErrorOr[*schema.OpenAIResponse])
|
||||
}
|
||||
|
||||
rawResultChannel, individualCompletionChannels, tokenChannels, err := oais.llmbs.GenerateText(predInput, request, bc, func(resp *backend.LLMResponse) schema.Choice {
|
||||
return schema.Choice{
|
||||
Index: 0, // ???
|
||||
FinishReason: "stop",
|
||||
Message: &schema.Message{
|
||||
Role: "assistant",
|
||||
Content: resp.Response,
|
||||
},
|
||||
}
|
||||
}, notifyOnPromptResult, notifyOnToken)
|
||||
|
||||
chatSimpleMappingFn := func(resp concurrency.ErrorOr[*backend.LLMResponse]) concurrency.ErrorOr[*schema.OpenAIResponse] {
|
||||
if resp.Error != nil || resp.Value == nil {
|
||||
return concurrency.ErrorOr[*schema.OpenAIResponse]{Error: resp.Error}
|
||||
}
|
||||
return concurrency.ErrorOr[*schema.OpenAIResponse]{
|
||||
Value: &schema.OpenAIResponse{
|
||||
ID: traceID.ID,
|
||||
Created: traceID.Created,
|
||||
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{
|
||||
{
|
||||
Delta: &schema.Message{
|
||||
Role: "assistant",
|
||||
Content: resp.Value.Response,
|
||||
},
|
||||
Index: 0,
|
||||
},
|
||||
},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: resp.Value.Usage.Prompt,
|
||||
CompletionTokens: resp.Value.Usage.Completion,
|
||||
TotalTokens: resp.Value.Usage.Prompt + resp.Value.Usage.Completion,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
if notifyOnPromptResult {
|
||||
concurrency.SliceOfChannelsRawMergerWithoutMapping(concurrency.SliceOfChannelsTransformer(individualCompletionChannels, chatSimpleMappingFn), rawCompletionsChannel, true)
|
||||
}
|
||||
if notifyOnToken {
|
||||
concurrency.SliceOfChannelsRawMergerWithoutMapping(concurrency.SliceOfChannelsTransformer(tokenChannels, chatSimpleMappingFn), rawTokenChannel, true)
|
||||
}
|
||||
|
||||
go func() {
|
||||
rawResult := <-rawResultChannel
|
||||
if rawResult.Error != nil {
|
||||
log.Warn().Msgf("OpenAIService::processTools GenerateText error [DEBUG THIS?] %q", rawResult.Error)
|
||||
return
|
||||
}
|
||||
llmResponseChoices := rawResult.Value.Response
|
||||
|
||||
if processFunctions && len(llmResponseChoices) > 1 {
|
||||
log.Warn().Msgf("chat functions response with %d choices in response, debug this?", len(llmResponseChoices))
|
||||
log.Debug().Msgf("%+v", llmResponseChoices)
|
||||
}
|
||||
|
||||
for _, result := range rawResult.Value.Response {
|
||||
// If no functions, just return the raw result.
|
||||
if !processFunctions {
|
||||
|
||||
resp := schema.OpenAIResponse{
|
||||
ID: traceID.ID,
|
||||
Created: traceID.Created,
|
||||
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{result},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: rawResult.Value.Usage.Prompt,
|
||||
CompletionTokens: rawResult.Value.Usage.Completion,
|
||||
TotalTokens: rawResult.Value.Usage.Prompt + rawResult.Value.Usage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: &resp}
|
||||
|
||||
continue
|
||||
}
|
||||
// At this point, things are function specific!
|
||||
|
||||
// Oh no this can't be the right way to do this... but it works. Save us, mudler!
|
||||
fString := fmt.Sprintf("%s", result.Message.Content)
|
||||
results := parseFunctionCall(fString, bc.FunctionsConfig.ParallelCalls)
|
||||
noActionToRun := (len(results) > 0 && results[0].name == noActionName)
|
||||
|
||||
if noActionToRun {
|
||||
log.Debug().Msg("-- noActionToRun branch --")
|
||||
initialMessage := schema.OpenAIResponse{
|
||||
ID: traceID.ID,
|
||||
Created: traceID.Created,
|
||||
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: ""}}},
|
||||
Object: "stop",
|
||||
}
|
||||
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: &initialMessage}
|
||||
|
||||
result, err := oais.handleQuestion(bc, request, results[0].arguments, predInput)
|
||||
if err != nil {
|
||||
log.Error().Msgf("error handling question: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
resp := schema.OpenAIResponse{
|
||||
ID: traceID.ID,
|
||||
Created: traceID.Created,
|
||||
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Content: &result}, Index: 0}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: rawResult.Value.Usage.Prompt,
|
||||
CompletionTokens: rawResult.Value.Usage.Completion,
|
||||
TotalTokens: rawResult.Value.Usage.Prompt + rawResult.Value.Usage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: &resp}
|
||||
|
||||
} else {
|
||||
log.Debug().Msgf("[GenerateFromMultipleMessagesChatRequest] fnResultsBranch: %+v", results)
|
||||
for i, ss := range results {
|
||||
name, args := ss.name, ss.arguments
|
||||
|
||||
initialMessage := schema.OpenAIResponse{
|
||||
ID: traceID.ID,
|
||||
Created: traceID.Created,
|
||||
Model: request.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{
|
||||
FinishReason: "function_call",
|
||||
Message: &schema.Message{
|
||||
Role: "assistant",
|
||||
ToolCalls: []schema.ToolCall{
|
||||
{
|
||||
Index: i,
|
||||
ID: traceID.ID,
|
||||
Type: "function",
|
||||
FunctionCall: schema.FunctionCall{
|
||||
Name: name,
|
||||
Arguments: args,
|
||||
},
|
||||
},
|
||||
},
|
||||
}}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
rawFinalResultChannel <- concurrency.ErrorOr[*schema.OpenAIResponse]{Value: &initialMessage}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
close(rawFinalResultChannel)
|
||||
}()
|
||||
|
||||
finalResultChannel = rawFinalResultChannel
|
||||
completionsChannel = rawCompletionsChannel
|
||||
tokenChannel = rawTokenChannel
|
||||
return
|
||||
}
|
||||
|
||||
func (oais *OpenAIService) handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, args, prompt string) (string, error) {
|
||||
log.Debug().Msgf("[handleQuestion called] nothing to do, computing a reply")
|
||||
|
||||
// If there is a message that the LLM already sends as part of the JSON reply, use it
|
||||
arguments := map[string]interface{}{}
|
||||
json.Unmarshal([]byte(args), &arguments)
|
||||
m, exists := arguments["message"]
|
||||
if exists {
|
||||
switch message := m.(type) {
|
||||
case string:
|
||||
if message != "" {
|
||||
log.Debug().Msgf("Reply received from LLM: %s", message)
|
||||
message = oais.llmbs.Finetune(*config, prompt, message)
|
||||
log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
|
||||
|
||||
return message, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
|
||||
// Otherwise ask the LLM to understand the JSON output and the context, and return a message
|
||||
// Note: This costs (in term of CPU/GPU) another computation
|
||||
config.Grammar = ""
|
||||
images := []string{}
|
||||
for _, m := range input.Messages {
|
||||
images = append(images, m.StringImages...)
|
||||
}
|
||||
|
||||
resultChannel, _, err := oais.llmbs.Inference(input.Context, &backend.LLMRequest{
|
||||
Text: prompt,
|
||||
Images: images,
|
||||
RawMessages: input.Messages, // Experimental
|
||||
}, config, false)
|
||||
|
||||
if err != nil {
|
||||
log.Error().Msgf("inference setup error: %s", err.Error())
|
||||
return "", err
|
||||
}
|
||||
|
||||
raw := <-resultChannel
|
||||
if raw.Error != nil {
|
||||
log.Error().Msgf("inference error: %q", raw.Error.Error())
|
||||
return "", err
|
||||
}
|
||||
if raw.Value == nil {
|
||||
log.Warn().Msgf("nil inference response")
|
||||
return "", nil
|
||||
}
|
||||
return oais.llmbs.Finetune(*config, prompt, raw.Value.Response), nil
|
||||
}
|
||||
|
||||
type funcCallResults struct {
|
||||
name string
|
||||
arguments string
|
||||
}
|
||||
|
||||
func parseFunctionCall(llmresult string, multipleResults bool) []funcCallResults {
|
||||
|
||||
results := []funcCallResults{}
|
||||
|
||||
// TODO: use generics to avoid this code duplication
|
||||
if multipleResults {
|
||||
ss := []map[string]interface{}{}
|
||||
s := utils.EscapeNewLines(llmresult)
|
||||
json.Unmarshal([]byte(s), &ss)
|
||||
|
||||
for _, s := range ss {
|
||||
func_name, ok := s["function"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
args, ok := s["arguments"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
d, _ := json.Marshal(args)
|
||||
funcName, ok := func_name.(string)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
results = append(results, funcCallResults{name: funcName, arguments: string(d)})
|
||||
}
|
||||
} else {
|
||||
// As we have to change the result before processing, we can't stream the answer token-by-token (yet?)
|
||||
ss := map[string]interface{}{}
|
||||
// This prevent newlines to break JSON parsing for clients
|
||||
s := utils.EscapeNewLines(llmresult)
|
||||
if err := json.Unmarshal([]byte(s), &ss); err != nil {
|
||||
log.Error().Msgf("error unmarshalling JSON: %s", err.Error())
|
||||
return results
|
||||
}
|
||||
|
||||
// The grammar defines the function name as "function", while OpenAI returns "name"
|
||||
func_name, ok := ss["function"]
|
||||
if !ok {
|
||||
log.Debug().Msgf("ss[function] is not OK!, llm result: %q", llmresult)
|
||||
return results
|
||||
}
|
||||
// Similarly, while here arguments is a map[string]interface{}, OpenAI actually want a stringified object
|
||||
args, ok := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
|
||||
if !ok {
|
||||
log.Debug().Msg("ss[arguments] is not OK!")
|
||||
return results
|
||||
}
|
||||
d, _ := json.Marshal(args)
|
||||
funcName, ok := func_name.(string)
|
||||
if !ok {
|
||||
log.Debug().Msgf("unexpected func_name: %+v", func_name)
|
||||
return results
|
||||
}
|
||||
results = append(results, funcCallResults{name: funcName, arguments: string(d)})
|
||||
}
|
||||
return results
|
||||
}
|
||||
@@ -5,8 +5,6 @@ import (
	"fmt"
	"os"
	"path"
	"path/filepath"
	"time"

	"github.com/fsnotify/fsnotify"
	"github.com/go-skynet/LocalAI/core/config"
@@ -14,162 +12,89 @@ import (
	"github.com/rs/zerolog/log"
)

type fileHandler func(fileContent []byte, appConfig *config.ApplicationConfig) error
type WatchConfigDirectoryCloser func() error

type configFileHandler struct {
	handlers map[string]fileHandler

	watcher *fsnotify.Watcher

	configDir string
	appConfig *config.ApplicationConfig
func ReadApiKeysJson(configDir string, appConfig *config.ApplicationConfig) error {
	fileContent, err := os.ReadFile(path.Join(configDir, "api_keys.json"))
	if err == nil {
		// Parse JSON content from the file
		var fileKeys []string
		err := json.Unmarshal(fileContent, &fileKeys)
		if err == nil {
			appConfig.ApiKeys = append(appConfig.ApiKeys, fileKeys...)
			return nil
		}
		return err
	}
	return err
}

// TODO: This should be a singleton eventually so other parts of the code can register config file handlers,
// then we can export it to other packages
func newConfigFileHandler(appConfig *config.ApplicationConfig) configFileHandler {
	c := configFileHandler{
		handlers:  make(map[string]fileHandler),
		configDir: appConfig.DynamicConfigsDir,
		appConfig: appConfig,
func ReadExternalBackendsJson(configDir string, appConfig *config.ApplicationConfig) error {
	fileContent, err := os.ReadFile(path.Join(configDir, "external_backends.json"))
	if err != nil {
		return err
	}
	c.Register("api_keys.json", readApiKeysJson(*appConfig), true)
	c.Register("external_backends.json", readExternalBackendsJson(*appConfig), true)
	return c
}

func (c *configFileHandler) Register(filename string, handler fileHandler, runNow bool) error {
	_, ok := c.handlers[filename]
	if ok {
		return fmt.Errorf("handler already registered for file %s", filename)
	// Parse JSON content from the file
	var fileBackends map[string]string
	err = json.Unmarshal(fileContent, &fileBackends)
	if err != nil {
		return err
	}
	c.handlers[filename] = handler
	if runNow {
		c.callHandler(path.Join(c.appConfig.DynamicConfigsDir, filename), handler)
	err = mergo.Merge(&appConfig.ExternalGRPCBackends, fileBackends)
	if err != nil {
		return err
	}
	return nil
}

func (c *configFileHandler) callHandler(filename string, handler fileHandler) {
	log.Trace().Str("filename", filename).Msg("reading file for dynamic config update")
	fileContent, err := os.ReadFile(filename)
	if err != nil && !os.IsNotExist(err) {
		log.Error().Err(err).Str("filename", filename).Msg("could not read file")
	}

	if err = handler(fileContent, c.appConfig); err != nil {
		log.Error().Err(err).Msg("WatchConfigDirectory goroutine failed to update options")
	}
var CONFIG_FILE_UPDATES = map[string]func(configDir string, appConfig *config.ApplicationConfig) error{
	"api_keys.json":          ReadApiKeysJson,
	"external_backends.json": ReadExternalBackendsJson,
}

func (c *configFileHandler) Watch() error {
	configWatcher, err := fsnotify.NewWatcher()
	c.watcher = configWatcher
	if err != nil {
		log.Fatal().Err(err).Str("configdir", c.configDir).Msg("unable to create a watcher for configuration directory")
func WatchConfigDirectory(configDir string, appConfig *config.ApplicationConfig) (WatchConfigDirectoryCloser, error) {
	if len(configDir) == 0 {
		return nil, fmt.Errorf("configDir blank")
	}

	if c.appConfig.DynamicConfigsDirPollInterval > 0 {
		log.Debug().Msg("Poll interval set, falling back to polling for configuration changes")
		ticker := time.NewTicker(c.appConfig.DynamicConfigsDirPollInterval)
		go func() {
			for {
				<-ticker.C
				for file, handler := range c.handlers {
					log.Debug().Str("file", file).Msg("polling config file")
					c.callHandler(filepath.Join(c.appConfig.DynamicConfigsDir, file), handler)
				}
			}
		}()
	configWatcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatal().Msgf("Unable to create a watcher for the LocalAI Configuration Directory: %+v", err)
	}
	ret := func() error {
		configWatcher.Close()
		return nil
	}

	// Start listening for events.
	go func() {
		for {
			select {
			case event, ok := <-c.watcher.Events:
			case event, ok := <-configWatcher.Events:
				if !ok {
					return
				}
				if event.Has(fsnotify.Write | fsnotify.Create | fsnotify.Remove) {
					handler, ok := c.handlers[path.Base(event.Name)]
					if !ok {
						continue
				if event.Has(fsnotify.Write) {
					for targetName, watchFn := range CONFIG_FILE_UPDATES {
						if event.Name == targetName {
							err := watchFn(configDir, appConfig)
							log.Warn().Msgf("WatchConfigDirectory goroutine for %s: failed to update options: %+v", targetName, err)
						}
					}

					c.callHandler(event.Name, handler)
				}
			case err, ok := <-c.watcher.Errors:
				log.Error().Err(err).Msg("config watcher error received")
			case _, ok := <-configWatcher.Errors:
				if !ok {
					return
				}
				log.Error().Err(err).Msg("error encountered while watching config directory")
			}
		}
	}()

	// Add a path.
	err = c.watcher.Add(c.appConfig.DynamicConfigsDir)
	err = configWatcher.Add(configDir)
	if err != nil {
		return fmt.Errorf("unable to establish watch on the LocalAI Configuration Directory: %+v", err)
		return ret, fmt.Errorf("unable to establish watch on the LocalAI Configuration Directory: %+v", err)
	}

	return nil
}

// TODO: When we institute graceful shutdown, this should be called
func (c *configFileHandler) Stop() {
	c.watcher.Close()
}

func readApiKeysJson(startupAppConfig config.ApplicationConfig) fileHandler {
	handler := func(fileContent []byte, appConfig *config.ApplicationConfig) error {
		log.Debug().Msg("processing api keys runtime update")
		log.Trace().Int("numKeys", len(startupAppConfig.ApiKeys)).Msg("api keys provided at startup")

		if len(fileContent) > 0 {
			// Parse JSON content from the file
			var fileKeys []string
			err := json.Unmarshal(fileContent, &fileKeys)
			if err != nil {
				return err
			}

			log.Trace().Int("numKeys", len(fileKeys)).Msg("discovered API keys from api keys dynamic config file")

			appConfig.ApiKeys = append(startupAppConfig.ApiKeys, fileKeys...)
		} else {
			log.Trace().Msg("no API keys discovered from dynamic config file")
			appConfig.ApiKeys = startupAppConfig.ApiKeys
		}
		log.Trace().Int("numKeys", len(appConfig.ApiKeys)).Msg("total api keys after processing")
		return nil
	}

	return handler
}

func readExternalBackendsJson(startupAppConfig config.ApplicationConfig) fileHandler {
	handler := func(fileContent []byte, appConfig *config.ApplicationConfig) error {
		log.Debug().Msg("processing external_backends.json")

		if len(fileContent) > 0 {
			// Parse JSON content from the file
			var fileBackends map[string]string
			err := json.Unmarshal(fileContent, &fileBackends)
			if err != nil {
				return err
			}
			appConfig.ExternalGRPCBackends = startupAppConfig.ExternalGRPCBackends
			err = mergo.Merge(&appConfig.ExternalGRPCBackends, &fileBackends)
			if err != nil {
				return err
			}
		} else {
			appConfig.ExternalGRPCBackends = startupAppConfig.ExternalGRPCBackends
		}
		log.Debug().Msg("external backends loaded from external_backends.json")
		return nil
	}
	return handler
	return ret, nil
}
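Both of the dynamic configuration files handled above are plain JSON: api_keys.json is parsed into a `[]string` and external_backends.json into a `map[string]string`. The standalone sketch below illustrates the expected shapes; the keys and backend address are made up for illustration.

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Hypothetical api_keys.json: a flat JSON array of API keys.
	apiKeysJSON := []byte(`["sk-example-key-1", "sk-example-key-2"]`)
	var apiKeys []string
	if err := json.Unmarshal(apiKeysJSON, &apiKeys); err != nil {
		panic(err)
	}
	fmt.Println("api keys:", apiKeys)

	// Hypothetical external_backends.json: backend name -> gRPC backend address.
	externalBackendsJSON := []byte(`{"my-backend": "127.0.0.1:9000"}`)
	var externalBackends map[string]string
	if err := json.Unmarshal(externalBackendsJSON, &externalBackends); err != nil {
		panic(err)
	}
	fmt.Println("external backends:", externalBackends)
}
```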
@@ -4,85 +4,102 @@ import (
	"fmt"
	"os"

	"github.com/go-skynet/LocalAI/core"
	"github.com/go-skynet/LocalAI/core/backend"
	"github.com/go-skynet/LocalAI/core/config"
	openaiendpoint "github.com/go-skynet/LocalAI/core/http/endpoints/openai" // TODO: This is dubious. Fix this when splitting assistant api up.
	"github.com/go-skynet/LocalAI/core/services"
	"github.com/go-skynet/LocalAI/internal"
	"github.com/go-skynet/LocalAI/pkg/assets"
	"github.com/go-skynet/LocalAI/pkg/model"
	pkgStartup "github.com/go-skynet/LocalAI/pkg/startup"
	"github.com/go-skynet/LocalAI/pkg/utils"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) {
// (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) {
func Startup(opts ...config.AppOption) (*core.Application, error) {
	options := config.NewApplicationConfig(opts...)

	zerolog.SetGlobalLevel(zerolog.InfoLevel)
	if options.Debug {
		zerolog.SetGlobalLevel(zerolog.DebugLevel)
	}

	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath)
	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())

	// Make sure directories exist
	if options.ModelPath == "" {
		return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty")
		return nil, fmt.Errorf("options.ModelPath cannot be empty")
	}
	err := os.MkdirAll(options.ModelPath, 0750)
	err := os.MkdirAll(options.ModelPath, 0755)
	if err != nil {
		return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err)
		return nil, fmt.Errorf("unable to create ModelPath: %q", err)
	}
	if options.ImageDir != "" {
		err := os.MkdirAll(options.ImageDir, 0750)
		err := os.MkdirAll(options.ImageDir, 0755)
		if err != nil {
			return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err)
			return nil, fmt.Errorf("unable to create ImageDir: %q", err)
		}
	}
	if options.AudioDir != "" {
		err := os.MkdirAll(options.AudioDir, 0750)
		err := os.MkdirAll(options.AudioDir, 0755)
		if err != nil {
			return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err)
			return nil, fmt.Errorf("unable to create AudioDir: %q", err)
		}
	}
	if options.UploadDir != "" {
		err := os.MkdirAll(options.UploadDir, 0750)
		err := os.MkdirAll(options.UploadDir, 0755)
		if err != nil {
			return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err)
			return nil, fmt.Errorf("unable to create UploadDir: %q", err)
		}
	}
	if options.ConfigsDir != "" {
		err := os.MkdirAll(options.ConfigsDir, 0755)
		if err != nil {
			return nil, fmt.Errorf("unable to create ConfigsDir: %q", err)
		}
	}

	//
	pkgStartup.PreloadModelsConfigurations(options.ModelLibraryURL, options.ModelPath, options.ModelsURL...)
	// Load config jsons
	utils.LoadConfig(options.UploadDir, openaiendpoint.UploadedFilesFile, &openaiendpoint.UploadedFiles)
	utils.LoadConfig(options.ConfigsDir, openaiendpoint.AssistantsConfigFile, &openaiendpoint.Assistants)
	utils.LoadConfig(options.ConfigsDir, openaiendpoint.AssistantsFileConfigFile, &openaiendpoint.AssistantFiles)

	cl := config.NewBackendConfigLoader()
	ml := model.NewModelLoader(options.ModelPath)
	app := createApplication(options)

	configLoaderOpts := options.ToConfigLoaderOptions()
	services.PreloadModelsConfigurations(options.ModelLibraryURL, options.ModelPath, options.ModelsURL...)

	if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil {
	if err := app.BackendConfigLoader.LoadBackendConfigsFromPath(options.ModelPath, app.ApplicationConfig.ToConfigLoaderOptions()...); err != nil {
		log.Error().Err(err).Msg("error loading config files")
	}

	if options.ConfigFile != "" {
		if err := cl.LoadBackendConfigFile(options.ConfigFile, configLoaderOpts...); err != nil {
		if err := app.BackendConfigLoader.LoadBackendConfigFile(options.ConfigFile, app.ApplicationConfig.ToConfigLoaderOptions()...); err != nil {
			log.Error().Err(err).Msg("error loading config file")
		}
	}

	if err := cl.Preload(options.ModelPath); err != nil {
	if err := app.BackendConfigLoader.Preload(options.ModelPath); err != nil {
		log.Error().Err(err).Msg("error downloading models")
	}

	if options.PreloadJSONModels != "" {
		if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
			return nil, nil, nil, err
		if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, app.BackendConfigLoader, options.Galleries); err != nil {
			return nil, err
		}
	}

	if options.PreloadModelsFromPath != "" {
		if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
			return nil, nil, nil, err
		if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, app.BackendConfigLoader, options.Galleries); err != nil {
			return nil, err
		}
	}

	if options.Debug {
		for _, v := range cl.ListBackendConfigs() {
			cfg, _ := cl.GetBackendConfig(v)
		for _, v := range app.BackendConfigLoader.ListBackendConfigs() {
			cfg, _ := app.BackendConfigLoader.GetBackendConfig(v)
			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
		}
	}
@@ -100,17 +117,17 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode
	go func() {
		<-options.Context.Done()
		log.Debug().Msgf("Context canceled, shutting down")
		ml.StopAllGRPC()
		app.ModelLoader.StopAllGRPC()
	}()

	if options.WatchDog {
		wd := model.NewWatchDog(
			ml,
			app.ModelLoader,
			options.WatchDogBusyTimeout,
			options.WatchDogIdleTimeout,
			options.WatchDogBusy,
			options.WatchDogIdle)
		ml.SetWatchDog(wd)
		app.ModelLoader.SetWatchDog(wd)
		go wd.Run()
		go func() {
			<-options.Context.Done()
@@ -119,14 +136,36 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode
		}()
	}

	// Watch the configuration directory
	// If the directory does not exist, we don't watch it
	configHandler := newConfigFileHandler(options)
	err = configHandler.Watch()
	if err != nil {
		log.Error().Err(err).Msg("error establishing configuration directory watcher")
	log.Info().Msg("core/startup process completed!")
	return app, nil
}

// In lieu of a proper DI framework, this function wires up the Application manually.
// This is in core/startup rather than core/state.go to keep package references clean!
func createApplication(appConfig *config.ApplicationConfig) *core.Application {
	app := &core.Application{
		ApplicationConfig:   appConfig,
		BackendConfigLoader: config.NewBackendConfigLoader(),
		ModelLoader:         model.NewModelLoader(appConfig.ModelPath),
	}

	log.Info().Msg("core/startup process completed!")
	return cl, ml, options, nil
	var err error

	app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
	app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
	app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
	app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
	app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)

	app.BackendMonitorService = services.NewBackendMonitorService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
	app.GalleryService = services.NewGalleryService(app.ApplicationConfig.ModelPath)
	app.ListModelsService = services.NewListModelsService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig)
	app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService)

	app.LocalAIMetricsService, err = services.NewLocalAIMetricsService()
	if err != nil {
		log.Warn().Msg("Unable to initialize LocalAIMetricsService - non-fatal, optional service")
	}

	return app
}
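For orientation, the sketch below shows roughly how a caller would use the refactored Startup signature to obtain the wired-up *core.Application. The option helpers used here (config.WithModelPath, config.WithDebug) are assumptions for illustration and may not match the actual config.AppOption constructors.

```go
package main

import (
	"log"

	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/startup"
)

func main() {
	// Assumed option helpers; the real constructors live in core/config
	// and may be named differently.
	app, err := startup.Startup(
		config.WithModelPath("./models"),
		config.WithDebug(true),
	)
	if err != nil {
		log.Fatalf("startup failed: %v", err)
	}

	// The returned *core.Application bundles the loaders and services
	// wired up by createApplication above.
	_ = app.BackendConfigLoader
	_ = app.ModelLoader
}
```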
41
core/state.go
Normal file
@@ -0,0 +1,41 @@
package core

import (
	"github.com/go-skynet/LocalAI/core/backend"
	"github.com/go-skynet/LocalAI/core/config"
	"github.com/go-skynet/LocalAI/core/services"
	"github.com/go-skynet/LocalAI/pkg/model"
)

// TODO: Can I come up with a better name or location for this?
// The purpose of this structure is to hold pointers to all initialized services, to make plumbing easy
// Perhaps a proper DI system is worth it in the future, but for now keep things simple.
type Application struct {

	// Application-Level Config
	ApplicationConfig *config.ApplicationConfig
	// ApplicationState *ApplicationState

	// Core Low-Level Services
	BackendConfigLoader *config.BackendConfigLoader
	ModelLoader         *model.ModelLoader

	// Backend Services
	EmbeddingsBackendService      *backend.EmbeddingsBackendService
	ImageGenerationBackendService *backend.ImageGenerationBackendService
	LLMBackendService             *backend.LLMBackendService
	TranscriptionBackendService   *backend.TranscriptionBackendService
	TextToSpeechBackendService    *backend.TextToSpeechBackendService

	// LocalAI System Services
	BackendMonitorService *services.BackendMonitorService
	GalleryService        *services.GalleryService
	ListModelsService     *services.ListModelsService
	LocalAIMetricsService *services.LocalAIMetricsService
	OpenAIService         *services.OpenAIService
}

// TODO [NEXT PR?]: Break up ApplicationConfig.
// Migrate over stuff that is not set via config at all - especially runtime stuff
type ApplicationState struct {
}
@@ -402,7 +402,6 @@ In the help text below, BASEPATH is the location that local-ai is being executed
| --upload-path | /tmp/localai/upload | Path to store uploads from files api | $LOCALAI_UPLOAD_PATH |
| --config-path | /tmp/localai/config | | $LOCALAI_CONFIG_PATH |
| --localai-config-dir | BASEPATH/configuration | Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json) | $LOCALAI_CONFIG_DIR |
| --localai-config-dir-poll-interval | | Typically the config path picks up changes automatically, but if your system has broken fsnotify events, set this to a time duration to poll the LocalAI Config Dir (example: 1m) | $LOCALAI_CONFIG_DIR_POLL_INTERVAL |
| --models-config-file | STRING | YAML file containing a list of model backend configs | $LOCALAI_MODELS_CONFIG_FILE |

#### Models Flags
@@ -12,7 +12,7 @@ Section under construction
This section contains instructions on how to use LocalAI with GPU acceleration.

{{% alert icon="⚡" context="warning" %}}
Acceleration for AMD or Metal HW is still in development; for additional details see the [build]({{%relref "docs/getting-started/build#Acceleration" %}})
For acceleration on AMD or Metal HW there are no specific container images; see the [build]({{%relref "docs/getting-started/build#Acceleration" %}})
{{% /alert %}}

@@ -110,143 +110,6 @@ llama_model_load_internal: total VRAM used: 1598 MB
llama_init_from_file: kv self size = 512.00 MB
```

## ROCm (AMD) acceleration

There are a limited number of tested configurations for ROCm systems; however, most newer dedicated consumer-grade GPUs seem to be supported under the current ROCm 6 implementation.

Due to the nature of ROCm it is best to run all implementations in containers, as this limits the number of packages required for installation on the host system. Compatibility and package versions for dependencies across all variations of OS must be tested independently if desired; please refer to the [build]({{%relref "docs/getting-started/build#Acceleration" %}}) documentation.

### Requirements

- `ROCm 6.x.x` compatible GPU/accelerator
- OS: `Ubuntu` (22.04, 20.04), `RHEL` (9.3, 9.2, 8.9, 8.8), `SLES` (15.5, 15.4)
- Installed to host: `amdgpu-dkms` and `rocm` >=6.0.0 as per ROCm documentation.

### Recommendations

- Do not use on a system running Wayland.
- If running with Xorg, do not use the GPU assigned for compute for desktop rendering.
- Ensure at least 100GB of free space on the disk hosting the container runtime and storing images prior to installation.

### Limitations

Verification testing of ROCm compatibility with the integrated backends is ongoing.
Please note the following list of verified backends and devices.

### Verified

The devices in the following list have been tested with `hipblas` images running `ROCm 6.0.0`.

| Backend | Verified | Devices |
| ---- | ---- | ---- |
| llama.cpp | yes | Radeon VII (gfx906) |
| diffusers | yes | Radeon VII (gfx906) |
| piper | yes | Radeon VII (gfx906) |
| whisper | no | none |
| autogptq | no | none |
| bark | no | none |
| coqui | no | none |
| transformers | no | none |
| exllama | no | none |
| exllama2 | no | none |
| mamba | no | none |
| petals | no | none |
| sentencetransformers | no | none |
| transformers-musicgen | no | none |
| vall-e-x | no | none |
| vllm | no | none |

**You can help by expanding this list.**

### System Prep

1. Check that your GPU LLVM target is compatible with the version of ROCm. This can be found in the [LLVM Docs](https://llvm.org/docs/AMDGPUUsage.html).
2. Check which ROCm version is compatible with your LLVM target and your chosen OS (pay special attention to supported kernel versions). See the following for compatibility: ([ROCm 6.0.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.0.0/reference/system-requirements.html)) or ([ROCm 6.0.2](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html))
3. Install your chosen version of `dkms` and `rocm` (it is recommended that the native package manager be used for this process on any OS, as version changes are executed more easily via this method if updates are required). Take care to restart after installing `amdgpu-dkms` and before installing `rocm`; for details see the installation documentation for your chosen OS ([6.0.2](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/native-install/index.html) or [6.0.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.0.0/how-to/native-install/index.html))
4. Deploy. Yes, it's that easy.

#### Setup Example (Docker/containerd)

The following are examples of the ROCm-specific configuration elements required.

```yaml
# docker-compose.yaml
# For full functionality select a non-'core' image; version locking the image is recommended for debug purposes.
image: quay.io/go-skynet/local-ai:master-aio-gpu-hipblas
environment:
  - DEBUG=true
  # If your GPU is not already included in the current list of default targets, the following build details are required.
  - REBUILD=true
  - BUILD_TYPE=hipblas
  - GPU_TARGETS=gfx906 # Example for Radeon VII
devices:
  # AMD GPUs only require the following devices be passed through to the container for offloading to occur.
  - /dev/dri
  - /dev/kfd
```

The same can also be executed as a `run` for your container runtime:

```
docker run \
 -e DEBUG=true \
 -e REBUILD=true \
 -e BUILD_TYPE=hipblas \
 -e GPU_TARGETS=gfx906 \
 --device /dev/dri \
 --device /dev/kfd \
 quay.io/go-skynet/local-ai:master-aio-gpu-hipblas
```

Please ensure you add all other required environment variables, port forwards, etc. to your `compose` file or `run` command.

The rebuild process will take some time to complete when deploying these containers, and it is recommended that you `pull` the image prior to deployment, as depending on the version these images may be ~20GB in size.

#### Example (k8s) (Advanced Deployment/WIP)

For k8s deployments there is an additional step required before deployment: the [ROCm/k8s-device-plugin](https://artifacthub.io/packages/helm/amd-gpu-helm/amd-gpu) must be deployed first.
For any k8s environment, following the documentation provided by AMD for the ROCm project should be sufficient. If you use rke2 or OpenShift, it is recommended that you deploy the SUSE- or Red Hat-provided version of this resource to ensure compatibility.
After this has been completed the [helm chart from go-skynet](https://github.com/go-skynet/helm-charts) can be configured and deployed mostly unedited.

The following are details of the changes that should be made to ensure proper function.
While these details may be configurable in `values.yaml`, development of this Helm chart is ongoing and is subject to change.

The following details indicate the final state of the LocalAI deployment relevant to GPU function.

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {NAME}-local-ai
...
spec:
  ...
  template:
    ...
    spec:
      containers:
        - env:
            - name: HIP_VISIBLE_DEVICES
              value: '0'
              # This variable indicates the devices available to the container (0:device1 1:device2 2:device3) etc.
              # For multiple devices (say device 1 and 3) the value would be equivalent to HIP_VISIBLE_DEVICES="0,2"
              # Please take note of this when an iGPU is present in the host system, as compatibility is not assured.
          ...
          resources:
            limits:
              amd.com/gpu: '1'
            requests:
              amd.com/gpu: '1'
```

This configuration has been tested on a 'custom' cluster managed by SUSE Rancher that was deployed on top of Ubuntu 22.04.4; certification of other configurations is ongoing and compatibility is not guaranteed.

### Notes

- When installing the ROCm kernel driver on your system, ensure that you are installing an equal or newer version than that which is currently implemented in LocalAI (6.0.0 at the time of writing).
  - AMD documentation indicates that this will ensure functionality; however, your mileage may vary depending on the GPU and distro you are using.
- If you encounter an `Error 413` when attempting to upload an audio file or image for whisper or llava/bakllava on a k8s deployment, note that the ingress for your deployment may require the annotation `nginx.ingress.kubernetes.io/proxy-body-size: "25m"` to allow larger uploads. This may be included in future versions of the helm chart.

## Intel acceleration (sycl)

### Requirements

@@ -257,10 +257,6 @@ parameters:
  # swap_space: 2
  # Uncomment to specify the maximum length of a sequence (including prompt and output)
  # max_model_len: 32768
  # Uncomment and specify the number of tensor divisions.
  # Allows you to partition and run large models. Performance gains are limited.
  # https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2
```

The backend will automatically download the required files in order to run the model.
@@ -360,4 +356,4 @@ template:

  completion: |
    {{.Input}}
```
Some files were not shown because too many files have changed in this diff.