⬆️ Update ggerganov/llama.cpp (#1897)

Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>
⬆️ Update ggerganov/whisper.cpp (#1896)
2026-02-03 19:22:39 -05:00 · 2024-03-26 00:34:10 +01:00 · 2024-03-26 00:33:46 +01:00 · 2024-03-25 21:04:32 +00:00 · 2024-03-25 18:36:18 +01:00 · 2024-03-25 08:32:40 +01:00
178 changed files with 8405 additions and 2802 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,3 +3,4 @@ models
 examples/chatbot-ui/models
 examples/rwkv/models
 examples/**/models
+Dockerfile*
--- a/.editorconfig
+++ b/.editorconfig
@@ -0,0 +1,31 @@
+
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.go]
+indent_style = tab
+
+[Makefile]
+indent_style = tab
+
+[*.proto]
+indent_size = 2
+
+[*.py]
+indent_size = 4
+
+[*.js]
+indent_size = 2
+
+[*.yaml]
+indent_size = 2
+
+[*.md]
+trim_trailing_whitespace = false
--- a/.env
+++ b/.env
@@ -18,7 +18,7 @@

 ## Default path for models
 #
-MODELS_PATH=/models
+# MODELS_PATH=/models

 ## Enable debug mode
 # DEBUG=true
--- a/.github/release.yml
+++ b/.github/release.yml
@@ -12,13 +12,23 @@ changelog:
    - title: "Bug fixes :bug:"
      labels:
        - bug
+        - regression
    - title: Exciting New Features 🎉
      labels:
        - Semver-Minor
        - enhancement
+        - ux
+        - roadmap
+    - title: 🧠 Models
+      labels:
+        - area/ai-model
+    - title: 📖 Documentation and examples
+      labels:
+        - kind/documentation
+        - examples
    - title: 👒 Dependencies
      labels:
        - dependencies
    - title: Other Changes
      labels:
-        - "*"
+        - "*"
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -22,6 +22,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      makeflags: "-j3"
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -59,6 +60,14 @@ jobs:
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            runs-on: 'arc-runner-set'
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: 'sycl-f16-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
  core-image-build:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -72,6 +81,7 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      makeflags: "-j3"
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -105,4 +115,4 @@ jobs:
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'ubuntu-latest'
-            base-image: "ubuntu:22.04"
+            base-image: "ubuntu:22.04"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -13,7 +13,7 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  extras-image-build:
+  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
@@ -26,6 +26,8 @@ jobs:
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      aio: ${{ matrix.aio }}
+      makeflags: "-j3"
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -37,6 +39,7 @@ jobs:
      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
      matrix:
        include:
+          # Extra images
          - build-type: ''
            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
@@ -48,7 +51,7 @@ jobs:
            base-image: "ubuntu:22.04"
          - build-type: ''
            platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
@@ -78,22 +81,24 @@ jobs:
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
            tag-suffix: '-cublas-cuda11-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
+            aio: "-aio-gpu-nvidia-cuda-11"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "1"
            platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
            tag-suffix: '-cublas-cuda12-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
+            aio: "-aio-gpu-nvidia-cuda-12"
          - build-type: ''
            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
@@ -105,10 +110,11 @@ jobs:
            runs-on: 'arc-runner-set'
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
            tag-suffix: '-hipblas-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
+            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            runs-on: 'arc-runner-set'
          - build-type: 'hipblas'
@@ -119,51 +125,25 @@ jobs:
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
            runs-on: 'arc-runner-set'
-  core-image-build:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-      base-image: ${{ matrix.base-image }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      matrix:
-        include:
-          - build-type: 'hipblas'
+          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg-core'
+            tag-latest: 'auto'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f16-ffmpeg'
            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            image-type: 'extras'
            runs-on: 'arc-runner-set'
-          - build-type: 'hipblas'
+            aio: "-aio-gpu-intel-f16"
+          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
-            runs-on: 'arc-runner-set'
-          - build-type: ''
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg-core'
+            tag-latest: 'auto'
+            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
+            tag-suffix: '-sycl-f32-ffmpeg'
            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "ubuntu:22.04"
-            runs-on: 'ubuntu-latest'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            aio: "-aio-gpu-intel-f32"
+          # Core images
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
@@ -196,6 +176,55 @@ jobs:
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            runs-on: 'arc-runner-set'
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
+            runs-on: 'arc-runner-set'
+  
+  core-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      aio: ${{ matrix.aio }}
+      base-image: ${{ matrix.base-image }}
+      makeflags: "-j3"
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "ubuntu:22.04"
+            runs-on: 'ubuntu-latest'
+            aio: "-aio-cpu"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -46,6 +46,16 @@ on:
        required: true
        default: ''
        type: string
+      makeflags:
+        description: 'Make Flags'
+        required: false
+        default: ''
+        type: string
+      aio:
+        description: 'AIO Image Name'
+        required: false
+        default: ''
+        type: string
    secrets:
      dockerUsername:
        required: true
@@ -124,7 +134,32 @@ jobs:
          flavor: |
            latest=${{ inputs.tag-latest }}
            suffix=${{ inputs.tag-suffix }}
-
+      - name: Docker meta AIO (quay.io)
+        if: inputs.aio != ''
+        id: meta_aio
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/local-ai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.aio }}
+      - name: Docker meta AIO (dockerhub)
+        if: inputs.aio != ''
+        id: meta_aio_dockerhub
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            localai/localai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.aio }}
      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
        with:
@@ -160,12 +195,51 @@ jobs:
            FFMPEG=${{ inputs.ffmpeg }}
            IMAGE_TYPE=${{ inputs.image-type }}
            BASE_IMAGE=${{ inputs.base-image }}
+            MAKEFLAGS=${{ inputs.makeflags }}
          context: .
          file: ./Dockerfile
          platforms: ${{ inputs.platforms }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
+      -
+        name: Inspect image
+        if: github.event_name != 'pull_request'
+        run: |
+          docker pull localai/localai:${{ steps.meta.outputs.version }}
+          docker image inspect localai/localai:${{ steps.meta.outputs.version }}
+          docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
+          docker image inspect quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
+      - name: Build and push AIO image
+        if: inputs.aio != ''
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
+          context: .
+          file: ./Dockerfile.aio
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta_aio.outputs.tags }}
+          labels: ${{ steps.meta_aio.outputs.labels }}
+      - name: Build and push AIO image (dockerhub)
+        if: inputs.aio != ''
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=localai/localai:${{ steps.meta.outputs.version }}
+          context: .
+          file: ./Dockerfile.aio
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
+          labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
      - name: job summary
        run: |
          echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
+      - name: job summary(AIO)
+        if: inputs.aio != ''
+        run: |
+          echo "Built image: ${{ steps.meta_aio.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -89,6 +89,35 @@ jobs:
          files: |
            release/*

+  build-stablediffusion:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
+      - name: Dependencies
+        run: |
+          sudo apt-get install -y --no-install-recommends libopencv-dev
+          sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+      - name: Build stablediffusion
+        run: |
+          make backend-assets/grpc/stablediffusion
+          mkdir -p release && cp backend-assets/grpc/stablediffusion release
+      - uses: actions/upload-artifact@v3
+        with:
+          name: stablediffusion
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v1
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+
  build-macOS:
    strategy:
      matrix:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -105,9 +105,65 @@ jobs:
      - name: Test
        run: |
          GO_TAGS="stablediffusion tts" make test
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3
+        timeout-minutes: 5
+
+  tests-aio-container:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Build images
+        run: |
+          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=core -t local-ai:tests -f Dockerfile .
+          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
+      - name: Test
+        run: |
+          LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+            make run-e2e-aio
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3
+        timeout-minutes: 5

  tests-apple:
-    runs-on: macOS-latest
+    runs-on: macOS-14
    strategy:
      matrix:
        go-version: ['1.21.x']
@@ -130,4 +186,8 @@ jobs:
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
-          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
+          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3
+        timeout-minutes: 5
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ local-ai
 !charts/*
 # prevent above rules from omitting the api/localai folder
 !api/localai
+!core/**/localai

 # Ignore models
 models/*
@@ -34,6 +35,7 @@ release/
 .idea

 # Generated during build
-backend-assets/
+backend-assets/*
+!backend-assets/.keep
 prepare
 /ggml-metal.metal
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,8 @@ ARG BASE_IMAGE=ubuntu:22.04
 # extras or core
 FROM ${BASE_IMAGE} as requirements-core

+USER root
+
 ARG GO_VERSION=1.21.7
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
@@ -21,7 +23,7 @@ RUN apt-get update && \
    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean

 # Install Go
-RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -v -C /usr/local -xz
+RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
 ENV PATH $PATH:/usr/local/go/bin

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
@@ -61,7 +63,9 @@ WORKDIR /build
 RUN test -n "$TARGETARCH" \
    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')

-# Extras requirements
+###################################
+###################################
+
 FROM requirements-core as requirements-extras

 RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
@@ -79,6 +83,10 @@ RUN pip install --upgrade pip
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 RUN apt-get install -y espeak-ng espeak && apt-get clean

+RUN if [ ! -e /usr/bin/python ]; then \
+	  ln -s /usr/bin/python3 /usr/bin/python \
+    ; fi
+
 ###################################
 ###################################

@@ -87,8 +95,11 @@ FROM requirements-${IMAGE_TYPE} as builder
 ARG GO_TAGS="stablediffusion tts"
 ARG GRPC_BACKENDS
 ARG BUILD_GRPC=true
+ARG MAKEFLAGS
+
 ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
+ENV MAKEFLAGS=${MAKEFLAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
@@ -97,16 +108,24 @@ WORKDIR /build

 COPY . .
 COPY .git .
+RUN echo "GO_TAGS: $GO_TAGS"
 RUN make prepare

+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
+    apt-get update && \
+    apt-get install -y libclblast-dev && \
+    apt-get clean \
+    ; fi
+
 # stablediffusion does not tolerate a newer version of abseil, build it first
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

 RUN if [ "${BUILD_GRPC}" = "true" ]; then \
-    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    git clone --recurse-submodules --jobs 4 -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
      -DgRPC_BUILD_TESTS=OFF \
-       ../.. && make -j12 install \
+       ../.. && make install \
    ; fi

 # Rebuild with defaults backends
@@ -126,10 +145,12 @@ ARG FFMPEG
 ARG BUILD_TYPE
 ARG TARGETARCH
 ARG IMAGE_TYPE=extras
+ARG MAKEFLAGS

 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
+ENV MAKEFLAGS=${MAKEFLAGS}

 ARG CUDA_MAJOR_VERSION=11
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
@@ -142,6 +163,13 @@ RUN if [ "${FFMPEG}" = "true" ]; then \
    apt-get install -y ffmpeg && apt-get clean \
    ; fi

+# Add OpenCL
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
+    apt-get update && \
+    apt-get install -y libclblast1 && \
+    apt-get clean \
+    ; fi
+
 WORKDIR /build

 # we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
@@ -166,43 +194,43 @@ COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/

 ## Duplicated from Makefile to avoid having a big layer that's hard to push
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
+    make -C backend/python/autogptq \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
+    make -C backend/python/bark \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
+    make -C backend/python/diffusers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
+    make -C backend/python/vllm \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/mamba \
+    make -C backend/python/mamba \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
+    make -C backend/python/sentencetransformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
+    make -C backend/python/transformers \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
+    make -C backend/python/vall-e-x \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
+    make -C backend/python/exllama \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-    PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
+    make -C backend/python/exllama2 \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
+    make -C backend/python/petals \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
+    make -C backend/python/transformers-musicgen \
    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	PATH=$PATH:/opt/conda/bin make -C backend/python/coqui \
+    make -C backend/python/coqui \
    ; fi

 # Make sure the models directory exists
@@ -211,6 +239,7 @@ RUN mkdir -p /build/models
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
-
+  
+VOLUME /build/models
 EXPOSE 8080
 ENTRYPOINT [ "/build/entrypoint.sh" ]
--- a/Dockerfile.aio
+++ b/Dockerfile.aio
@@ -0,0 +1,8 @@
+ARG BASE_IMAGE=ubuntu:22.04
+
+FROM ${BASE_IMAGE} 
+
+RUN apt-get update && apt-get install -y pciutils && apt-get clean
+
+COPY aio/ /aio
+ENTRYPOINT [ "/aio/entrypoint.sh" ]
--- a/Makefile
+++ b/Makefile
@@ -4,11 +4,8 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
-
-GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
-
-CPPLLAMA_VERSION?=fd43d66f46ee3b5345fb8a74a252d86ccd34a409
+GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
+CPPLLAMA_VERSION?=b06c16ef9f81d84da520232c125d4d8a1d273736

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,19 +13,19 @@ GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
-RWKV_VERSION?=633c5a3485c403cb2520693dc0991a25dace9f0f
+RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
+WHISPER_CPP_VERSION?=1558ec5a16cb2b2a0bf54815df1d41f83dc3815b

 # bert.cpp version
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

 # go-piper version
-PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07
+PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759

 # stablediffusion version
-STABLEDIFFUSION_VERSION?=d5d2be8e7e395c2d73ceef61e6fe8d240f2cd831
+STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485

 # tinydream version
 TINYDREAM_VERSION?=772a9c0d9aaf768290e63cca3c904fe69faf677a
@@ -38,12 +35,15 @@ export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export CMAKE_ARGS?=

 CGO_LDFLAGS?=
+CGO_LDFLAGS_WHISPER?=
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=git

 TEST_DIR=/tmp/test

+TEST_FLAKES?=5
+
 RANDOM := $(shell bash -c 'echo $$RANDOM')

 VERSION?=$(shell git describe --always --tags || echo "dev" )
@@ -70,7 +70,7 @@ UNAME_S := $(shell uname -s)
 endif

 ifeq ($(OS),Darwin)
-	CGO_LDFLAGS += -lcblas -framework Accelerate
+	
 	ifeq ($(OSX_SIGNING_IDENTITY),)
 		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
 	endif
@@ -81,6 +81,12 @@ ifeq ($(OS),Darwin)
 	# disable metal if on Darwin and any other value is explicitly passed.
 	else ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DLLAMA_METAL=OFF
+		export LLAMA_NO_ACCELERATE=1
+	endif
+
+	ifeq ($(BUILD_TYPE),metal)
+#			-lcblas 	removed: it seems to always be listed as a duplicate flag.
+		CGO_LDFLAGS += -framework Accelerate
 	endif
 endif

@@ -89,10 +95,12 @@ ifeq ($(BUILD_TYPE),openblas)
 	export WHISPER_OPENBLAS=1
 endif

+
 ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export LLAMA_CUBLAS=1
 	export WHISPER_CUBLAS=1
+	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
 endif

 ifeq ($(BUILD_TYPE),hipblas)
@@ -146,15 +154,16 @@ endif

 ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
+ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
 ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)

 GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
+TEST_PATHS?=./api/... ./pkg/... ./core/...

 # If empty, then we build all
 ifeq ($(GRPC_BACKENDS),)
@@ -165,40 +174,41 @@ ifeq ($(BUILD_API_ONLY),true)
 	GRPC_BACKENDS=
 endif

-.PHONY: all test build vendor
+.PHONY: all test build vendor get-sources prepare-sources prepare

 all: help

-## GPT4ALL
-sources/gpt4all:
-	git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
-	cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
-
-## go-piper
-sources/go-piper:
-	git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
-	cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
-
 ## BERT embeddings
 sources/go-bert:
 	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
 	cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1

-## stable diffusion
-sources/go-stable-diffusion:
-	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
-	cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-bert/libgobert.a: sources/go-bert
+	$(MAKE) -C sources/go-bert libgobert.a

-sources/go-stable-diffusion/libstablediffusion.a:
-	$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
+## go-llama-ggml
+sources/go-llama-ggml:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
+	cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

-## tiny-dream
-sources/go-tiny-dream:
-	git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
-	cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
+sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
+	$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

-sources/go-tiny-dream/libtinydream.a:
-	$(MAKE) -C sources/go-tiny-dream libtinydream.a
+## go-piper
+sources/go-piper:
+	git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
+	cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
+
+sources/go-piper/libpiper_binding.a: sources/go-piper
+	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
+
+## GPT4ALL
+sources/gpt4all:
+	git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
+	cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
+
+sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

 ## RWKV
 sources/go-rwkv:
@@ -208,23 +218,23 @@ sources/go-rwkv:
 sources/go-rwkv/librwkv.a: sources/go-rwkv
 	cd sources/go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

-sources/go-bert/libgobert.a: sources/go-bert
-	$(MAKE) -C sources/go-bert libgobert.a
+## stable diffusion
+sources/go-stable-diffusion:
+	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
+	cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1

-backend-assets/gpt4all: sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
-	mkdir -p backend-assets/gpt4all
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
+sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
+	$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a

-backend-assets/espeak-ng-data: sources/go-piper
-	mkdir -p backend-assets/espeak-ng-data
-	$(MAKE) -C sources/go-piper piper.o
-	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
+## tiny-dream
+sources/go-tiny-dream:
+	git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
+	cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1

-sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
-	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
+sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
+	$(MAKE) -C sources/go-tiny-dream libtinydream.a

+## whisper
 sources/whisper.cpp:
 	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
 	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
@@ -232,47 +242,34 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && make libwhisper.a

-sources/go-llama:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama
-	cd sources/go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-llama-ggml:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
-	cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-llama/libbinding.a: sources/go-llama
-	$(MAKE) -C sources/go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
-
-sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
-	$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
-
-sources/go-piper/libpiper_binding.a: sources/go-piper
-	$(MAKE) -C sources/go-piper libpiper_binding.a example/main
-
-backend/cpp/llama/llama.cpp:
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp	
-
-get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
-	touch $@
+get-sources: sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream

 replace:
-	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
-	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
+	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
+	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
+
+dropreplace:
+	$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
+	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
+	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
+	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
+	$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
+	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
+	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
+	$(GOCMD) mod edit -dropreplace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
-	touch $@

 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama clean
 	$(MAKE) -C sources/go-llama-ggml clean
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C sources/go-rwkv clean
@@ -284,7 +281,6 @@ rebuild: ## Rebuilds the project
 	$(MAKE) build

 prepare: prepare-sources $(OPTIONAL_TARGETS)
-	touch $@

 clean: ## Remove build related file
 	$(GOCMD) clean -cache
@@ -295,10 +291,15 @@ clean: ## Remove build related file
 	rm -rf backend-assets
 	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/cpp/llama clean
+	$(MAKE) dropreplace
+
+clean-tests:
+	rm -rf test-models
+	rm -rf test-dir
+	rm -rf core/http/backend-assets

 ## Build:
-
-build: backend-assets grpcs prepare ## Build the project
+build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@@ -316,10 +317,10 @@ osx-signed: build
 run: prepare ## run local-ai
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./

-test-models/testmodel:
+test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel
+	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -328,15 +329,15 @@ test-models/testmodel:
 	cp tests/models_fixtures/* test-models

 prepare-test: grpcs
-	cp -rf backend-assets api
+	cp -rf backend-assets core/http
 	cp tests/models_fixtures/* test-models

-test: prepare test-models/testmodel grpcs
+test: prepare test-models/testmodel.ggml grpcs
 	@echo 'Running tests'
-	export GO_TAGS="tts stablediffusion"
+	export GO_TAGS="tts stablediffusion debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts 5 --fail-fast -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
@@ -353,6 +354,10 @@ run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests

+run-e2e-aio:
+	@echo 'Running e2e AIO tests'
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
+
 test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
@@ -365,23 +370,28 @@ teardown-e2e:

 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)

 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)

 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)

 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)

 test-stablediffusion: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
+
+test-stores: backend-assets/grpc/local-store
+	mkdir -p tests/integration/backend-assets/grpc
+	cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration

 test-container:
 	docker build --target requirements -t local-ai-test-container .
@@ -451,91 +461,94 @@ ifeq ($(BUILD_API_ONLY),true)
 	touch backend-assets/keep
 endif

-backend-assets/grpc:
+backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_binding.a
+	mkdir -p backend-assets/espeak-ng-data
+	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
+
+backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
+	mkdir -p backend-assets/gpt4all
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
+
+backend-assets/grpc: replace
 	mkdir -p backend-assets/grpc

-backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
-# TODO: every binary should have its own folder instead, so can have different  implementations
-ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
-endif
-
-## BACKEND CPP LLAMA START
-# Sets the variables in case it has to build the gRPC locally.
-INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
-INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
-ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-                 -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
-                 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-                 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-                 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
-
-backend/cpp/llama/grpc-server:
-ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
-	$(MAKE) -C backend/cpp/grpc build
-	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
-	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
-	export PATH="${INSTALLED_PACKAGES}/bin:${PATH}" && \
-	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
-else
-	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server			
-endif
-## BACKEND CPP LLAMA END
-
-##
-backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
-	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
-# TODO: every binary should have its own folder instead, so can have different metal implementations
-ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
-endif
-
-backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-
-backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
-
-backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
-
-backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
+backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/

+backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
+
 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/

-backend-assets/grpc/stablediffusion: backend-assets/grpc
-	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
-		$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
-		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-stable-diffusion/ LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
-		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion; \
-	fi
+backend/cpp/llama/llama.cpp:
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp

-backend-assets/grpc/tinydream: backend-assets/grpc sources/go-tiny-dream/libtinydream.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
+INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
+INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
+ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
+				 -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
+				 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
+				 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
+				 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
+backend/cpp/llama/grpc-server:
+# Conditionally build grpc for the llama backend to use if needed
+ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
+	$(MAKE) -C backend/cpp/grpc build
+	_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
+	_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
+	PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
+	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) \
+	$(MAKE) -C backend/cpp/llama grpc-server
+else
+	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+endif

-backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
+	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+# TODO: every binary should have its own folder instead, so can have different metal implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
+endif
+
+backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libbinding.a backend-assets/grpc
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
+
+backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
 	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/

-backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
+backend-assets/grpc/rwkv: sources/go-rwkv sources/go-rwkv/librwkv.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
+
+backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-stable-diffusion/ LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
+
+backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
+
+backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/

+backend-assets/grpc/local-store: backend-assets/grpc
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/local-store ./backend/go/stores/
+
 grpcs: prepare $(GRPC_BACKENDS)

 DOCKER_IMAGE?=local-ai
+DOCKER_AIO_IMAGE?=local-ai-aio
 IMAGE_TYPE?=core
 BASE_IMAGE?=ubuntu:22.04

@@ -546,6 +559,16 @@ docker:
 		--build-arg GO_TAGS=$(GO_TAGS) \
 		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
 		-t $(DOCKER_IMAGE) .
+	
+docker-aio:
+	@echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)"
+	docker build \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		-t $(DOCKER_AIO_IMAGE) -f Dockerfile.aio .
+
+docker-aio-all:
+	$(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
+	$(MAKE) docker-aio DOCKER_AIO_SIZE=cpu

 docker-image-intel:
 	docker build \
@@ -553,3 +576,10 @@ docker-image-intel:
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
+
+docker-image-intel-xpu:
+	docker build \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="none" \
+		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
--- a/README.md
+++ b/README.md
@@ -20,14 +20,14 @@
 </a>
 </p>

-[<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker">](https://hub.docker.com/r/localai/localai)
-[<img src="https://img.shields.io/badge/quay.io-images-important.svg?">](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
-
-> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
->
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
-
-[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
+<p align="center">
+<a href="https://hub.docker.com/r/localai/localai" target="blank">
+<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
+</a>
+<a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
+<img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
+</a>
+</p>

 <p align="center">
 <a href="https://twitter.com/LocalAI_API" target="blank">
@@ -36,20 +36,24 @@
 <a href="https://discord.gg/uJAeKSAGDy" target="blank">
 <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
 </a>
+</p>

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
+[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
+
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic...) API specifications for local AI inferencing. It allows you to run LLMs and generate images, audio, and more, locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU.

 ## 🔥🔥 Hot topics / Roadmap

 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

+- Vector store: https://github.com/mudler/LocalAI/pull/1795
+- All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
 - Parallel function calling: https://github.com/mudler/LocalAI/pull/1726
 - Upload file API: https://github.com/mudler/LocalAI/pull/1703
 - Tools API support: https://github.com/mudler/LocalAI/pull/1715
 - LLaVa 1.6: https://github.com/mudler/LocalAI/pull/1714
 - ROCm container images: https://github.com/mudler/LocalAI/pull/1595
- Intel GPU support (sycl): https://github.com/mudler/LocalAI/issues/1653
- Deprecation of old backends: https://github.com/mudler/LocalAI/issues/1651
+- Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
 - Mamba support: https://github.com/mudler/LocalAI/pull/1589
 - Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
 - 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
@@ -59,15 +63,21 @@ Hot topics (looking for contributors):
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
 - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
 - Assistant API: https://github.com/mudler/LocalAI/issues/1273
-  
+- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
+- Vulkan: https://github.com/mudler/LocalAI/issues/1647
+
 If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

 ## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)

-For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide. For those in a hurry, here's a straightforward one-liner to launch a LocalAI instance with [phi-2](https://huggingface.co/microsoft/phi-2) using `docker`:
+For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide. 

-```
-docker run -ti -p 8080:8080 localai/localai:v2.7.0-ffmpeg-core phi-2
+For those in a hurry, here's a straightforward one-liner to launch a LocalAI AIO (All-in-One) image using `docker`:
+
+```bash
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
+# or, if you have an Nvidia GPU:
+# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-cuda12
 ```

 ## 🚀 [Features](https://localai.io/features/)
@@ -97,9 +107,6 @@ WebUIs:

 Model galleries
 - https://github.com/go-skynet/model-gallery
-  
-UI / Management Programs
- [LocalAI Manager](https://io.midori-ai.xyz/howtos/easy-model-installer/)

 Other:
 - Helm chart https://github.com/go-skynet/helm-charts
@@ -110,6 +117,7 @@ Other:
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
 - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
 - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
+  

 ### 🔗 Resources

@@ -121,6 +129,8 @@ Other:

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

+- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/ai/answers/tiZMDoZzZV6TLxgDXNBnFE/deploying-helm-charts-on-aws-eks)
+- [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
 - [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
 - [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
 - [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -0,0 +1,42 @@
+# Security Policy
+
+## Introduction
+
+At LocalAI, we take the security of our software seriously. We understand the importance of protecting our community from vulnerabilities and are committed to ensuring the safety and security of our users.
+
+## Supported Versions
+
+We provide support and updates for certain versions of our software. The following table outlines which versions are currently supported with security updates:
+
+| Version | Supported          |
+| ------- | ------------------ |
+| > 2.0   | :white_check_mark: |
+| < 2.0   | :x:                |
+
+Please ensure that you are using a supported version to receive the latest security updates.
+
+## Reporting a Vulnerability
+
+We encourage the responsible disclosure of any security vulnerabilities. If you believe you've found a security issue in our software, we kindly ask you to follow the steps below to report it to us:
+
+1. **Email Us:** Send an email to [security@localai.io](mailto:security@localai.io) with a detailed report. Please do not disclose the vulnerability publicly or to any third parties before it has been addressed by us.
+
+2. **Expect a Response:** We aim to acknowledge receipt of vulnerability reports within 48 hours. Our security team will review your report and work closely with you to understand the impact and ensure a thorough investigation.
+
+3. **Collaboration:** If the vulnerability is accepted, we will work with you and our community to address the issue promptly. We'll keep you informed throughout the resolution process and may request additional information or collaboration.
+
+4. **Disclosure:** Once the vulnerability has been resolved, we encourage a coordinated disclosure. We believe in transparency and will work with you to ensure that our community is informed in a responsible manner.
+
+## Use of Third-Party Platforms
+
+As a Free and Open Source Software (FOSS) organization, we do not offer monetary bounties. However, researchers who wish to report vulnerabilities can also do so via [Huntr](https://huntr.dev/bounties), a platform that recognizes contributions to open source security.
+
+## Contact
+
+For any security-related inquiries beyond vulnerability reporting, please contact us at [security@localai.io](mailto:security@localai.io).
+
+## Acknowledgments
+
+We appreciate the efforts of those who contribute to the security of our project. Your responsible disclosure is invaluable to the safety and integrity of LocalAI.
+
+Thank you for helping us keep LocalAI secure.
--- a/aio/cpu/README.md
+++ b/aio/cpu/README.md
@@ -0,0 +1,5 @@
+## AIO CPU size
+
+Use this image for CPU-only deployments.
+
+Please keep using only C++ backends so the base image is as small as possible (without CUDA, cuDNN, python, etc).
--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -0,0 +1,18 @@
+backend: bert-embeddings
+embeddings: true
+f16: true
+
+gpu_layers: 90
+mmap: true
+name: text-embedding-ada-002
+
+parameters:
+  model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+      "input": "Your text string goes here",
+      "model": "text-embedding-ada-002"
+    }'
--- a/aio/cpu/image-gen.yaml
+++ b/aio/cpu/image-gen.yaml
@@ -0,0 +1,53 @@
+name: stablediffusion
+backend: stablediffusion
+parameters:
+  model: stablediffusion_assets
+
+license: "BSD-3"
+urls:
+- https://github.com/EdVince/Stable-Diffusion-NCNN
+- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
+
+description: |
+     Stable Diffusion in NCNN with c++, supported txt2img and img2img
+
+download_files:
+- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
+  sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
+- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
+  sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
+- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
+  sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
+- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
+  sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
+- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
+  sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
+- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
+  sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
+- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
+  sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
+- filename: "stablediffusion_assets/log_sigmas.bin"
+  sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
+- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
+  sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
+- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
+  sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
+- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
+  sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
+- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
+  sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
+- filename: "stablediffusion_assets/vocab.txt"
+  sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
--- a/aio/cpu/speech-to-text.yaml
+++ b/aio/cpu/speech-to-text.yaml
@@ -0,0 +1,18 @@
+name: whisper-1
+backend: whisper
+parameters:
+  model: ggml-whisper-base.bin
+
+usage: |
+    ## example audio file
+    wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
+
+    ## Send the example audio file to the transcriptions endpoint
+    curl http://localhost:8080/v1/audio/transcriptions \
+         -H "Content-Type: multipart/form-data" \
+         -F file="@$PWD/gb1.ogg" -F model="whisper-1"
+
+download_files:
+- filename: "ggml-whisper-base.bin"
+  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
+  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
--- a/aio/cpu/text-to-speech.yaml
+++ b/aio/cpu/text-to-speech.yaml
@@ -0,0 +1,15 @@
+name: tts-1
+download_files:
+  - filename: voice-en-us-amy-low.tar.gz
+    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
+
+parameters:
+  model: en-us-amy-low.onnx
+
+usage: |
+    To test if this model works as expected, you can use the following curl command:
+
+    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+      "model":"tts-1",
+      "input": "Hi, this is a test."
+    }'
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -0,0 +1,25 @@
+name: gpt-4
+mmap: true
+parameters:
+  model: huggingface://l3utterfly/phi-2-layla-v1-chatml-gguf/phi-2-layla-v1-chatml-Q8_0.gguf
+
+template:
+  chat_message: |
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
+    {{if .Content}}{{.Content}}{{end}}
+    <|im_end|>
+  chat: |
+    {{.Input}}
+    <|im_start|>assistant
+  completion: |
+    {{.Input}}
+context_size: 2048
+f16: true
+stopwords:
+- <|im_end|>
+- <dummy32000>
+usage: |
+      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+          "model": "gpt-4",
+          "messages": [{"role": "user", "content": "How are you doing?"}], "temperature": 0.1
+      }'
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -0,0 +1,40 @@
+backend: llama-cpp
+context_size: 4096
+f16: true
+
+gpu_layers: 90
+mmap: true
+name: gpt-4-vision-preview
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: bakllava-mmproj.gguf
+parameters:
+  model: bakllava.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+  seed: -1
+mirostat: 2
+mirostat_eta: 1.0
+mirostat_tau: 1.0
+
+template:
+  chat: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+    {{.Input}}
+    ASSISTANT:
+
+download_files:
+- filename: bakllava.gguf
+  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
+- filename: bakllava-mmproj.gguf
+  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4-vision-preview",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}]}], "temperature": 0.9}'
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+echo "===> LocalAI All-in-One (AIO) container starting..."
+
+GPU_ACCELERATION=false
+GPU_VENDOR=""
+
+function detect_gpu() {
+    case "$(uname -s)" in
+        Linux)
+            if lspci | grep -E 'VGA|3D' | grep -iq nvidia; then
+                echo "NVIDIA GPU detected"
+                # nvidia-smi should be installed in the container
+                if nvidia-smi; then
+                    GPU_ACCELERATION=true
+                    GPU_VENDOR=nvidia
+                else
+                    echo "NVIDIA GPU detected, but nvidia-smi is not installed. GPU acceleration will not be available."
+                fi
+            elif lspci | grep -E 'VGA|3D' | grep -iq amd; then
+                echo "AMD GPU detected"
+                # Check if ROCm is installed
+                if [ -d /opt/rocm ]; then
+                    GPU_ACCELERATION=true
+                    GPU_VENDOR=amd
+                else
+                    echo "AMD GPU detected, but ROCm is not installed. GPU acceleration will not be available."
+                fi
+            elif lspci | grep -E 'VGA|3D' | grep -iq intel; then
+                echo "Intel GPU detected"
+                if [ -d /opt/intel ]; then
+                    GPU_ACCELERATION=true
+                else
+                    echo "Intel GPU detected, but Intel GPU drivers are not installed. GPU acceleration will not be available."
+                fi
+            elif lspci | grep -E 'VGA|3D' | grep -iq "Microsoft Corporation Device 008e"; then
+                # We make the assumption this WSL2 card is NVIDIA, then check for nvidia-smi
+                # Make sure the container was run with `--gpus all` as the only required parameter
+                echo "NVIDIA GPU detected via WSL2"
+                # nvidia-smi should be installed in the container
+                if nvidia-smi; then
+                    GPU_ACCELERATION=true
+                    GPU_VENDOR=nvidia
+                else
+                    echo "NVIDIA GPU detected via WSL2, but nvidia-smi is not installed. GPU acceleration will not be available."
+                fi
+            fi
+            ;;
+        Darwin)
+            if system_profiler SPDisplaysDataType | grep -iq 'Metal'; then
+                echo "Apple Metal supported GPU detected"
+                GPU_ACCELERATION=true
+                GPU_VENDOR=apple
+            fi
+            ;;
+    esac
+}
+
+function detect_gpu_size() {
+    # Attempting to find GPU memory size for NVIDIA GPUs
+    if [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "nvidia" ]; then
+        echo "NVIDIA GPU detected. Attempting to find memory size..."
+        # Using head -n 1 to get the total memory of the 1st NVIDIA GPU detected.
+        # If handling multiple GPUs is required in the future, this is the place to do it
+        nvidia_sm=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1)
+        if [ ! -z "$nvidia_sm" ]; then
+            echo "Total GPU Memory: $nvidia_sm MiB"
+            # if bigger than 8GB, use 16GB
+            #if [ "$nvidia_sm" -gt 8192 ]; then
+            #    GPU_SIZE=gpu-16g
+            #else
+            GPU_SIZE=gpu-8g
+            #fi
+        else
+            echo "Unable to determine NVIDIA GPU memory size. Defaulting to gpu-8g."
+            GPU_SIZE=gpu-8g
+        fi
+
+    # Default to a generic GPU size until we implement GPU size detection for non NVIDIA GPUs
+    elif [ "$GPU_ACCELERATION" = true ]; then
+        echo "Non-NVIDIA GPU detected. Specific GPU memory size detection is not implemented."
+        GPU_SIZE=gpu-8g
+
+    # default to cpu if GPU_SIZE is not set
+    else
+        echo "GPU acceleration is not enabled or supported. Defaulting to CPU."
+        GPU_SIZE=cpu
+    fi
+}
+
+function check_vars() {
+    if [ -z "$MODELS" ]; then
+        echo "MODELS environment variable is not set. Please set it to a comma-separated list of model YAML files to load."
+        exit 1
+    fi
+
+    if [ -z "$SIZE" ]; then
+        echo "SIZE environment variable is not set. Please set it to one of the following: cpu, gpu-8g, gpu-16g, apple"
+        exit 1
+    fi
+}
+
+detect_gpu
+detect_gpu_size
+
+SIZE="${SIZE:-$GPU_SIZE}" # default to the detected GPU_SIZE (cpu when no usable GPU was found)
+export MODELS="${MODELS:-/aio/${SIZE}/embeddings.yaml,/aio/${SIZE}/text-to-speech.yaml,/aio/${SIZE}/image-gen.yaml,/aio/${SIZE}/text-to-text.yaml,/aio/${SIZE}/speech-to-text.yaml,/aio/${SIZE}/vision.yaml}"
+
+check_vars
+
+echo "Starting LocalAI with the following models: $MODELS"
+
+/build/entrypoint.sh "$@"
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -0,0 +1,13 @@
+name: text-embedding-ada-002
+backend: sentencetransformers
+embeddings: true
+parameters:
+  model: all-MiniLM-L6-v2
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+      "input": "Your text string goes here",
+      "model": "text-embedding-ada-002"
+    }'
--- a/aio/gpu-8g/image-gen.yaml
+++ b/aio/gpu-8g/image-gen.yaml
@@ -0,0 +1,26 @@
+name: stablediffusion
+parameters:
+  model: DreamShaper_8_pruned.safetensors
+backend: diffusers
+step: 25
+f16: true
+
+diffusers:
+  pipeline_type: StableDiffusionPipeline
+  cuda: true
+  enable_parameters: "negative_prompt,num_inference_steps"
+  scheduler_type: "k_dpmpp_2m"
+
+download_files:
+- filename: DreamShaper_8_pruned.safetensors
+  uri: huggingface://Lykon/DreamShaper/DreamShaper_8_pruned.safetensors
+
+usage: |
+        curl http://localhost:8080/v1/images/generations \
+          -H "Content-Type: application/json" \
+          -d '{
+            "prompt": "<positive prompt>|<negative prompt>",
+            "model": "stablediffusion",
+            "step": 25,
+            "size": "512x512"
+          }'
--- a/aio/gpu-8g/speech-to-text.yaml
+++ b/aio/gpu-8g/speech-to-text.yaml
@@ -0,0 +1,18 @@
+name: whisper-1
+backend: whisper
+parameters:
+  model: ggml-whisper-base.bin
+
+usage: |
+    ## example audio file
+    wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
+
+    ## Send the example audio file to the transcriptions endpoint
+    curl http://localhost:8080/v1/audio/transcriptions \
+         -H "Content-Type: multipart/form-data" \
+         -F file="@$PWD/gb1.ogg" -F model="whisper-1"
+
+download_files:
+- filename: "ggml-whisper-base.bin"
+  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
+  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
--- a/aio/gpu-8g/text-to-speech.yaml
+++ b/aio/gpu-8g/text-to-speech.yaml
@@ -0,0 +1,15 @@
+name: tts-1
+download_files:
+  - filename: voice-en-us-amy-low.tar.gz
+    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
+
+parameters:
+  model: en-us-amy-low.onnx
+
+usage: |
+    To test if this model works as expected, you can use the following curl command:
+
+    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+      "model":"tts-1",
+      "input": "Hi, this is a test."
+    }'
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -0,0 +1,51 @@
+name: gpt-4
+mmap: true
+parameters:
+  model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
+
+roles:
+  assistant_function_call: assistant
+  function: tool
+template:
+  chat_message: |
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "function"}}{{.Role}}{{else if eq .RoleName "user"}}user{{end}}
+    {{ if eq .RoleName "assistant_function_call" }}<tool_call>{{end}}
+    {{ if eq .RoleName "function" }}<tool_result>{{end}}
+    {{if .Content}}{{.Content}}{{end}}
+    {{if .FunctionCall}}{{toJson .FunctionCall}}{{end}}
+    {{ if eq .RoleName "assistant_function_call" }}</tool_call>{{end}}
+    {{ if eq .RoleName "function" }}</tool_result>{{end}}
+    <|im_end|>
+  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
+  function: |
+    <|im_start|>system
+    You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: 
+    <tools>
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    </tools> 
+    Use the following pydantic model json schema for each tool call you will make: 
+    {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']} 
+    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+    <tool_call>
+    {'arguments': <args-dict>, 'name': <function-name>}
+    </tool_call><|im_end|>
+    {{.Input}}
+    <|im_start|>assistant
+    <tool_call>
+  chat: |
+    {{.Input}}
+    <|im_start|>assistant
+  completion: |
+    {{.Input}}
+context_size: 4096
+f16: true
+stopwords:
+- <|im_end|>
+- <dummy32000>
+usage: |
+      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+          "model": "gpt-4",
+          "messages": [{"role": "user", "content": "How are you doing?"}], "temperature": 0.1
+      }'
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -0,0 +1,37 @@
+backend: llama-cpp
+context_size: 4096
+f16: true
+
+gpu_layers: 90
+mmap: true
+name: gpt-4-vision-preview
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: llava-v1.6-7b-mmproj-f16.gguf
+parameters:
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+  seed: -1
+
+template:
+  chat: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+    {{.Input}}
+    ASSISTANT:
+
+download_files:
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4-vision-preview",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}]}], "temperature": 0.9}'
--- a/api/localai/backend_monitor.go
+++ b/api/localai/backend_monitor.go
@@ -1,162 +0,0 @@
-package localai
-
-import (
-	"context"
-	"fmt"
-	"strings"
-
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-
-	"github.com/go-skynet/LocalAI/core/options"
-	"github.com/gofiber/fiber/v2"
-	"github.com/rs/zerolog/log"
-
-	gopsutil "github.com/shirou/gopsutil/v3/process"
-)
-
-type BackendMonitorRequest struct {
-	Model string `json:"model" yaml:"model"`
-}
-
-type BackendMonitorResponse struct {
-	MemoryInfo    *gopsutil.MemoryInfoStat
-	MemoryPercent float32
-	CPUPercent    float64
-}
-
-type BackendMonitor struct {
-	configLoader *config.ConfigLoader
-	options      *options.Option // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
-}
-
-func NewBackendMonitor(configLoader *config.ConfigLoader, options *options.Option) BackendMonitor {
-	return BackendMonitor{
-		configLoader: configLoader,
-		options:      options,
-	}
-}
-
-func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*BackendMonitorResponse, error) {
-	config, exists := bm.configLoader.GetConfig(model)
-	var backend string
-	if exists {
-		backend = config.Model
-	} else {
-		// Last ditch effort: use it raw, see if a backend happens to match.
-		backend = model
-	}
-
-	if !strings.HasSuffix(backend, ".bin") {
-		backend = fmt.Sprintf("%s.bin", backend)
-	}
-
-	pid, err := bm.options.Loader.GetGRPCPID(backend)
-
-	if err != nil {
-		log.Error().Msgf("model %s : failed to find pid %+v", model, err)
-		return nil, err
-	}
-
-	// Name is slightly frightening but this does _not_ create a new process, rather it looks up an existing process by PID.
-	backendProcess, err := gopsutil.NewProcess(int32(pid))
-
-	if err != nil {
-		log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
-		return nil, err
-	}
-
-	memInfo, err := backendProcess.MemoryInfo()
-
-	if err != nil {
-		log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
-		return nil, err
-	}
-
-	memPercent, err := backendProcess.MemoryPercent()
-	if err != nil {
-		log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
-		return nil, err
-	}
-
-	cpuPercent, err := backendProcess.CPUPercent()
-	if err != nil {
-		log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
-		return nil, err
-	}
-
-	return &BackendMonitorResponse{
-		MemoryInfo:    memInfo,
-		MemoryPercent: memPercent,
-		CPUPercent:    cpuPercent,
-	}, nil
-}
-
-func (bm BackendMonitor) getModelLoaderIDFromCtx(c *fiber.Ctx) (string, error) {
-	input := new(BackendMonitorRequest)
-	// Get input data from the request body
-	if err := c.BodyParser(input); err != nil {
-		return "", err
-	}
-
-	config, exists := bm.configLoader.GetConfig(input.Model)
-	var backendId string
-	if exists {
-		backendId = config.Model
-	} else {
-		// Last ditch effort: use it raw, see if a backend happens to match.
-		backendId = input.Model
-	}
-
-	if !strings.HasSuffix(backendId, ".bin") {
-		backendId = fmt.Sprintf("%s.bin", backendId)
-	}
-
-	return backendId, nil
-}
-
-func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-
-		backendId, err := bm.getModelLoaderIDFromCtx(c)
-		if err != nil {
-			return err
-		}
-
-		model := bm.options.Loader.CheckIsLoaded(backendId)
-		if model == "" {
-			return fmt.Errorf("backend %s is not currently loaded", backendId)
-		}
-
-		status, rpcErr := model.GRPC(false, nil).Status(context.TODO())
-		if rpcErr != nil {
-			log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
-			val, slbErr := bm.SampleLocalBackendProcess(backendId)
-			if slbErr != nil {
-				return fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
-			}
-			return c.JSON(proto.StatusResponse{
-				State: proto.StatusResponse_ERROR,
-				Memory: &proto.MemoryUsageData{
-					Total: val.MemoryInfo.VMS,
-					Breakdown: map[string]uint64{
-						"gopsutil-RSS": val.MemoryInfo.RSS,
-					},
-				},
-			})
-		}
-
-		return c.JSON(status)
-	}
-}
-
-func BackendShutdownEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		backendId, err := bm.getModelLoaderIDFromCtx(c)
-		if err != nil {
-			return err
-		}
-
-		return bm.options.Loader.ShutdownModel(backendId)
-	}
-}
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -1,326 +0,0 @@
-package localai
-
-import (
-	"context"
-	"fmt"
-	"os"
-	"slices"
-	"strings"
-	"sync"
-
-	json "github.com/json-iterator/go"
-	"gopkg.in/yaml.v3"
-
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/pkg/gallery"
-	"github.com/go-skynet/LocalAI/pkg/utils"
-
-	"github.com/gofiber/fiber/v2"
-	"github.com/google/uuid"
-	"github.com/rs/zerolog/log"
-)
-
-type galleryOp struct {
-	req         gallery.GalleryModel
-	id          string
-	galleries   []gallery.Gallery
-	galleryName string
-}
-
-type galleryOpStatus struct {
-	FileName           string  `json:"file_name"`
-	Error              error   `json:"error"`
-	Processed          bool    `json:"processed"`
-	Message            string  `json:"message"`
-	Progress           float64 `json:"progress"`
-	TotalFileSize      string  `json:"file_size"`
-	DownloadedFileSize string  `json:"downloaded_size"`
-}
-
-type galleryApplier struct {
-	modelPath string
-	sync.Mutex
-	C        chan galleryOp
-	statuses map[string]*galleryOpStatus
-}
-
-func NewGalleryService(modelPath string) *galleryApplier {
-	return &galleryApplier{
-		modelPath: modelPath,
-		C:         make(chan galleryOp),
-		statuses:  make(map[string]*galleryOpStatus),
-	}
-}
-
-func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {
-
-	config, err := gallery.GetGalleryConfigFromURL(req.URL)
-	if err != nil {
-		return err
-	}
-
-	config.Files = append(config.Files, req.AdditionalFiles...)
-
-	return gallery.InstallModel(modelPath, req.Name, &config, req.Overrides, downloadStatus)
-}
-
-func (g *galleryApplier) updateStatus(s string, op *galleryOpStatus) {
-	g.Lock()
-	defer g.Unlock()
-	g.statuses[s] = op
-}
-
-func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
-	g.Lock()
-	defer g.Unlock()
-
-	return g.statuses[s]
-}
-
-func (g *galleryApplier) getAllStatus() map[string]*galleryOpStatus {
-	g.Lock()
-	defer g.Unlock()
-
-	return g.statuses
-}
-
-func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
-	go func() {
-		for {
-			select {
-			case <-c.Done():
-				return
-			case op := <-g.C:
-				utils.ResetDownloadTimers()
-
-				g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: 0})
-
-				// updates the status with an error
-				updateError := func(e error) {
-					g.updateStatus(op.id, &galleryOpStatus{Error: e, Processed: true, Message: "error: " + e.Error()})
-				}
-
-				// displayDownload displays the download progress
-				progressCallback := func(fileName string, current string, total string, percentage float64) {
-					g.updateStatus(op.id, &galleryOpStatus{Message: "processing", FileName: fileName, Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
-					utils.DisplayDownloadFunction(fileName, current, total, percentage)
-				}
-
-				var err error
-				// if the request contains a gallery name, we apply the gallery from the gallery list
-				if op.galleryName != "" {
-					if strings.Contains(op.galleryName, "@") {
-						err = gallery.InstallModelFromGallery(op.galleries, op.galleryName, g.modelPath, op.req, progressCallback)
-					} else {
-						err = gallery.InstallModelFromGalleryByName(op.galleries, op.galleryName, g.modelPath, op.req, progressCallback)
-					}
-				} else {
-					err = prepareModel(g.modelPath, op.req, cm, progressCallback)
-				}
-
-				if err != nil {
-					updateError(err)
-					continue
-				}
-
-				// Reload models
-				err = cm.LoadConfigs(g.modelPath)
-				if err != nil {
-					updateError(err)
-					continue
-				}
-
-				err = cm.Preload(g.modelPath)
-				if err != nil {
-					updateError(err)
-					continue
-				}
-
-				g.updateStatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
-			}
-		}
-	}()
-}
-
-type galleryModel struct {
-	gallery.GalleryModel `yaml:",inline"` // https://github.com/go-yaml/yaml/issues/63
-	ID                   string           `json:"id"`
-}
-
-func processRequests(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery, requests []galleryModel) error {
-	var err error
-	for _, r := range requests {
-		utils.ResetDownloadTimers()
-		if r.ID == "" {
-			err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
-		} else {
-			if strings.Contains(r.ID, "@") {
-				err = gallery.InstallModelFromGallery(
-					galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
-			} else {
-				err = gallery.InstallModelFromGalleryByName(
-					galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
-			}
-		}
-	}
-	return err
-}
-
-func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
-	dat, err := os.ReadFile(s)
-	if err != nil {
-		return err
-	}
-	var requests []galleryModel
-
-	if err := yaml.Unmarshal(dat, &requests); err != nil {
-		return err
-	}
-
-	return processRequests(modelPath, s, cm, galleries, requests)
-}
-
-func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
-	var requests []galleryModel
-	err := json.Unmarshal([]byte(s), &requests)
-	if err != nil {
-		return err
-	}
-
-	return processRequests(modelPath, s, cm, galleries, requests)
-}
-
-/// Endpoint Service
-
-type ModelGalleryService struct {
-	galleries      []gallery.Gallery
-	modelPath      string
-	galleryApplier *galleryApplier
-}
-
-type GalleryModel struct {
-	ID string `json:"id"`
-	gallery.GalleryModel
-}
-
-func CreateModelGalleryService(galleries []gallery.Gallery, modelPath string, galleryApplier *galleryApplier) ModelGalleryService {
-	return ModelGalleryService{
-		galleries:      galleries,
-		modelPath:      modelPath,
-		galleryApplier: galleryApplier,
-	}
-}
-
-func (mgs *ModelGalleryService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		status := mgs.galleryApplier.getStatus(c.Params("uuid"))
-		if status == nil {
-			return fmt.Errorf("could not find any status for ID")
-		}
-		return c.JSON(status)
-	}
-}
-
-func (mgs *ModelGalleryService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		return c.JSON(mgs.galleryApplier.getAllStatus())
-	}
-}
-
-func (mgs *ModelGalleryService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		input := new(GalleryModel)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		uuid, err := uuid.NewUUID()
-		if err != nil {
-			return err
-		}
-		mgs.galleryApplier.C <- galleryOp{
-			req:         input.GalleryModel,
-			id:          uuid.String(),
-			galleryName: input.ID,
-			galleries:   mgs.galleries,
-		}
-		return c.JSON(struct {
-			ID        string `json:"uuid"`
-			StatusURL string `json:"status"`
-		}{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
-	}
-}
-
-func (mgs *ModelGalleryService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
-
-		models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
-		if err != nil {
-			return err
-		}
-		log.Debug().Msgf("Models found from galleries: %+v", models)
-		for _, m := range models {
-			log.Debug().Msgf("Model found from galleries: %+v", m)
-		}
-		dat, err := json.Marshal(models)
-		if err != nil {
-			return err
-		}
-		return c.Send(dat)
-	}
-}
-
-// NOTE: This is different (and much simpler!) than above! This JUST lists the model galleries that have been loaded, not their contents!
-func (mgs *ModelGalleryService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
-		dat, err := json.Marshal(mgs.galleries)
-		if err != nil {
-			return err
-		}
-		return c.Send(dat)
-	}
-}
-
-func (mgs *ModelGalleryService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		input := new(gallery.Gallery)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-		if slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
-			return gallery.Name == input.Name
-		}) {
-			return fmt.Errorf("%s already exists", input.Name)
-		}
-		dat, err := json.Marshal(mgs.galleries)
-		if err != nil {
-			return err
-		}
-		log.Debug().Msgf("Adding %+v to gallery list", *input)
-		mgs.galleries = append(mgs.galleries, *input)
-		return c.Send(dat)
-	}
-}
-
-func (mgs *ModelGalleryService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		input := new(gallery.Gallery)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-		if !slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
-			return gallery.Name == input.Name
-		}) {
-			return fmt.Errorf("%s is not currently registered", input.Name)
-		}
-		mgs.galleries = slices.DeleteFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
-			return gallery.Name == input.Name
-		})
-		return c.Send(nil)
-	}
-}
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -18,6 +18,48 @@ service Backend {
  rpc TTS(TTSRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
+
+  rpc StoresSet(StoresSetOptions) returns (Result) {}
+  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
+  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
+  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
+}
+
+// StoresKey is a vector of floats used as a key in a store.
+message StoresKey {
+  repeated float Floats = 1;
+}
+
+// StoresValue is the byte payload stored under a key.
+message StoresValue {
+  bytes Bytes = 1;
+}
+
+// StoresSetOptions is the request for StoresSet.
+// The Go store backend pairs Keys[i] with Values[i] and rejects requests
+// where the two lists differ in length.
+message StoresSetOptions {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+}
+
+// StoresDeleteOptions is the request for StoresDelete: the keys to remove.
+message StoresDeleteOptions {
+  repeated StoresKey Keys = 1;
+}
+
+// StoresGetOptions is the request for StoresGet: the keys to look up.
+message StoresGetOptions {
+  repeated StoresKey Keys = 1;
+}
+
+// StoresGetResult is the response of StoresGet: the keys that were found and
+// their values, paired by index.
+message StoresGetResult {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+}
+
+// StoresFindOptions is the request for StoresFind: a query key and the
+// maximum number of results (TopK) to return.
+message StoresFindOptions {
+  StoresKey Key = 1;
+  int32 TopK = 2;
+}
+
+// StoresFindResult is the response of StoresFind. Keys[i], Values[i] and
+// Similarities[i] describe the same match.
+message StoresFindResult {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+  repeated float Similarities = 3;
 }

 message HealthMessage {}
@@ -121,11 +163,16 @@ message ModelOptions {

  bool NoMulMatQ = 37;
  string DraftModel = 39;
-  
+
  string AudioPath = 38;

  // vllm
  string Quantization = 40;
+  float  GPUMemoryUtilization = 50;
+  bool   TrustRemoteCode = 51;
+  bool   EnforceEager = 52;
+  int32  SwapSpace = 53;
+  int32  MaxModelLen = 54;

  string MMProj = 41;

@@ -186,6 +233,7 @@ message TTSRequest {
  string text = 1;
  string model = 2;
  string dst = 3;
+  string voice = 4;
 }

 message TokenizationResponse {
@@ -207,4 +255,4 @@ message StatusResponse {
  }
  State state = 1;
  MemoryUsageData memory = 2;
-}
+}
--- a/backend/cpp/grpc/Makefile
+++ b/backend/cpp/grpc/Makefile
@@ -48,7 +48,7 @@ $(INSTALLED_PACKAGES): grpc_build

 $(GRPC_REPO):
 	git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
-	cd $(GRPC_REPO)/grpc && git submodule update --init --recursive --depth $(GIT_CLONE_DEPTH)
+	cd $(GRPC_REPO)/grpc && git submodule update --jobs 2 --init --recursive --depth $(GIT_CLONE_DEPTH)

 $(GRPC_BUILD): $(GRPC_REPO)
 	mkdir -p $(GRPC_BUILD)
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -12,12 +12,18 @@ ifeq ($(BUILD_TYPE),cublas)
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
-else ifeq ($(BUILD_TYPE),clblast)
+# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
+# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
+# But if it's OSX without metal, disable it here
+else ifeq ($(OS),darwin)
+	ifneq ($(BUILD_TYPE),metal)
+		CMAKE_ARGS+=-DLLAMA_METAL=OFF
+	endif
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
@@ -35,7 +41,7 @@ llama.cpp:
 	fi
 	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1

-llama.cpp/examples/grpc-server:
+llama.cpp/examples/grpc-server: llama.cpp
 	mkdir -p llama.cpp/examples/grpc-server
 	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
 	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -58,9 +58,11 @@ struct server_params
    int32_t read_timeout = 600;
    int32_t write_timeout = 600;
    bool slots_endpoint = true;
+    bool metrics_endpoint = false;
 };

 bool server_verbose = false;
+bool server_log_json = true;

 static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
 {
@@ -316,12 +318,76 @@ struct llama_client_slot
    }

    void print_timings() const {
-        LOG_TEE("\n");
-        LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
-        LOG_TEE("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
-        LOG_TEE("%s:       total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+        // Emits three structured log records: prompt evaluation timing,
+        // generation timing, and the total of both.
+        //
+        // Each human-readable message is formatted into a fixed stack buffer.
+        // snprintf bounds every write to sizeof(buffer), so an unexpectedly
+        // wide value is truncated instead of overflowing the buffer.
+        char buffer[512];
+        double t_token = t_prompt_processing / num_prompt_tokens_processed;
+        double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+        snprintf(buffer, sizeof(buffer), "prompt eval time     = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+                t_prompt_processing, num_prompt_tokens_processed,
+                t_token, n_tokens_second);
+        LOG_INFO(buffer, {
+            {"slot_id",                     id},
+            {"task_id",                     task_id},
+            {"t_prompt_processing",         t_prompt_processing},
+            {"num_prompt_tokens_processed", num_prompt_tokens_processed},
+            {"t_token",                     t_token},
+            {"n_tokens_second",             n_tokens_second},
+        });
+
+        // The same two locals are reused for the generation phase.
+        t_token = t_token_generation / n_decoded;
+        n_tokens_second = 1e3 / t_token_generation * n_decoded;
+        snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)",
+                t_token_generation, n_decoded,
+                t_token, n_tokens_second);
+        LOG_INFO(buffer, {
+            {"slot_id",            id},
+            {"task_id",            task_id},
+            {"t_token_generation", t_token_generation},
+            {"n_decoded",          n_decoded},
+            {"t_token",            t_token},
+            {"n_tokens_second",    n_tokens_second},
+        });
+
+        snprintf(buffer, sizeof(buffer), "          total time = %10.2f ms", t_prompt_processing + t_token_generation);
+        LOG_INFO(buffer, {
+            {"slot_id",             id},
+            {"task_id",             task_id},
+            {"t_prompt_processing", t_prompt_processing},
+            {"t_token_generation",  t_token_generation},
+            {"t_total",             t_prompt_processing + t_token_generation},
+        });
+    }
+};
+
+// Token and timing counters accumulated across slots.
+// The *_total counters only ever grow; the remaining four form a "bucket"
+// that reset_bucket() clears.
+struct llama_metrics {
+    // Running totals, never cleared by reset_bucket().
+    uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t n_tokens_predicted_total        = 0;
+
+    // Prompt-processing counters for the current bucket.
+    uint64_t n_prompt_tokens_processed = 0;
+    uint64_t t_prompt_processing       = 0;
+
+    // Generation counters for the current bucket.
+    uint64_t n_tokens_predicted       = 0;
+    uint64_t t_tokens_generation      = 0;
+
+
+    // Adds the slot's prompt token count and prompt processing time.
+    // NOTE(review): the slot's timing values look like fractional milliseconds
+    // elsewhere in this file; accumulating them into uint64_t would drop the
+    // fractional part on every call - confirm this is acceptable.
+    void on_prompt_eval(const llama_client_slot &slot) {
+        n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
+
+        n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
+        t_prompt_processing       += slot.t_prompt_processing;
+    }
+
+    // Adds the slot's decoded token count and token generation time.
+    void on_prediction(const llama_client_slot &slot) {
+        n_tokens_predicted_total += slot.n_decoded;
+
+        n_tokens_predicted  += slot.n_decoded;
+        t_tokens_generation += slot.t_token_generation;
+    }
+
+    // Clears the bucket counters; the *_total counters are left untouched.
+    void reset_bucket() {
+        n_prompt_tokens_processed = 0;
+        t_prompt_processing       = 0;
+        n_tokens_predicted        = 0;
+        t_tokens_generation       = 0;
    }
 };

@@ -359,6 +425,8 @@ struct llama_server_context
    llama_server_queue queue_tasks;
    llama_server_response queue_results;

+    llama_metrics metrics;
+
    ~llama_server_context()
    {
        if (ctx)
@@ -378,7 +446,7 @@ struct llama_server_context
        params = params_;
        if (!params.mmproj.empty()) {
            multimodal = true;
-            LOG_TEE("Multi Modal Mode Enabled");
+            LOG_INFO("Multi Modal Mode Enabled", {});
            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -415,13 +483,23 @@ struct llama_server_context
        return true;
    }

+    // Probes whether the model's chat template can be applied by formatting a
+    // single dummy user message. On a negative result it logs an error and
+    // overwrites sparams.chat_template with a chatml marker as a fallback.
+    void validate_model_chat_template(server_params & sparams) {
+        llama_chat_message chat[] = {{"user", "test"}};
+        // One-byte scratch buffer: only the return code is inspected here,
+        // the formatted output is not used.
+        std::vector<char> buf(1);
+        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
+        if (res < 0) {
+            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
+        }
+    }
+
    void initialize() {
        // create slots
        all_slots_are_idle = true;

        const int32_t n_ctx_slot = n_ctx / params.n_parallel;

-        LOG_TEE("Available slots:\n");
+        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
        for (int i = 0; i < params.n_parallel; i++)
        {
            llama_client_slot slot;
@@ -430,7 +508,10 @@ struct llama_server_context
            slot.n_ctx = n_ctx_slot;
            slot.n_predict = params.n_predict;

-            LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+            LOG_INFO("new slot", {
+                {"slot_id",    slot.id},
+                {"n_ctx_slot", slot.n_ctx}
+            });

            const int ga_n = params.grp_attn_n;
            const int ga_w = params.grp_attn_w;
@@ -440,7 +521,12 @@ struct llama_server_context
                GGML_ASSERT(ga_w % ga_n == 0            && "ga_w must be a multiple of ga_n");     // NOLINT
                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
-                LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+
+                LOG_INFO("slot self-extend", {
+                    {"slot_id",   slot.id},
+                    {"ga_n",      ga_n},
+                    {"ga_w",      ga_w}
+                });
            }

            slot.ga_i = 0;
@@ -726,10 +812,16 @@ struct llama_server_context
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
-                        LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+                        LOG_ERROR("failed to load image", {
+                            {"slot_id",   slot->id},
+                            {"img_sl_id", img_sl.id}
+                        });
                        return false;
                    }
-                    LOG_TEE("slot %i - loaded image\n", slot->id);
+                    LOG_VERBOSE("image loaded", {
+                        {"slot_id",   slot->id},
+                        {"img_sl_id", img_sl.id}
+                    });
                    img_sl.request_encode_image = true;
                    slot->images.push_back(img_sl);
                }
@@ -789,7 +881,10 @@ struct llama_server_context

        all_slots_are_idle = false;

-        LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
+        LOG_INFO("slot is processing task", {
+            {"slot_id", slot->id},
+            {"task_id", slot->task_id},
+        });

        return true;
    }
@@ -814,10 +909,24 @@ struct llama_server_context
                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
            }

-            if (llama_decode(ctx, batch) != 0)
+            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
            {
-                LOG_TEE("%s: llama_decode() failed\n", __func__);
-                return;
+                const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
+                llama_batch batch_view = {
+                    n_tokens,
+                    batch.token    + i,
+                    nullptr,
+                    batch.pos      + i,
+                    batch.n_seq_id + i,
+                    batch.seq_id   + i,
+                    batch.logits   + i,
+                    0, 0, 0, // unused
+                };
+                if (llama_decode(ctx, batch_view) != 0)
+                {
+                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    return;
+                }
            }

            // assign the system KV cache to all parallel sequences
@@ -975,7 +1084,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }

-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
+        if (result.tok == llama_token_eos(model))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
@@ -1351,7 +1460,7 @@ struct llama_server_context
                if (slot == nullptr)
                {
                    // if no slot is available, we defer this task for processing later
-                    LOG_VERBOSE("no slot is available", {});
+                    LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
                    queue_tasks.defer(task);
                    break;
                }
@@ -1425,7 +1534,7 @@ struct llama_server_context
    bool update_slots() {
        if (system_need_update)
        {
-            LOG_TEE("updating system prompt\n");
+            LOG_INFO("updating system prompt", {});
            update_system_prompt();
        }

@@ -1435,12 +1544,13 @@ struct llama_server_context
        {
            if (system_prompt.empty() && clean_kv_cache)
            {
-                LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
+                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
                kv_cache_clear();
            }
            return true;
        }

+        LOG_VERBOSE("posting NEXT_RESPONSE", {});
        task_server task;
        task.type = TASK_TYPE_NEXT_RESPONSE;
        task.target_id = -1;
@@ -1471,6 +1581,7 @@ struct llama_server_context
        }

        // decode any currently ongoing sequences
+        LOG_VERBOSE("decoding ongoing sequences", {});
        for (auto & slot : slots)
        {
            // release the slot
@@ -1480,7 +1591,15 @@ struct llama_server_context
                slot.command = NONE;
                slot.t_last_used = ggml_time_us();

-                LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                LOG_INFO("slot released", {
+                    {"slot_id",         slot.id},
+                    {"task_id",         slot.task_id},
+                    {"n_ctx",           n_ctx},
+                    {"n_past",          slot.n_past},
+                    {"n_system_tokens", system_tokens.size()},
+                    {"n_cache_tokens",  slot.cache_tokens.size()},
+                    {"truncated",       slot.truncated}
+                });
                queue_tasks.notify_slot_changed();

                continue;
@@ -1607,6 +1726,14 @@ struct llama_server_context
                        }

                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+
+                        // the last token of the cache is not in the KV cache until the next call to llama_decode
+                        // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
+                        if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
+                        {
+                            slot.n_past -= 1;
+                        }
+
                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;

                        if (slot.ga_n != 1)
@@ -1628,7 +1755,12 @@ struct llama_server_context
                            slot.ga_i = ga_i;
                        }

-                        LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+                        LOG_INFO("slot progression", {
+                            { "slot_id", slot.id },
+                            { "task_id", slot.task_id },
+                            { "n_past",  slot.n_past },
+                            { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
+                        });
                    }

                    slot.cache_tokens = prompt_tokens;
@@ -1636,7 +1768,10 @@ struct llama_server_context
                    if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
                    {
                        // we have to evaluate at least 1 token to generate logits.
-                        LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
+                        LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+                            { "slot_id", slot.id },
+                            { "task_id", slot.task_id }
+                        });
                        slot.n_past--;
                        if (slot.ga_i > 0)
                        {
@@ -1644,9 +1779,13 @@ struct llama_server_context
                        }
                    }

-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+                    int p0 = (int) system_tokens.size() + slot.n_past;
+                    LOG_INFO("kv cache rm [p0, end)", {
+                        { "slot_id", slot.id },
+                        { "task_id", slot.task_id },
+                        { "p0",      p0 }
+                    });
+                    llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);

                    LOG_VERBOSE("prompt ingested", {
                                                    {"n_past",  slot.n_past},
@@ -1681,7 +1820,13 @@ struct llama_server_context

                    if (has_images && !ingest_images(slot, n_batch))
                    {
-                        LOG_TEE("failed processing images\n");
+                        LOG_ERROR("failed processing images", {
+                            "slot_id", slot.id,
+                            "task_id", slot.task_id,
+                        });
+                        // FIXME @phymbert: to be properly tested
+                        //  early returning without changing the slot state will block the slot for ever
+                        // no one at the moment is checking the return value
                        return false;
                    }

@@ -1723,9 +1868,9 @@ struct llama_server_context
                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);

-                        llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
+                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
-                        llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
+                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);

                        slot.n_past_se -= bd;

@@ -1781,7 +1926,7 @@ struct llama_server_context
                    send_embedding(slot);
                    slot.release();
                    slot.i_batch = -1;
-                    return true;
+                    continue;
                }

                completion_token_output result;
@@ -1794,6 +1939,7 @@ struct llama_server_context
                {
                    slot.t_start_genereration = ggml_time_us();
                    slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slot);
                }

                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
@@ -1816,11 +1962,14 @@ struct llama_server_context
                    slot.release();
                    slot.print_timings();
                    send_final_response(slot);
+                    metrics.on_prediction(slot);
                }

                slot.i_batch = -1;
            }
        }
+
+        LOG_VERBOSE("slots updated", {});
        return true;
    }

@@ -1849,18 +1998,6 @@ static json format_partial_response(
    return res;
 }

-static json format_tokenizer_response(const std::vector<llama_token> &tokens)
-{
-    return json{
-        {"tokens", tokens}};
-}
-
-static json format_detokenized_response(std::string content)
-{
-    return json{
-        {"content", content}};
-}
-
 struct token_translator
 {
    llama_context * ctx;
@@ -2119,9 +2256,9 @@ static void params_parse(const backend::ModelOptions* request,
    params.use_mmap = request->mmap();
    params.embedding = request->embeddings();

-    if (request->ropescaling() == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
-    else if (request->ropescaling() == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
-    else { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+    if (request->ropescaling() == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+    else if (request->ropescaling() == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+    else { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
    if ( request->yarnextfactor() != 0.0f ) {
        params.yarn_ext_factor = request->yarnextfactor();
    }
--- a/backend/go/stores/debug.go
+++ b/backend/go/stores/debug.go
@@ -0,0 +1,14 @@
+//go:build debug
+// +build debug
+
+package main
+
+import (
+	"github.com/rs/zerolog/log"
+)
+
+// assert logs msg through zerolog's Fatal-level logger, with a stack field
+// requested, when cond is false. This variant is compiled only when the
+// "debug" build tag is set.
+func assert(cond bool, msg string) {
+	if !cond {
+		log.Fatal().Stack().Msg(msg)
+	}
+}
--- a/backend/go/stores/main.go
+++ b/backend/go/stores/main.go
@@ -0,0 +1,26 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each store
+
+import (
+	"flag"
+	"os"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	"github.com/rs/zerolog"
+	"github.com/rs/zerolog/log"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+// main switches the global logger to console output on stderr, parses the
+// command-line flags, and hands a new Store to grpc.StartServer together with
+// the -addr value. It panics if StartServer returns an error.
+func main() {
+	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
+
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, NewStore()); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/stores/production.go
+++ b/backend/go/stores/production.go
@@ -0,0 +1,7 @@
+//go:build !debug
+// +build !debug
+
+package main
+
+// assert is a no-op in builds without the "debug" tag: cond and msg are
+// ignored. Callers still evaluate both arguments before the call, so any
+// formatting done to build msg is paid for even though nothing is checked.
+func assert(cond bool, msg string) {
+}
--- a/backend/go/stores/store.go
+++ b/backend/go/stores/store.go
@@ -0,0 +1,507 @@
+package main
+
+// This is a wrapper to satisfy the gRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"container/heap"
+	"fmt"
+	"math"
+	"slices"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+	"github.com/rs/zerolog/log"
+)
+
+// Store is an in-memory key/value store whose keys are float32 vectors and
+// whose values are byte slices. Keys and values are held in two parallel
+// slices ordered by key: values[i] belongs to keys[i].
+type Store struct {
+	// NOTE(review): presumably makes the gRPC layer serialize calls into the
+	// store; confirm against the base package.
+	base.SingleThread
+
+	// The sorted keys
+	keys [][]float32
+	// The sorted values
+	values [][]byte
+
+	// If for every K it holds that ||k||^2 = 1, then we can use the normalized distance functions
+	// TODO: Should we normalize incoming keys if they are not instead?
+	keysAreNormalized bool
+	// The first key decides the length of the keys
+	// (-1 until a key length has been recorded)
+	keyLen int
+}
+
+// Pair couples a key with its value so that incoming entries can be sorted
+// together before they are merged into the store's parallel slices.
+//
+// TODO: Only used for sorting using Go's builtin implementation. The interfaces are columnar because
+// that's theoretically best for memory layout and cache locality, but this isn't optimized yet.
+type Pair struct {
+	Key   []float32
+	Value []byte
+}
+
+// NewStore returns an empty Store: no keys or values, keys assumed normalized
+// until a non-normalized one is seen, and no key length recorded yet (-1).
+func NewStore() *Store {
+	store := new(Store)
+	store.keys = [][]float32{}
+	store.values = [][]byte{}
+	store.keysAreNormalized = true
+	store.keyLen = -1
+
+	return store
+}
+
+// compareSlices orders two keys lexicographically, returning a negative
+// number, zero, or a positive number as k1 sorts before, equal to, or after
+// k2. Both keys are expected to have the same length (checked by assert).
+func compareSlices(k1, k2 []float32) int {
+	n1, n2 := len(k1), len(k2)
+	assert(n1 == n2, fmt.Sprintf("compareSlices: len(k1) = %d, len(k2) = %d", n1, n2))
+
+	return slices.Compare(k1, k2)
+}
+
+// hasKey reports whether target occurs anywhere in unsortedSlice. It scans
+// linearly and therefore makes no assumption about ordering.
+func hasKey(unsortedSlice [][]float32, target []float32) bool {
+	for _, candidate := range unsortedSlice {
+		if compareSlices(candidate, target) == 0 {
+			return true
+		}
+	}
+
+	return false
+}
+
+// findInSortedSlice binary-searches sortedSlice for target. It returns the
+// index where target is, or would be inserted, and whether it was found.
+func findInSortedSlice(sortedSlice [][]float32, target []float32) (int, bool) {
+	return slices.BinarySearchFunc(sortedSlice, target, compareSlices)
+}
+
+// isSortedPairs reports whether kvs is in non-decreasing key order.
+func isSortedPairs(kvs []Pair) bool {
+	if len(kvs) < 2 {
+		return true
+	}
+
+	// prev indexes the element immediately before cur in kvs.
+	for prev, cur := range kvs[1:] {
+		if compareSlices(kvs[prev].Key, cur.Key) > 0 {
+			return false
+		}
+	}
+
+	return true
+}
+
+// isSortedKeys reports whether keys is in non-decreasing order.
+func isSortedKeys(keys [][]float32) bool {
+	if len(keys) == 0 {
+		return true
+	}
+
+	prev := keys[0]
+	for _, cur := range keys[1:] {
+		if compareSlices(prev, cur) > 0 {
+			return false
+		}
+		prev = cur
+	}
+
+	return true
+}
+
+// sortIntoKeySlicese extracts the float slices from keys and returns them in
+// sorted order. The returned slices share storage with the input messages.
+func sortIntoKeySlicese(keys []*pb.StoresKey) [][]float32 {
+	sorted := make([][]float32, 0, len(keys))
+	for _, key := range keys {
+		sorted = append(sorted, key.Floats)
+	}
+
+	slices.SortFunc(sorted, compareSlices)
+
+	assert(len(sorted) == len(keys), fmt.Sprintf("len(ks) = %d, len(keys) = %d", len(sorted), len(keys)))
+	assert(isSortedKeys(sorted), "keys are not sorted")
+
+	return sorted
+}
+
+// Load ignores opts and always returns nil: the store keeps everything in
+// memory and has nothing to load.
+func (s *Store) Load(opts *pb.ModelOptions) error {
+	return nil
+}
+
+// StoresSet inserts the key/value pairs in opts into the store, replacing the
+// value of any key that is already present.
+//
+// The incoming pairs are sorted by key and merged with the existing sorted
+// contents in a single pass. When a key occurs more than once in opts, the
+// last occurrence wins.
+//
+// It returns an error, before modifying the store, when opts has no keys,
+// when the number of keys and values differ, or when any key's length differs
+// from the store's key length (fixed by the first key ever added).
+func (s *Store) StoresSet(opts *pb.StoresSetOptions) error {
+	if len(opts.Keys) == 0 {
+		return fmt.Errorf("no keys to add")
+	}
+
+	if len(opts.Keys) != len(opts.Values) {
+		return fmt.Errorf("len(keys) = %d, len(values) = %d", len(opts.Keys), len(opts.Values))
+	}
+
+	// Validate every key, not just the first, and do it before touching any
+	// state: a single key of the wrong length would break the comparisons and
+	// the similarity computation.
+	keyLen := s.keyLen
+	if keyLen == -1 {
+		keyLen = len(opts.Keys[0].Floats)
+	}
+	for _, k := range opts.Keys {
+		if len(k.Floats) != keyLen {
+			return fmt.Errorf("Try to add key with length %d when existing length is %d", len(k.Floats), keyLen)
+		}
+	}
+	s.keyLen = keyLen
+
+	kvs := make([]Pair, len(opts.Keys))
+
+	for i, k := range opts.Keys {
+		if s.keysAreNormalized && !isNormalized(k.Floats) {
+			s.keysAreNormalized = false
+			// Log at most the first five components. The bound has to come
+			// from the key itself; bounding by the number of stored keys
+			// could slice past the end of a short key.
+			sample := k.Floats[:min(5, len(k.Floats))]
+			log.Debug().Msgf("Key is not normalized: %v", sample)
+		}
+
+		kvs[i] = Pair{
+			Key:   k.Floats,
+			Value: opts.Values[i].Bytes,
+		}
+	}
+
+	// Stable, so that equal keys keep their request order for the
+	// last-occurrence-wins rule below.
+	slices.SortStableFunc(kvs, func(a, b Pair) int {
+		return compareSlices(a.Key, b.Key)
+	})
+
+	assert(len(kvs) == len(opts.Keys), fmt.Sprintf("len(kvs) = %d, len(opts.Keys) = %d", len(kvs), len(opts.Keys)))
+	assert(isSortedPairs(kvs), "keys are not sorted")
+
+	// Collapse runs of equal keys in place, keeping only the last pair of
+	// each run, so the store never holds the same key twice.
+	unique := kvs[:0]
+	for i := range kvs {
+		if i+1 < len(kvs) && compareSlices(kvs[i].Key, kvs[i+1].Key) == 0 {
+			continue
+		}
+		unique = append(unique, kvs[i])
+	}
+	kvs = unique
+
+	// Upper bound on the merged size; it is smaller when existing keys are
+	// overwritten.
+	l := len(kvs) + len(s.keys)
+	mergedKeys := make([][]float32, 0, l)
+	mergedValues := make([][]byte, 0, l)
+
+	i, j := 0, 0
+	for i < len(kvs) && j < len(s.keys) {
+		switch c := compareSlices(kvs[i].Key, s.keys[j]); {
+		case c < 0:
+			mergedKeys = append(mergedKeys, kvs[i].Key)
+			mergedValues = append(mergedValues, kvs[i].Value)
+			i++
+		case c > 0:
+			mergedKeys = append(mergedKeys, s.keys[j])
+			mergedValues = append(mergedValues, s.values[j])
+			j++
+		default:
+			// Same key: the incoming value replaces the stored one.
+			mergedKeys = append(mergedKeys, kvs[i].Key)
+			mergedValues = append(mergedValues, kvs[i].Value)
+			i++
+			j++
+		}
+	}
+
+	// At most one of the two inputs still has elements left.
+	for ; i < len(kvs); i++ {
+		mergedKeys = append(mergedKeys, kvs[i].Key)
+		mergedValues = append(mergedValues, kvs[i].Value)
+	}
+	mergedKeys = append(mergedKeys, s.keys[j:]...)
+	mergedValues = append(mergedValues, s.values[j:]...)
+
+	assert(len(mergedKeys) <= l, fmt.Sprintf("len(merge_ks) = %d, l = %d", len(mergedKeys), l))
+	assert(isSortedKeys(mergedKeys), "merge keys are not sorted")
+
+	s.keys = mergedKeys
+	s.values = mergedValues
+
+	return nil
+}
+
+// StoresDelete removes the keys in opts from the store.
+//
+// Keys that are not present are skipped; a debug message reports when fewer
+// keys than requested were removed. It returns an error when opts has no keys
+// or when any key's length differs from the store's key length.
+func (s *Store) StoresDelete(opts *pb.StoresDeleteOptions) error {
+	if len(opts.Keys) == 0 {
+		return fmt.Errorf("no keys to delete")
+	}
+
+	// No key length has been recorded yet, so there is nothing to delete.
+	// Do not adopt the length of the requested keys here: that would make a
+	// later insert of keys with a different length fail.
+	if s.keyLen == -1 {
+		log.Debug().Msgf("Delete: No keys in store")
+		return nil
+	}
+
+	for _, k := range opts.Keys {
+		if len(k.Floats) != s.keyLen {
+			return fmt.Errorf("Trying to delete key with length %d when existing length is %d", len(k.Floats), s.keyLen)
+		}
+	}
+
+	ks := sortIntoKeySlicese(opts.Keys)
+
+	// Expected size after deletion. More keys may be requested than are
+	// stored, so clamp at zero: make panics on a negative capacity.
+	l := max(len(s.keys)-len(ks), 0)
+	mergedKeys := make([][]float32, 0, l)
+	mergedValues := make([][]byte, 0, l)
+
+	// Both lists are sorted, so each search only needs the part of the store
+	// after the previous hit; everything before a hit is kept.
+	tailKeys := s.keys
+	tailValues := s.values
+	for _, k := range ks {
+		j, found := findInSortedSlice(tailKeys, k)
+
+		if found {
+			mergedKeys = append(mergedKeys, tailKeys[:j]...)
+			mergedValues = append(mergedValues, tailValues[:j]...)
+			tailKeys = tailKeys[j+1:]
+			tailValues = tailValues[j+1:]
+		} else {
+			assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: t=%d, %v", len(tailKeys), k))
+		}
+
+		log.Debug().Msgf("Delete: found = %v, t = %d, j = %d, len(merge_ks) = %d, len(merge_vs) = %d", found, len(tailKeys), j, len(mergedKeys), len(mergedValues))
+	}
+
+	mergedKeys = append(mergedKeys, tailKeys...)
+	mergedValues = append(mergedValues, tailValues...)
+
+	assert(len(mergedKeys) <= len(s.keys), fmt.Sprintf("len(merge_ks) = %d, len(s.keys) = %d", len(mergedKeys), len(s.keys)))
+
+	s.keys = mergedKeys
+	s.values = mergedValues
+
+	assert(len(s.keys) >= l, fmt.Sprintf("len(s.keys) = %d, l = %d", len(s.keys), l))
+	assert(isSortedKeys(s.keys), "keys are not sorted")
+	assert(func() bool {
+		for _, k := range ks {
+			if _, found := findInSortedSlice(s.keys, k); found {
+				return false
+			}
+		}
+		return true
+	}(), "Keys to delete still present")
+
+	if len(s.keys) != l {
+		log.Debug().Msgf("Delete: Some keys not found: len(s.keys) = %d, l = %d", len(s.keys), l)
+	}
+
+	return nil
+}
+
+// StoresGet looks up the keys in opts and returns the ones that are present
+// together with their values, in sorted key order. Keys that are not in the
+// store are omitted from the result.
+//
+// It returns an error when opts has no keys or when any key's length differs
+// from the store's key length. A lookup never changes the store, so querying
+// an empty store does not fix the key length.
+func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error) {
+	// Guard first: an empty request has no first key to inspect.
+	if len(opts.Keys) == 0 {
+		return pb.StoresGetResult{}, fmt.Errorf("no keys to get")
+	}
+
+	if len(s.keys) == 0 {
+		log.Debug().Msgf("Get: No keys in store")
+	}
+
+	// Only validate against a key length that has actually been recorded.
+	if s.keyLen != -1 {
+		for _, k := range opts.Keys {
+			if len(k.Floats) != s.keyLen {
+				return pb.StoresGetResult{}, fmt.Errorf("Try to get a key with length %d when existing length is %d", len(k.Floats), s.keyLen)
+			}
+		}
+	}
+
+	pbKeys := make([]*pb.StoresKey, 0, len(opts.Keys))
+	pbValues := make([]*pb.StoresValue, 0, len(opts.Keys))
+	ks := sortIntoKeySlicese(opts.Keys)
+
+	// Both lists are sorted, so each search only needs the part of the store
+	// after the previous hit.
+	tailKeys := s.keys
+	tailValues := s.values
+	for i, k := range ks {
+		j, found := findInSortedSlice(tailKeys, k)
+
+		if found {
+			pbKeys = append(pbKeys, &pb.StoresKey{
+				Floats: k,
+			})
+			pbValues = append(pbValues, &pb.StoresValue{
+				Bytes: tailValues[j],
+			})
+
+			tailKeys = tailKeys[j+1:]
+			tailValues = tailValues[j+1:]
+		} else {
+			assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: i=%d, %v", i, k))
+		}
+	}
+
+	if len(pbKeys) != len(opts.Keys) {
+		log.Debug().Msgf("Get: Some keys not found: len(pbKeys) = %d, len(opts.Keys) = %d, len(s.Keys) = %d", len(pbKeys), len(opts.Keys), len(s.keys))
+	}
+
+	return pb.StoresGetResult{
+		Keys:   pbKeys,
+		Values: pbValues,
+	}, nil
+}
+
+// isNormalized reports whether k is a unit vector, i.e. whether its Euclidean
+// (L2) length is 1.
+//
+// The length is accumulated in float64 and compared with a small tolerance,
+// because vectors normalized in float32 arithmetic almost never have a length
+// of exactly 1. An empty vector has length 0 and is not normalized.
+func isNormalized(k []float32) bool {
+	const tolerance = 1e-3
+
+	var sumSq float64
+	for _, v := range k {
+		x := float64(v)
+		sumSq += x * x
+	}
+
+	return math.Abs(math.Sqrt(sumSq)-1) <= tolerance
+}
+
+// normalizedCosineSimilarity returns the dot product of k1 and k2, which is
+// their cosine similarity when both vectors have unit length. Both slices must
+// have the same length.
+//
+// TODO: This we could replace with handwritten SIMD code
+func normalizedCosineSimilarity(k1, k2 []float32) float32 {
+	assert(len(k1) == len(k2), fmt.Sprintf("normalizedCosineSimilarity: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
+
+	var dot float32
+	for i, v := range k1 {
+		dot += v * k2[i]
+	}
+
+	// float32 rounding can push the dot product of two unit vectors marginally
+	// outside [-1, 1], so the sanity check allows a small tolerance.
+	assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
+
+	// 2.0 * (1.0 - dot) would be the squared Euclidean distance
+	return dot
+}
+
+// PriorityItem is one candidate of a similarity search: a stored key, its
+// value, and the similarity of that key to the query.
+type PriorityItem struct {
+	Similarity float32
+	Key        []float32
+	Value      []byte
+}
+
+// PriorityQueue implements heap.Interface as a min-heap ordered by
+// Similarity: the least similar item sits at the top. That is what a top-K
+// search needs, since the item to evict once the queue grows past K is the
+// least similar one.
+type PriorityQueue []*PriorityItem
+
+// Len returns the number of queued items.
+func (pq PriorityQueue) Len() int { return len(pq) }
+
+// Less orders items by ascending similarity, so the heap root is the least
+// similar item.
+func (pq PriorityQueue) Less(i, j int) bool {
+	return pq[i].Similarity < pq[j].Similarity
+}
+
+// Swap exchanges the items at positions i and j.
+func (pq PriorityQueue) Swap(i, j int) {
+	pq[i], pq[j] = pq[j], pq[i]
+}
+
+// Push appends x, which must be a *PriorityItem. It is meant to be called by
+// the heap package (heap.Push), not directly.
+func (pq *PriorityQueue) Push(x any) {
+	item := x.(*PriorityItem)
+	*pq = append(*pq, item)
+}
+
+// Pop removes and returns the last item. It is meant to be called by the heap
+// package (heap.Pop), which moves the root to the end first.
+func (pq *PriorityQueue) Pop() any {
+	old := *pq
+	n := len(old)
+	item := old[n-1]
+	// Clear the vacated slot so the backing array does not keep the popped
+	// item (and its key/value slices) reachable.
+	old[n-1] = nil
+	*pq = old[0 : n-1]
+	return item
+}
+
+// StoresFindNormalized returns the opts.TopK stored entries most similar to
+// opts.Key, scoring every stored key with normalizedCosineSimilarity (a plain
+// dot product). The result is ordered from most to least similar.
+func (s *Store) StoresFindNormalized(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
+	query := opts.Key.Floats
+	limit := int(opts.TopK)
+
+	best := make(PriorityQueue, 0, limit)
+	heap.Init(&best)
+
+	for idx := range s.keys {
+		candidate := &PriorityItem{
+			Similarity: normalizedCosineSimilarity(query, s.keys[idx]),
+			Key:        s.keys[idx],
+			Value:      s.values[idx],
+		}
+		heap.Push(&best, candidate)
+
+		// Keep at most limit candidates: heap.Pop drops the least similar one.
+		if best.Len() > limit {
+			heap.Pop(&best)
+		}
+	}
+
+	count := best.Len()
+	similarities := make([]float32, count)
+	pbKeys := make([]*pb.StoresKey, count)
+	pbValues := make([]*pb.StoresValue, count)
+
+	// The heap hands items out in ascending similarity, so the output slices
+	// are filled from the back to end up in descending order.
+	for slot := count - 1; slot >= 0; slot-- {
+		entry := heap.Pop(&best).(*PriorityItem)
+
+		similarities[slot] = entry.Similarity
+		pbKeys[slot] = &pb.StoresKey{Floats: entry.Key}
+		pbValues[slot] = &pb.StoresValue{Bytes: entry.Value}
+	}
+
+	return pb.StoresFindResult{
+		Keys:         pbKeys,
+		Values:       pbValues,
+		Similarities: similarities,
+	}, nil
+}
+
+// cosineSimilarity returns the cosine of the angle between k1 and k2.
+//
+// mag1 is the Euclidean length of k1; it is passed in so a caller comparing
+// one k1 against many k2 computes it only once. Both slices must have the same
+// length. If either vector has zero length the quotient is NaN and the range
+// assertion fails.
+func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
+	assert(len(k1) == len(k2), fmt.Sprintf("cosineSimilarity: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
+
+	var dot, mag2 float64
+	for i := range k1 {
+		// Widen before multiplying: forming the product in float32 first would
+		// round (or overflow) it before it reaches the float64 accumulator.
+		a, b := float64(k1[i]), float64(k2[i])
+		dot += a * b
+		mag2 += b * b
+	}
+
+	sim := float32(dot / (mag1 * math.Sqrt(mag2)))
+
+	// Allow a small tolerance for rounding in mag1 and in the float32
+	// conversion of the result.
+	assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
+
+	return sim
+}
+
+// StoresFindFallback returns the opts.TopK stored entries most similar to
+// opts.Key using full cosine similarity, i.e. without assuming that either the
+// query or the stored keys have unit length. The result is ordered from most
+// to least similar.
+func (s *Store) StoresFindFallback(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
+	tk := opts.Key.Floats
+	top_ks := make(PriorityQueue, 0, int(opts.TopK))
+	heap.Init(&top_ks)
+
+	// The query's length is the same for every comparison, so compute it once.
+	// Widen each component before squaring so the square is formed in float64.
+	var mag1 float64
+	for _, v := range tk {
+		x := float64(v)
+		mag1 += x * x
+	}
+	mag1 = math.Sqrt(mag1)
+
+	for i, k := range s.keys {
+		sim := cosineSimilarity(tk, k, mag1)
+		heap.Push(&top_ks, &PriorityItem{
+			Similarity: sim,
+			Key:        k,
+			Value:      s.values[i],
+		})
+
+		// Keep at most TopK candidates: heap.Pop drops the least similar one.
+		if top_ks.Len() > int(opts.TopK) {
+			heap.Pop(&top_ks)
+		}
+	}
+
+	similarities := make([]float32, top_ks.Len())
+	pbKeys := make([]*pb.StoresKey, top_ks.Len())
+	pbValues := make([]*pb.StoresValue, top_ks.Len())
+
+	// The heap hands items out in ascending similarity, so the output slices
+	// are filled from the back to end up in descending order.
+	for i := top_ks.Len() - 1; i >= 0; i-- {
+		item := heap.Pop(&top_ks).(*PriorityItem)
+
+		similarities[i] = item.Similarity
+		pbKeys[i] = &pb.StoresKey{
+			Floats: item.Key,
+		}
+		pbValues[i] = &pb.StoresValue{
+			Bytes: item.Value,
+		}
+	}
+
+	return pb.StoresFindResult{
+		Keys:         pbKeys,
+		Values:       pbValues,
+		Similarities: similarities,
+	}, nil
+}
+
+// StoresFind returns the opts.TopK stored entries most similar to opts.Key.
+//
+// The query must have the same length as the store's keys and opts.TopK must
+// be at least 1; otherwise an error is returned. On a store whose key length
+// is still unset (-1) the length check rejects every query.
+//
+// When the stored keys are flagged as normalized and the query is normalized
+// too, the dot-product path (StoresFindNormalized) is used; in every other
+// case the search falls back to full cosine similarity (StoresFindFallback).
+func (s *Store) StoresFind(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
+	tk := opts.Key.Floats
+
+	if len(tk) != s.keyLen {
+		return pb.StoresFindResult{}, fmt.Errorf("Try to find key with length %d when existing length is %d", len(tk), s.keyLen)
+	}
+
+	if opts.TopK < 1 {
+		return pb.StoresFindResult{}, fmt.Errorf("opts.TopK = %d, must be >= 1", opts.TopK)
+	}
+
+	if s.keysAreNormalized && isNormalized(tk) {
+		return s.StoresFindNormalized(opts)
+	}
+
+	if s.keysAreNormalized {
+		// Log at most the first five components of the query. The bound is
+		// taken from the query itself so short keys cannot be over-sliced.
+		sample := tk
+		if len(sample) > 5 {
+			sample = sample[:5]
+		}
+		log.Debug().Msgf("Trying to compare non-normalized key with normalized keys: %v", sample)
+	}
+
+	return s.StoresFindFallback(opts)
+}
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -11,21 +11,21 @@ import (
 	"github.com/go-skynet/LocalAI/core/schema"
 )

-func sh(c string) (string, error) {
-	cmd := exec.Command("/bin/sh", "-c", c)
+// runCommand executes command[0] with the remaining elements as its arguments,
+// passing on the current process environment, and returns the combined
+// stdout/stderr output. command must contain at least one element.
+func runCommand(command []string) (string, error) {
+	cmd := exec.Command(command[0], command[1:]...)
 	cmd.Env = os.Environ()
-	o, err := cmd.CombinedOutput()
-	return string(o), err
+	out, err := cmd.CombinedOutput()
+	return string(out), err
 }

-// AudioToWav converts audio to wav for transcribe. It bashes out to ffmpeg
+// audioToWav converts audio to wav for transcribe by running ffmpeg directly.
 // TODO: use https://github.com/mccoyst/ogg?
 func audioToWav(src, dst string) error {
-	out, err := sh(fmt.Sprintf("ffmpeg -i %s -format s16le -ar 16000 -ac 1 -acodec pcm_s16le %s", src, dst))
+	command := []string{"ffmpeg", "-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
+	out, err := runCommand(command)
 	if err != nil {
 		return fmt.Errorf("error: %w out: %s", err, out)
 	}
-
 	return nil
 }

--- a/backend/python/autogptq/autogptq.py
+++ b/backend/python/autogptq/autogptq.py
@@ -33,7 +33,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            model = AutoGPTQForCausalLM.from_quantized(request.Model,
                    model_basename=request.ModelBaseName,
                    use_safetensors=True,
-                    trust_remote_code=True,
+                    trust_remote_code=request.TrustRemoteCode,
                    device=device,
                    use_triton=request.UseTriton,
                    quantize_config=None)
--- a/backend/python/autogptq/autogptq.yml
+++ b/backend/python/autogptq/autogptq.yml
@@ -71,7 +71,7 @@ dependencies:
      - regex==2023.10.3
      - requests==2.31.0
      - rouge==1.0.1
-      - safetensors==0.3.3
+      - safetensors>=0.3.3
      - six==1.16.0
      - sympy==1.12
      - tokenizers==0.14.0
--- a/backend/python/autogptq/backend_pb2.py
+++ b/backend/python/autogptq/backend_pb2.py
--- a/backend/python/bark/backend_pb2.py
+++ b/backend/python/bark/backend_pb2.py
--- a/backend/python/common-env/transformers/Makefile
+++ b/backend/python/common-env/transformers/Makefile
@@ -8,6 +8,13 @@ ifeq ($(BUILD_TYPE), hipblas)
 	CONDA_ENV_PATH = "transformers-rocm.yml"
 endif

+# Intel GPU are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: transformers
 transformers:
 	@echo "Installing $(CONDA_ENV_PATH)..."
--- a/backend/python/common-env/transformers/install.sh
+++ b/backend/python/common-env/transformers/install.sh
@@ -1,24 +1,38 @@
 #!/bin/bash
 set -ex

+SKIP_CONDA=${SKIP_CONDA:-0}
+
 # Check if environment exist
 conda_env_exists(){
    ! conda list --name "${@}" >/dev/null 2>/dev/null
 }

-if conda_env_exists "transformers" ; then
-    echo "Creating virtual environment..."
-    conda env create --name transformers --file $1
-    echo "Virtual environment created."
-else 
-    echo "Virtual environment already exists."
+if [ $SKIP_CONDA -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "transformers" ; then
+        echo "Creating virtual environment..."
+        conda env create --name transformers --file $1
+        echo "Virtual environment created."
+    else 
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ -d "/opt/intel" ]; then
+    # Intel GPU: If the directory exists, we assume we are using the intel image
+    # (no conda env)
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed
 fi

 if [ "$PIP_CACHE_PURGE" = true ] ; then
-    export PATH=$PATH:/opt/conda/bin
-
-    # Activate conda environment
-    source activate transformers
+    if [ $SKIP_CONDA -eq 0 ]; then
+        # Activate conda environment
+        source activate transformers
+    fi

    pip cache purge
 fi
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@@ -30,13 +30,14 @@ dependencies:
      - async-timeout==4.0.3
      - attrs==23.1.0
      - bark==0.1.5
+      - bitsandbytes==0.43.0
      - boto3==1.28.61
      - botocore==1.31.61
      - certifi==2023.7.22
      - TTS==0.22.0
      - charset-normalizer==3.3.0
      - datasets==2.14.5
-      - sentence-transformers==2.2.2
+      - sentence-transformers==2.5.1 # Updated Version
      - sentencepiece==0.1.99
      - dill==0.3.7
      - einops==0.7.0
@@ -81,8 +82,8 @@ dependencies:
      - requests==2.31.0
      - rouge==1.0.1
      - s3transfer==0.7.0
-      - safetensors==0.3.3
-      - scipy==1.11.3
+      - safetensors>=0.4.1
+      - scipy==1.12.0 # Updated Version
      - six==1.16.0
      - sympy==1.12
      - tokenizers
@@ -113,7 +114,7 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.2.7
-      - transformers>=4.36.0  # Required for Mixtral.
+      - vllm==0.3.2
+      - transformers>=4.38.2  # Updated Version
      - xformers==0.0.23.post1  
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers-rocm.yml
+++ b/backend/python/common-env/transformers/transformers-rocm.yml
@@ -38,7 +38,7 @@ dependencies:
      - TTS==0.22.0
      - charset-normalizer==3.3.0
      - datasets==2.14.5
-      - sentence-transformers==2.2.2
+      - sentence-transformers==2.5.1 # Updated Version
      - sentencepiece==0.1.99
      - dill==0.3.7
      - einops==0.7.0
@@ -71,8 +71,8 @@ dependencies:
      - requests==2.31.0
      - rouge==1.0.1
      - s3transfer==0.7.0
-      - safetensors==0.3.3
-      - scipy==1.11.3
+      - safetensors>=0.4.1
+      - scipy==1.12.0 # Updated Version
      - six==1.16.0
      - sympy==1.12
      - tokenizers
@@ -103,7 +103,7 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.2.7
-      - transformers>=4.36.0 # Required for Mixtral.
+      - vllm==0.3.2
+      - transformers>=4.38.2  # Updated Version
      - xformers==0.0.23.post1
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers.yml
+++ b/backend/python/common-env/transformers/transformers.yml
@@ -36,7 +36,7 @@ dependencies:
      - TTS==0.22.0
      - charset-normalizer==3.3.0
      - datasets==2.14.5
-      - sentence-transformers==2.2.2
+      - sentence-transformers==2.5.1 # Updated Version
      - sentencepiece==0.1.99
      - dill==0.3.7
      - einops==0.7.0
@@ -69,8 +69,8 @@ dependencies:
      - requests==2.31.0
      - rouge==1.0.1
      - s3transfer==0.7.0
-      - safetensors==0.3.3
-      - scipy==1.11.3
+      - safetensors>=0.4.1
+      - scipy==1.12.0 # Updated Version
      - six==1.16.0
      - sympy==1.12
      - tokenizers
@@ -101,7 +101,7 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.2.7
-      - transformers>=4.36.0  # Required for Mixtral.
+      - vllm==0.3.2
+      - transformers>=4.38.2  # Updated Version
      - xformers==0.0.23.post1  
-prefix: /opt/conda/envs/transformers
+prefix: /opt/conda/envs/transformers
--- a/backend/python/coqui/backend_pb2.py
+++ b/backend/python/coqui/backend_pb2.py
--- a/backend/python/diffusers/Makefile
+++ b/backend/python/diffusers/Makefile
@@ -4,6 +4,13 @@ ifeq ($(BUILD_TYPE), hipblas)
 export CONDA_ENV_PATH = "diffusers-rocm.yml"
 endif

+# Intel GPU are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: diffusers
 diffusers:
 	@echo "Installing $(CONDA_ENV_PATH)..."
--- a/backend/python/diffusers/backend_diffusers.py
+++ b/backend/python/diffusers/backend_diffusers.py
@@ -21,14 +21,15 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipelin
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image,export_to_video
-from compel import Compel
+from compel import Compel, ReturnedEmbeddingsType

 from transformers import CLIPTextModel
 from safetensors.torch import load_file


 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
-COMPEL=os.environ.get("COMPEL", "1") == "1"
+COMPEL=os.environ.get("COMPEL", "0") == "1"
+XPU=os.environ.get("XPU", "0") == "1"
 CLIPSKIP=os.environ.get("CLIPSKIP", "1") == "1"
 SAFETENSORS=os.environ.get("SAFETENSORS", "1") == "1"
 CHUNK_SIZE=os.environ.get("CHUNK_SIZE", "8")
@@ -36,6 +37,10 @@ FPS=os.environ.get("FPS", "7")
 DISABLE_CPU_OFFLOAD=os.environ.get("DISABLE_CPU_OFFLOAD", "0") == "1"
 FRAMES=os.environ.get("FRAMES", "64")

+if XPU:
+    import intel_extension_for_pytorch as ipex
+    print(ipex.xpu.get_device_name(0))
+
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

@@ -231,8 +236,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            if request.SchedulerType != "":
                self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config)
                
-            if not self.img2vid:
-                self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
+            if COMPEL:
+                self.compel = Compel(
+                    tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2 ], 
+                    text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2],
+                    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+                    requires_pooled=[False, True]
+                    )


            if request.ControlNet:
@@ -247,6 +257,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                self.pipe.to('cuda')
                if self.controlnet:
                    self.controlnet.to('cuda')
+            if XPU:
+                self.pipe = self.pipe.to("xpu")
            # Assume directory from request.ModelFile.
            # Only if request.LoraAdapter it's not an absolute path
            if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter:
@@ -386,8 +398,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        image = {}
        if COMPEL:
-            conditioning = self.compel.build_conditioning_tensor(prompt)
-            kwargs["prompt_embeds"]= conditioning
+            conditioning, pooled = self.compel.build_conditioning_tensor(prompt)
+            kwargs["prompt_embeds"] = conditioning
+            kwargs["pooled_prompt_embeds"] = pooled
            # pass the kwargs dictionary to the self.pipe method
            image = self.pipe(
                guidance_scale=self.cfg_scale,
--- a/backend/python/diffusers/backend_pb2.py
+++ b/backend/python/diffusers/backend_pb2.py
--- a/backend/python/diffusers/install.sh
+++ b/backend/python/diffusers/install.sh
@@ -1,24 +1,50 @@
 #!/bin/bash
 set -ex

+SKIP_CONDA=${SKIP_CONDA:-0}
+
 # Check if environment exist
 conda_env_exists(){
    ! conda list --name "${@}" >/dev/null 2>/dev/null
 }

-if conda_env_exists "diffusers" ; then
-    echo "Creating virtual environment..."
-    conda env create --name diffusers --file $1
-    echo "Virtual environment created."
-else 
-    echo "Virtual environment already exists."
+if [ $SKIP_CONDA -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "diffusers" ; then
+        echo "Creating virtual environment..."
+        conda env create --name diffusers --file $1
+        echo "Virtual environment created."
+    else 
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ -d "/opt/intel" ]; then
+    # Intel GPU: If the directory exists, we assume we are using the Intel image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    pip install torch==2.1.0a0 \
+                torchvision==0.16.0a0 \
+                torchaudio==2.1.0a0 \
+                intel-extension-for-pytorch==2.1.10+xpu \
+                --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+
+    # The transformers specifier must be quoted: an unquoted ">" is a shell
+    # redirection, which would drop the version constraint and send pip's
+    # output to a file named "=4.25.1".
+    pip install google-api-python-client \
+                grpcio \
+                grpcio-tools \
+                diffusers==0.24.0 \
+                "transformers>=4.25.1" \
+                accelerate \
+                compel==2.0.2 \
+                Pillow
 fi

 if [ "$PIP_CACHE_PURGE" = true ] ; then
-    export PATH=$PATH:/opt/conda/bin
-
-    # Activate conda environment
-    source activate diffusers
+    if [ $SKIP_CONDA -ne 1 ]; then
+        # Activate conda environment
+        source activate diffusers
+    fi

    pip cache purge
 fi
--- a/backend/python/diffusers/run.sh
+++ b/backend/python/diffusers/run.sh
@@ -3,10 +3,15 @@
 ##
 ## A bash script wrapper that runs the diffusers server with conda

-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate diffusers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate diffusers
+fi

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -1,7 +1,8 @@
+export CONDA_ENV_PATH = "exllama.yml"
+
 .PHONY: exllama
 exllama:
-	$(MAKE) -C ../common-env/transformers
-	bash install.sh
+	bash install.sh ${CONDA_ENV_PATH}

 .PHONY: run
 run:
--- a/backend/python/exllama/backend_pb2.py
+++ b/backend/python/exllama/backend_pb2.py
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -1,14 +1,27 @@
 #!/bin/bash
+set -ex

-##
-## A bash script installs the required dependencies of VALL-E-X and prepares the environment
 export PATH=$PATH:/opt/conda/bin

-# Activate conda environment
-source activate transformers
+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[exllama] Attention!!! Nvidia GPU is required - skipping installation"
+    exit 0
+fi

-echo $CONDA_PREFIX
+# Check if environment exist
+conda_env_exists(){
+    ! conda list --name "${@}" >/dev/null 2>/dev/null
+}

+if conda_env_exists "exllama" ; then
+    echo "Creating virtual environment..."
+    conda env create --name exllama --file $1
+    echo "Virtual environment created."
+else
+    echo "Virtual environment already exists."
+fi
+
+source activate exllama

 git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd

--- a/backend/python/exllama/run.sh
+++ b/backend/python/exllama/run.sh
@@ -2,11 +2,10 @@

 ##
 ## A bash script wrapper that runs the exllama server with conda
-
 export PATH=$PATH:/opt/conda/bin

 # Activate conda environment
-source activate transformers
+source activate exllama

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/exllama2/backend_pb2.py
+++ b/backend/python/exllama2/backend_pb2.py
--- a/backend/python/exllama2/install.sh
+++ b/backend/python/exllama2/install.sh
@@ -2,10 +2,14 @@
 set -e
 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin
 export SHA=c0ddebaaaf8ffd1b3529c2bb654e650bce2f790f

-# Activate conda environment
+if [ "$BUILD_TYPE" != "cublas" ]; then
+    echo "[exllamav2] Attention!!! Nvidia GPU is required - skipping installation"
+    exit 0
+fi
+
+export PATH=$PATH:/opt/conda/bin
 source activate transformers

 echo $CONDA_PREFIX
--- a/backend/python/mamba/backend_pb2.py
+++ b/backend/python/mamba/backend_pb2.py
--- a/backend/python/mamba/install.sh
+++ b/backend/python/mamba/install.sh
@@ -2,13 +2,14 @@
 set -e
 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin

 if [ "$BUILD_TYPE" != "cublas" ]; then
    echo "[mamba] Attention!!! nvcc is required - skipping installation"
    exit 0
 fi

+export PATH=$PATH:/opt/conda/bin
+
 # Activate conda environment
 source activate transformers

--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@@ -1,7 +1,7 @@
 .PHONY: petals
 petals:
 	@echo "Creating virtual environment..."
-	@conda env create --name petals --file petals.yml
+	bash install.sh "petals.yml"
 	@echo "Virtual environment created."

 .PHONY: run
--- a/backend/python/petals/backend_pb2.py
+++ b/backend/python/petals/backend_pb2.py
--- a/backend/python/petals/install.sh
+++ b/backend/python/petals/install.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+## Creates the "petals" conda environment from the environment file given as
+## the first argument.
+
+export PATH=$PATH:/opt/conda/bin
+
+# Quote the path so a file name with spaces or glob characters stays one word.
+conda env create --name petals --file "$1"
--- a/backend/python/sentencetransformers/backend_pb2.py
+++ b/backend/python/sentencetransformers/backend_pb2.py
--- a/backend/python/transformers-musicgen/backend_pb2.py
+++ b/backend/python/transformers-musicgen/backend_pb2.py
--- a/backend/python/transformers/backend_pb2.py
+++ b/backend/python/transformers/backend_pb2.py
--- a/backend/python/transformers/run.sh
+++ b/backend/python/transformers/run.sh
@@ -3,10 +3,16 @@
 ##
 ## A bash script wrapper that runs the transformers server with conda

-export PATH=$PATH:/opt/conda/bin

-# Activate conda environment
-source activate transformers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate transformers
+fi

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@@ -16,7 +16,15 @@ import backend_pb2_grpc
 import grpc
 import torch
 import torch.cuda
-from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+
+XPU=os.environ.get("XPU", "0") == "1"
+if XPU:
+    import intel_extension_for_pytorch as ipex
+    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
+    from transformers import AutoTokenizer, AutoModel, set_seed
+else:
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
+

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -67,22 +75,60 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            A Result object that contains the result of the LoadModel operation.
        """
        model_name = request.Model
+
+        compute = "auto"
+        if request.F16Memory == True:
+            compute=torch.bfloat16
+
+        self.CUDA = request.CUDA
+
+        device_map="cpu"
+
+        quantization = None
+
+        if self.CUDA:
+            if request.Device:
+                device_map=request.Device
+            else:
+                device_map="cuda:0"
+            if request.Quantization == "bnb_4bit":
+                quantization = BitsAndBytesConfig(
+                    load_in_4bit = True,
+                    bnb_4bit_compute_dtype = compute,
+                    bnb_4bit_quant_type = "nf4",
+                    bnb_4bit_use_double_quant = True,
+                    load_in_8bit = False,
+                )
+            elif request.Quantization == "bnb_8bit":
+                quantization = BitsAndBytesConfig(
+                    load_in_4bit=False,
+                    bnb_4bit_compute_dtype = None,
+                    load_in_8bit=True,                                   
+                )
+                                                   
+    
        try:
            if request.Type == "AutoModelForCausalLM":
-                self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+                if XPU:
+                    if quantization == "xpu_4bit":
+                        xpu_4bit = True
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
+                                              device_map="xpu", load_in_4bit=xpu_4bit)
+                else:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
            else:
-                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
+                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,  use_safetensors=True,  quantization_config=quantization, device_map=device_map, torch_dtype=compute)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
+            self.XPU = False

-            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-            self.CUDA = False
-
-            if request.CUDA or torch.cuda.is_available():
+            if XPU:
+                self.XPU = True
                try:
-                    print("Loading model", model_name, "to CUDA.", file=sys.stderr)
-                    self.model = self.model.to("cuda")
-                    self.CUDA = True
+                    print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
+                    self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu")
                except Exception as err:
-                    print("Not using CUDA:", err, file=sys.stderr)
+                    print("Not using XPU:", err, file=sys.stderr)
+
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
@@ -109,13 +155,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")    

        # Create word embeddings
-        model_output = self.model(**encoded_input)
+        if self.CUDA:
+            encoded_input = encoded_input.to("cuda")
+
+        with torch.no_grad():    
+            model_output = self.model(**encoded_input)

        # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
-        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
+        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
        print("Embeddings:", sentence_embeddings, file=sys.stderr)
-        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
+        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])

    def Predict(self, request, context):
        """
@@ -139,13 +189,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
        if self.CUDA:
            inputs = inputs.to("cuda")
+        if XPU:
+            inputs = inputs.to("xpu")

-        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
-
-        generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
-        # Remove prompt from response if present
-        if request.Prompt in generated_text:
-            generated_text = generated_text.replace(request.Prompt, "")
+        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
+        generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]

        return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

--- a/backend/python/vall-e-x/Makefile
+++ b/backend/python/vall-e-x/Makefile
@@ -1,3 +1,7 @@
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
 .PHONY: ttsvalle
 ttsvalle:
 	$(MAKE) -C ../common-env/transformers
--- a/backend/python/vall-e-x/backend_pb2.py
+++ b/backend/python/vall-e-x/backend_pb2.py
--- a/backend/python/vall-e-x/install.sh
+++ b/backend/python/vall-e-x/install.sh
@@ -2,13 +2,16 @@

 ##
 ## A bash script installs the required dependencies of VALL-E-X and prepares the environment
-export PATH=$PATH:/opt/conda/bin
 export SHA=3faaf8ccadb154d63b38070caf518ce9309ea0f4

-# Activate conda environment
-source activate transformers
+SKIP_CONDA=${SKIP_CONDA:-0}

-echo $CONDA_PREFIX
+# conda lives in /opt/conda/bin, so it has to be on PATH before the
+# environment can be activated.
+export PATH=$PATH:/opt/conda/bin
+
+if [ $SKIP_CONDA -ne 1 ]; then
+    source activate transformers
+else
+    # No conda environment: use the current directory as the checkout prefix.
+    CONDA_PREFIX=$PWD
+fi

 git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && popd

--- a/backend/python/vall-e-x/ttsvalle.yml
+++ b/backend/python/vall-e-x/ttsvalle.yml
@@ -79,7 +79,7 @@ dependencies:
      - pypinyin==0.49.0
      - python-multipart==0.0.6
      - regex==2023.10.3
-      - safetensors==0.4.0
+      - safetensors>=0.4.0
      - semantic-version==2.10.0
      - soundfile==0.12.1
      - starlette==0.27.0
--- a/backend/python/vllm/backend_pb2.py
+++ b/backend/python/vllm/backend_pb2.py
--- a/backend/python/vllm/backend_vllm.py
+++ b/backend/python/vllm/backend_vllm.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
+import asyncio
 from concurrent import futures
-import time
 import argparse
 import signal
 import sys
@@ -10,7 +10,10 @@ import backend_pb2
 import backend_pb2_grpc

 import grpc
-from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -79,16 +82,30 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        Returns:
            backend_pb2.Result: The load model result.
        """
+        engine_args = AsyncEngineArgs(
+            model=request.Model,
+        )
+
+        if request.Quantization != "":
+            engine_args.quantization = request.Quantization
+        if request.GPUMemoryUtilization != 0:
+            engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
+        if request.TrustRemoteCode:
+            engine_args.trust_remote_code = request.TrustRemoteCode
+        if request.EnforceEager:
+            engine_args.enforce_eager = request.EnforceEager
+        if request.SwapSpace != 0:
+            engine_args.swap_space = request.SwapSpace
+        if request.MaxModelLen != 0:
+            engine_args.max_model_len = request.MaxModelLen
+
        try:
-            if request.Quantization != "":
-                self.llm = LLM(model=request.Model, quantization=request.Quantization)
-            else:
-                self.llm = LLM(model=request.Model)
+            self.llm = AsyncLLMEngine.from_engine_args(engine_args)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

-    def Predict(self, request, context):
+    async def Predict(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters.

@@ -99,24 +116,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        Returns:
            backend_pb2.Reply: The predict result.
        """
-        if request.TopP == 0:
-            request.TopP = 0.9
+        gen = self._predict(request, context, streaming=False)
+        res = await gen.__anext__()
+        return res

-        max_tokens = 200
-        if request.Tokens > 0:
-            max_tokens = request.Tokens
-
-        sampling_params = SamplingParams(max_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
-        outputs = self.llm.generate([request.Prompt], sampling_params)
-
-        generated_text = outputs[0].outputs[0].text
-        # Remove prompt from response if present
-        if request.Prompt in generated_text:
-            generated_text = generated_text.replace(request.Prompt, "")
-
-        return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
-
-    def PredictStream(self, request, context):
+    async def PredictStream(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters, and streams the results.

@@ -127,30 +131,84 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        Returns:
            backend_pb2.Result: The predict stream result.
        """
-        yield self.Predict(request, context)
+        iterations = self._predict(request, context, streaming=True)
+        try:
+            async for iteration in iterations:
+                yield iteration
+        finally:
+            await iterations.aclose()

-def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    async def _predict(self, request, context, streaming=False):
+        """
+        Async generator that runs one generation request against self.llm.
+
+        Args:
+            request: The predict request; Prompt and the sampling fields read below are used.
+            context: The gRPC context (not used by this method).
+            streaming: When True, yield one Reply per engine iteration carrying only the
+                text added since the previous iteration. When False, yield a single Reply
+                with the last iteration's text once the engine output is exhausted.
+
+        Yields:
+            backend_pb2.Reply: The generated text encoded as UTF-8 bytes.
+        """
+
+        # Build sampling parameters
+        # top_p=0.9 and max_tokens=200 are the fallbacks; each request field below only
+        # overrides the value already on sampling_params when it is non-zero / non-empty / true.
+        sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
+        if request.TopP != 0:
+            sampling_params.top_p = request.TopP
+        if request.Tokens > 0:
+            sampling_params.max_tokens = request.Tokens
+        if request.Temperature != 0:
+            sampling_params.temperature = request.Temperature
+        if request.TopK != 0:
+            sampling_params.top_k = request.TopK
+        if request.PresencePenalty != 0:
+            sampling_params.presence_penalty = request.PresencePenalty
+        if request.FrequencyPenalty != 0:
+            sampling_params.frequency_penalty = request.FrequencyPenalty
+        if request.StopPrompts:
+            sampling_params.stop = request.StopPrompts
+        if request.IgnoreEOS:
+            sampling_params.ignore_eos = request.IgnoreEOS
+        if request.Seed != 0:
+            sampling_params.seed = request.Seed
+
+        # Generate text
+        # Each call gets a fresh request id from random_uuid().
+        request_id = random_uuid()
+        outputs = self.llm.generate(request.Prompt, sampling_params, request_id)
+
+        # Stream the results
+        generated_text = ""
+        try:
+            async for request_output in outputs:
+                # Only the first candidate output of each iteration is used.
+                iteration_text = request_output.outputs[0].text
+
+                if streaming:
+                    # Remove text already sent as vllm concatenates the text from previous yields
+                    delta_iteration_text = iteration_text.removeprefix(generated_text)
+                    # Send the partial result
+                    yield backend_pb2.Reply(message=bytes(delta_iteration_text, encoding='utf-8'))
+
+                # Keep track of text generated
+                generated_text = iteration_text
+        finally:
+            # Close the engine's output generator however the loop above exits.
+            await outputs.aclose()
+
+        # If streaming, we already sent everything
+        if streaming:
+            return
+
+        # Sending the final generated text
+        yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
+
+async def serve(address):
+    # Start asyncio gRPC server
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    # Add the servicer to the server
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    # Bind the server to the address
    server.add_insecure_port(address)
-    server.start()
+
+    # Gracefully shutdown the server on SIGTERM or SIGINT
+    loop = asyncio.get_event_loop()
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(
+            sig, lambda: asyncio.ensure_future(server.stop(5))
+        )
+
+    # Start the server
+    await server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
-
-    # Define the signal handler function
-    def signal_handler(sig, frame):
-        print("Received termination signal. Shutting down...")
-        server.stop(0)
-        sys.exit(0)
-
-    # Set the signal handlers for SIGINT and SIGTERM
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.signal(signal.SIGTERM, signal_handler)
-
-    try:
-        while True:
-            time.sleep(_ONE_DAY_IN_SECONDS)
-    except KeyboardInterrupt:
-        server.stop(0)
+    # Wait for the server to be terminated
+    await server.wait_for_termination()

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
@@ -159,4 +217,4 @@ if __name__ == "__main__":
    )
    args = parser.parse_args()

-    serve(args.addr)
+    asyncio.run(serve(args.addr))
--- a/configuration/.keep
+++ b/configuration/.keep
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -3,36 +3,32 @@ package backend
 import (
 	"fmt"

-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/config"
+
 	"github.com/go-skynet/LocalAI/pkg/grpc"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.Config, o *options.Option) (func() ([]float32, error), error) {
-	if !c.Embeddings {
-		return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
-	}
+func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
+	modelFile := backendConfig.Model

-	modelFile := c.Model
-
-	grpcOpts := gRPCModelOpts(c)
+	grpcOpts := gRPCModelOpts(backendConfig)

 	var inferenceModel interface{}
 	var err error

-	opts := modelOpts(c, o, []model.Option{
+	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-		model.WithThreads(uint32(c.Threads)),
-		model.WithAssetDir(o.AssetsDestination),
+		model.WithThreads(uint32(*backendConfig.Threads)),
+		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithModel(modelFile),
-		model.WithContext(o.Context),
+		model.WithContext(appConfig.Context),
 	})

-	if c.Backend == "" {
+	if backendConfig.Backend == "" {
 		inferenceModel, err = loader.GreedyLoader(opts...)
 	} else {
-		opts = append(opts, model.WithBackendString(c.Backend))
+		opts = append(opts, model.WithBackendString(backendConfig.Backend))
 		inferenceModel, err = loader.BackendLoader(opts...)
 	}
 	if err != nil {
@@ -43,7 +39,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
 	switch model := inferenceModel.(type) {
 	case grpc.Backend:
 		fn = func() ([]float32, error) {
-			predictOptions := gRPCPredictOpts(c, loader.ModelPath)
+			predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
 			if len(tokens) > 0 {
 				embeds := []int32{}

@@ -52,7 +48,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
 				}
 				predictOptions.EmbeddingTokens = embeds

-				res, err := model.Embeddings(o.Context, predictOptions)
+				res, err := model.Embeddings(appConfig.Context, predictOptions)
 				if err != nil {
 					return nil, err
 				}
@@ -61,7 +57,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
 			}
 			predictOptions.Embeddings = s

-			res, err := model.Embeddings(o.Context, predictOptions)
+			res, err := model.Embeddings(appConfig.Context, predictOptions)
 			if err != nil {
 				return nil, err
 			}
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -1,34 +1,25 @@
 package backend

 import (
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/config"
+
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
-
-	opts := modelOpts(c, o, []model.Option{
-		model.WithBackendString(c.Backend),
-		model.WithAssetDir(o.AssetsDestination),
-		model.WithThreads(uint32(c.Threads)),
-		model.WithContext(o.Context),
-		model.WithModel(c.Model),
-		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
-			CUDA:          c.CUDA || c.Diffusers.CUDA,
-			SchedulerType: c.Diffusers.SchedulerType,
-			PipelineType:  c.Diffusers.PipelineType,
-			CFGScale:      c.Diffusers.CFGScale,
-			LoraAdapter:   c.LoraAdapter,
-			LoraScale:     c.LoraScale,
-			LoraBase:      c.LoraBase,
-			IMG2IMG:       c.Diffusers.IMG2IMG,
-			CLIPModel:     c.Diffusers.ClipModel,
-			CLIPSubfolder: c.Diffusers.ClipSubFolder,
-			CLIPSkip:      int32(c.Diffusers.ClipSkip),
-			ControlNet:    c.Diffusers.ControlNet,
-		}),
+func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
+	threads := backendConfig.Threads
+	if *threads == 0 && appConfig.Threads != 0 {
+		threads = &appConfig.Threads
+	}
+	gRPCOpts := gRPCModelOpts(backendConfig)
+	opts := modelOpts(backendConfig, appConfig, []model.Option{
+		model.WithBackendString(backendConfig.Backend),
+		model.WithAssetDir(appConfig.AssetsDestination),
+		model.WithThreads(uint32(*threads)),
+		model.WithContext(appConfig.Context),
+		model.WithModel(backendConfig.Model),
+		model.WithLoadGRPCLoadModelOpts(gRPCOpts),
 	})

 	inferenceModel, err := loader.BackendLoader(
@@ -40,19 +31,19 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat

 	fn := func() error {
 		_, err := inferenceModel.GenerateImage(
-			o.Context,
+			appConfig.Context,
 			&proto.GenerateImageRequest{
 				Height:           int32(height),
 				Width:            int32(width),
 				Mode:             int32(mode),
 				Step:             int32(step),
 				Seed:             int32(seed),
-				CLIPSkip:         int32(c.Diffusers.ClipSkip),
+				CLIPSkip:         int32(backendConfig.Diffusers.ClipSkip),
 				PositivePrompt:   positive_prompt,
 				NegativePrompt:   negative_prompt,
 				Dst:              dst,
 				Src:              src,
-				EnableParameters: c.Diffusers.EnableParameters,
+				EnableParameters: backendConfig.Diffusers.EnableParameters,
 			})
 		return err
 	}
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -8,8 +8,8 @@ import (
 	"sync"
 	"unicode/utf8"

-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/config"
+
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/grpc"
 	model "github.com/go-skynet/LocalAI/pkg/model"
@@ -26,9 +26,12 @@ type TokenUsage struct {
 	Completion int
 }

-func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
-
+	threads := c.Threads
+	if *threads == 0 && o.Threads != 0 {
+		threads = &o.Threads
+	}
 	grpcOpts := gRPCModelOpts(c)

 	var inferenceModel grpc.Backend
@@ -36,7 +39,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode

 	opts := modelOpts(c, o, []model.Option{
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
-		model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
+		model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithModel(modelFile),
 		model.WithContext(o.Context),
@@ -140,7 +143,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
 var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
 var mu sync.Mutex = sync.Mutex{}

-func Finetune(config config.Config, input, prediction string) string {
+func Finetune(config config.BackendConfig, input, prediction string) string {
 	if config.Echo {
 		prediction = input + prediction
 	}
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -4,19 +4,17 @@ import (
 	"os"
 	"path/filepath"

+	"github.com/go-skynet/LocalAI/core/config"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
-
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
 )

-func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.Option {
-	if o.SingleBackend {
+func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
+	if so.SingleBackend {
 		opts = append(opts, model.WithSingleActiveBackend())
 	}

-	if o.ParallelBackendRequests {
+	if so.ParallelBackendRequests {
 		opts = append(opts, model.EnableParallelRequests)
 	}

@@ -28,52 +26,65 @@ func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.
 		opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
 	}

-	for k, v := range o.ExternalGRPCBackends {
+	for k, v := range so.ExternalGRPCBackends {
 		opts = append(opts, model.WithExternalBackend(k, v))
 	}

 	return opts
 }

-func gRPCModelOpts(c config.Config) *pb.ModelOptions {
+func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	b := 512
 	if c.Batch != 0 {
 		b = c.Batch
 	}

 	return &pb.ModelOptions{
-		ContextSize:    int32(c.ContextSize),
-		Seed:           int32(c.Seed),
-		NBatch:         int32(b),
-		NoMulMatQ:      c.NoMulMatQ,
-		CUDA:           c.CUDA, // diffusers, transformers
-		DraftModel:     c.DraftModel,
-		AudioPath:      c.VallE.AudioPath,
-		Quantization:   c.Quantization,
-		MMProj:         c.MMProj,
-		YarnExtFactor:  c.YarnExtFactor,
-		YarnAttnFactor: c.YarnAttnFactor,
-		YarnBetaFast:   c.YarnBetaFast,
-		YarnBetaSlow:   c.YarnBetaSlow,
-		LoraAdapter:    c.LoraAdapter,
-		LoraBase:       c.LoraBase,
-		LoraScale:      c.LoraScale,
-		NGQA:           c.NGQA,
-		RMSNormEps:     c.RMSNormEps,
-		F16Memory:      c.F16,
-		MLock:          c.MMlock,
-		RopeFreqBase:   c.RopeFreqBase,
-		RopeScaling:    c.RopeScaling,
-		Type:           c.ModelType,
-		RopeFreqScale:  c.RopeFreqScale,
-		NUMA:           c.NUMA,
-		Embeddings:     c.Embeddings,
-		LowVRAM:        c.LowVRAM,
-		NGPULayers:     int32(c.NGPULayers),
-		MMap:           c.MMap,
-		MainGPU:        c.MainGPU,
-		Threads:        int32(c.Threads),
-		TensorSplit:    c.TensorSplit,
+		CUDA:                 c.CUDA || c.Diffusers.CUDA,
+		SchedulerType:        c.Diffusers.SchedulerType,
+		PipelineType:         c.Diffusers.PipelineType,
+		CFGScale:             c.Diffusers.CFGScale,
+		LoraAdapter:          c.LoraAdapter,
+		LoraScale:            c.LoraScale,
+		F16Memory:            *c.F16,
+		LoraBase:             c.LoraBase,
+		IMG2IMG:              c.Diffusers.IMG2IMG,
+		CLIPModel:            c.Diffusers.ClipModel,
+		CLIPSubfolder:        c.Diffusers.ClipSubFolder,
+		CLIPSkip:             int32(c.Diffusers.ClipSkip),
+		ControlNet:           c.Diffusers.ControlNet,
+		ContextSize:          int32(*c.ContextSize),
+		Seed:                 int32(*c.Seed),
+		NBatch:               int32(b),
+		NoMulMatQ:            c.NoMulMatQ,
+		DraftModel:           c.DraftModel,
+		AudioPath:            c.VallE.AudioPath,
+		Quantization:         c.Quantization,
+		GPUMemoryUtilization: c.GPUMemoryUtilization,
+		TrustRemoteCode:      c.TrustRemoteCode,
+		EnforceEager:         c.EnforceEager,
+		SwapSpace:            int32(c.SwapSpace),
+		MaxModelLen:          int32(c.MaxModelLen),
+		MMProj:               c.MMProj,
+		YarnExtFactor:        c.YarnExtFactor,
+		YarnAttnFactor:       c.YarnAttnFactor,
+		YarnBetaFast:         c.YarnBetaFast,
+		YarnBetaSlow:         c.YarnBetaSlow,
+		NGQA:                 c.NGQA,
+		RMSNormEps:           c.RMSNormEps,
+		MLock:                *c.MMlock,
+		RopeFreqBase:         c.RopeFreqBase,
+		RopeScaling:          c.RopeScaling,
+		Type:                 c.ModelType,
+		RopeFreqScale:        c.RopeFreqScale,
+		NUMA:                 c.NUMA,
+		Embeddings:           c.Embeddings,
+		LowVRAM:              *c.LowVRAM,
+		NGPULayers:           int32(*c.NGPULayers),
+		MMap:                 *c.MMap,
+		MainGPU:              c.MainGPU,
+		Threads:              int32(*c.Threads),
+		TensorSplit:          c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
 		Device:           c.AutoGPTQ.Device,
@@ -84,43 +95,44 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 	}
 }

-func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
+func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOptions {
 	promptCachePath := ""
 	if c.PromptCachePath != "" {
 		p := filepath.Join(modelPath, c.PromptCachePath)
 		os.MkdirAll(filepath.Dir(p), 0755)
 		promptCachePath = p
 	}
+
 	return &pb.PredictOptions{
-		Temperature:         float32(c.Temperature),
-		TopP:                float32(c.TopP),
+		Temperature:         float32(*c.Temperature),
+		TopP:                float32(*c.TopP),
 		NDraft:              c.NDraft,
-		TopK:                int32(c.TopK),
-		Tokens:              int32(c.Maxtokens),
-		Threads:             int32(c.Threads),
+		TopK:                int32(*c.TopK),
+		Tokens:              int32(*c.Maxtokens),
+		Threads:             int32(*c.Threads),
 		PromptCacheAll:      c.PromptCacheAll,
 		PromptCacheRO:       c.PromptCacheRO,
 		PromptCachePath:     promptCachePath,
-		F16KV:               c.F16,
-		DebugMode:           c.Debug,
+		F16KV:               *c.F16,
+		DebugMode:           *c.Debug,
 		Grammar:             c.Grammar,
 		NegativePromptScale: c.NegativePromptScale,
 		RopeFreqBase:        c.RopeFreqBase,
 		RopeFreqScale:       c.RopeFreqScale,
 		NegativePrompt:      c.NegativePrompt,
-		Mirostat:            int32(c.LLMConfig.Mirostat),
-		MirostatETA:         float32(c.LLMConfig.MirostatETA),
-		MirostatTAU:         float32(c.LLMConfig.MirostatTAU),
-		Debug:               c.Debug,
+		Mirostat:            int32(*c.LLMConfig.Mirostat),
+		MirostatETA:         float32(*c.LLMConfig.MirostatETA),
+		MirostatTAU:         float32(*c.LLMConfig.MirostatTAU),
+		Debug:               *c.Debug,
 		StopPrompts:         c.StopWords,
 		Repeat:              int32(c.RepeatPenalty),
 		NKeep:               int32(c.Keep),
 		Batch:               int32(c.Batch),
 		IgnoreEOS:           c.IgnoreEOS,
-		Seed:                int32(c.Seed),
+		Seed:                int32(*c.Seed),
 		FrequencyPenalty:    float32(c.FrequencyPenalty),
-		MLock:               c.MMlock,
-		MMap:                c.MMap,
+		MLock:               *c.MMlock,
+		MMap:                *c.MMap,
 		MainGPU:             c.MainGPU,
 		TensorSplit:         c.TensorSplit,
 		TailFreeSamplingZ:   float32(c.TFZ),
--- a/core/backend/stores.go
+++ b/core/backend/stores.go
@@ -0,0 +1,23 @@
+package backend
+
+import (
+	"github.com/go-skynet/LocalAI/core/config"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc"
+	"github.com/go-skynet/LocalAI/pkg/model"
+)
+
+// StoreBackend returns the gRPC backend for the store named storeName.
+//
+// It calls sl.BackendLoader with the model.LocalStoreBackend backend string,
+// appConfig.AssetsDestination as the asset directory and storeName as the
+// model name. An empty storeName is replaced by "default". The backend and
+// error are returned exactly as BackendLoader produced them.
+func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string) (grpc.Backend, error) {
+	if storeName == "" {
+		storeName = "default"
+	}
+
+	sc := []model.Option{
+		model.WithBackendString(model.LocalStoreBackend),
+		model.WithAssetDir(appConfig.AssetsDestination),
+		model.WithModel(storeName),
+	}
+
+	return sl.BackendLoader(sc...)
+}
+
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -4,25 +4,24 @@ import (
 	"context"
 	"fmt"

-	config "github.com/go-skynet/LocalAI/core/config"
+	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/core/schema"

-	"github.com/go-skynet/LocalAI/core/options"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*schema.Result, error) {
+func ModelTranscription(audio, language string, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.Result, error) {

-	opts := modelOpts(c, o, []model.Option{
+	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(model.WhisperBackend),
-		model.WithModel(c.Model),
-		model.WithContext(o.Context),
-		model.WithThreads(uint32(c.Threads)),
-		model.WithAssetDir(o.AssetsDestination),
+		model.WithModel(backendConfig.Model),
+		model.WithContext(appConfig.Context),
+		model.WithThreads(uint32(*backendConfig.Threads)),
+		model.WithAssetDir(appConfig.AssetsDestination),
 	})

-	whisperModel, err := o.Loader.BackendLoader(opts...)
+	whisperModel, err := ml.BackendLoader(opts...)
 	if err != nil {
 		return nil, err
 	}
@@ -34,6 +33,6 @@ func ModelTranscription(audio, language string, loader *model.ModelLoader, c con
 	return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
 		Dst:      audio,
 		Language: language,
-		Threads:  uint32(c.Threads),
+		Threads:  uint32(*backendConfig.Threads),
 	})
 }
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -6,8 +6,8 @@ import (
 	"os"
 	"path/filepath"

-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/config"
+
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
@@ -29,53 +29,59 @@ func generateUniqueFileName(dir, baseName, ext string) string {
 	}
 }

-func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option, c config.Config) (string, *proto.Result, error) {
+func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
 	bb := backend
 	if bb == "" {
 		bb = model.PiperBackend
 	}

-	grpcOpts := gRPCModelOpts(c)
+	grpcOpts := gRPCModelOpts(backendConfig)

-	opts := modelOpts(config.Config{}, o, []model.Option{
+	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
 		model.WithBackendString(bb),
 		model.WithModel(modelFile),
-		model.WithContext(o.Context),
-		model.WithAssetDir(o.AssetsDestination),
+		model.WithContext(appConfig.Context),
+		model.WithAssetDir(appConfig.AssetsDestination),
 		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 	})
-	piperModel, err := o.Loader.BackendLoader(opts...)
+	ttsModel, err := loader.BackendLoader(opts...)
 	if err != nil {
 		return "", nil, err
 	}

-	if piperModel == nil {
+	if ttsModel == nil {
 		return "", nil, fmt.Errorf("could not load piper model")
 	}

-	if err := os.MkdirAll(o.AudioDir, 0755); err != nil {
+	if err := os.MkdirAll(appConfig.AudioDir, 0755); err != nil {
 		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
 	}

-	fileName := generateUniqueFileName(o.AudioDir, "piper", ".wav")
-	filePath := filepath.Join(o.AudioDir, fileName)
+	fileName := generateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
+	filePath := filepath.Join(appConfig.AudioDir, fileName)

 	// If the model file is not empty, we pass it joined with the model path
 	modelPath := ""
 	if modelFile != "" {
-		if bb != model.TransformersMusicGen {
-			modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
-			if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
+		// If the model file is not empty, we pass it joined with the model path
+		// Checking first that it exists and is not outside ModelPath
+		// TODO: we should actually first check if the modelFile is looking like
+		// a FS path
+		mp := filepath.Join(loader.ModelPath, modelFile)
+		if _, err := os.Stat(mp); err == nil {
+			if err := utils.VerifyPath(mp, appConfig.ModelPath); err != nil {
 				return "", nil, err
 			}
+			modelPath = mp
 		} else {
 			modelPath = modelFile
 		}
 	}

-	res, err := piperModel.TTS(context.Background(), &proto.TTSRequest{
+	res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
 		Text:  text,
 		Model: modelPath,
+		Voice: voice,
 		Dst:   filePath,
 	})

--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -1,4 +1,4 @@
-package options
+package config

 import (
 	"context"
@@ -6,16 +6,14 @@ import (
 	"encoding/json"
 	"time"

-	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
-	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )

-type Option struct {
+type ApplicationConfig struct {
 	Context                             context.Context
 	ConfigFile                          string
-	Loader                              *model.ModelLoader
+	ModelPath                           string
 	UploadLimitMB, Threads, ContextSize int
 	F16                                 bool
 	Debug, DisableMessage               bool
@@ -27,7 +25,6 @@ type Option struct {
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
 	ApiKeys                             []string
-	Metrics                             *metrics.Metrics

 	ModelLibraryURL string

@@ -52,10 +49,10 @@ type Option struct {
 	WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
 }

-type AppOption func(*Option)
+type AppOption func(*ApplicationConfig)

-func NewOptions(o ...AppOption) *Option {
-	opt := &Option{
+func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
+	opt := &ApplicationConfig{
 		Context:        context.Background(),
 		UploadLimitMB:  15,
 		Threads:        1,
@@ -70,63 +67,69 @@ func NewOptions(o ...AppOption) *Option {
 }

 func WithModelsURL(urls ...string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.ModelsURL = urls
 	}
 }

+// WithModelPath returns an AppOption that sets the ModelPath field of the
+// ApplicationConfig it is applied to.
+func WithModelPath(path string) AppOption {
+	return func(appCfg *ApplicationConfig) {
+		appCfg.ModelPath = path
+	}
+}
+
 func WithCors(b bool) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.CORS = b
 	}
 }

 func WithModelLibraryURL(url string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.ModelLibraryURL = url
 	}
 }

-var EnableWatchDog = func(o *Option) {
+var EnableWatchDog = func(o *ApplicationConfig) {
 	o.WatchDog = true
 }

-var EnableWatchDogIdleCheck = func(o *Option) {
+var EnableWatchDogIdleCheck = func(o *ApplicationConfig) {
 	o.WatchDog = true
 	o.WatchDogIdle = true
 }

-var EnableWatchDogBusyCheck = func(o *Option) {
+var EnableWatchDogBusyCheck = func(o *ApplicationConfig) {
 	o.WatchDog = true
 	o.WatchDogBusy = true
 }

 func SetWatchDogBusyTimeout(t time.Duration) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.WatchDogBusyTimeout = t
 	}
 }

 func SetWatchDogIdleTimeout(t time.Duration) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.WatchDogIdleTimeout = t
 	}
 }

-var EnableSingleBackend = func(o *Option) {
+var EnableSingleBackend = func(o *ApplicationConfig) {
 	o.SingleBackend = true
 }

-var EnableParallelBackendRequests = func(o *Option) {
+var EnableParallelBackendRequests = func(o *ApplicationConfig) {
 	o.ParallelBackendRequests = true
 }

-var EnableGalleriesAutoload = func(o *Option) {
+var EnableGalleriesAutoload = func(o *ApplicationConfig) {
 	o.AutoloadGalleries = true
 }

 func WithExternalBackend(name string, uri string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		if o.ExternalGRPCBackends == nil {
 			o.ExternalGRPCBackends = make(map[string]string)
 		}
@@ -135,27 +138,26 @@ func WithExternalBackend(name string, uri string) AppOption {
 }

 func WithCorsAllowOrigins(b string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.CORSAllowOrigins = b
 	}
 }

 func WithBackendAssetsOutput(out string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.AssetsDestination = out
 	}
 }

 func WithBackendAssets(f embed.FS) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.BackendAssets = f
 	}
 }

 func WithStringGalleries(galls string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		if galls == "" {
-			log.Debug().Msgf("no galleries to load")
 			o.Galleries = []gallery.Gallery{}
 			return
 		}
@@ -168,102 +170,111 @@ func WithStringGalleries(galls string) AppOption {
 }

 func WithGalleries(galleries []gallery.Gallery) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.Galleries = append(o.Galleries, galleries...)
 	}
 }

 func WithContext(ctx context.Context) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.Context = ctx
 	}
 }

 func WithYAMLConfigPreload(configFile string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.PreloadModelsFromPath = configFile
 	}
 }

 func WithJSONStringPreload(configFile string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.PreloadJSONModels = configFile
 	}
 }
 func WithConfigFile(configFile string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.ConfigFile = configFile
 	}
 }

-func WithModelLoader(loader *model.ModelLoader) AppOption {
-	return func(o *Option) {
-		o.Loader = loader
-	}
-}
-
 func WithUploadLimitMB(limit int) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.UploadLimitMB = limit
 	}
 }

 func WithThreads(threads int) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.Threads = threads
 	}
 }

 func WithContextSize(ctxSize int) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.ContextSize = ctxSize
 	}
 }

 func WithF16(f16 bool) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.F16 = f16
 	}
 }

 func WithDebug(debug bool) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.Debug = debug
 	}
 }

 func WithDisableMessage(disableMessage bool) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.DisableMessage = disableMessage
 	}
 }

 func WithAudioDir(audioDir string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.AudioDir = audioDir
 	}
 }

 func WithImageDir(imageDir string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.ImageDir = imageDir
 	}
 }

 func WithUploadDir(uploadDir string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.UploadDir = uploadDir
 	}
 }

 func WithApiKeys(apiKeys []string) AppOption {
-	return func(o *Option) {
+	return func(o *ApplicationConfig) {
 		o.ApiKeys = apiKeys
 	}
 }

-func WithMetrics(meter *metrics.Metrics) AppOption {
-	return func(o *Option) {
-		o.Metrics = meter
+// ToConfigLoaderOptions returns the ConfigLoaderOption values derived from the application config.
+// Some options defined at the application level are passed down as defaults for
+// every model configuration.
+// These are the context size, the debug flag, the f16 flag and the number of threads.
+// If a model's own config file does not set one of these values,
+// the value defined here is used instead.
+func (o *ApplicationConfig) ToConfigLoaderOptions() []ConfigLoaderOption {
+	return []ConfigLoaderOption{
+		LoadOptionContextSize(o.ContextSize),
+		LoadOptionDebug(o.Debug),
+		LoadOptionF16(o.F16),
+		LoadOptionThreads(o.Threads),
 	}
 }
+
+// func WithMetrics(meter *metrics.Metrics) AppOption {
+// 	return func(o *StartupOptions) {
+// 		o.Metrics = meter
+// 	}
+// }
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -4,24 +4,28 @@ import (
 	"errors"
 	"fmt"
 	"io/fs"
+	"math/rand"
 	"os"
 	"path/filepath"
 	"strings"
 	"sync"

+	"github.com/go-skynet/LocalAI/core/schema"
 	"github.com/go-skynet/LocalAI/pkg/downloader"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
 	"gopkg.in/yaml.v3"
+
+	"github.com/charmbracelet/glamour"
 )

-type Config struct {
-	PredictionOptions `yaml:"parameters"`
-	Name              string `yaml:"name"`
+type BackendConfig struct {
+	schema.PredictionOptions `yaml:"parameters"`
+	Name                     string `yaml:"name"`

-	F16            bool              `yaml:"f16"`
-	Threads        int               `yaml:"threads"`
-	Debug          bool              `yaml:"debug"`
+	F16            *bool             `yaml:"f16"`
+	Threads        *int              `yaml:"threads"`
+	Debug          *bool             `yaml:"debug"`
 	Roles          map[string]string `yaml:"roles"`
 	Embeddings     bool              `yaml:"embeddings"`
 	Backend        string            `yaml:"backend"`
@@ -104,29 +108,34 @@ type LLMConfig struct {
 	PromptCachePath string   `yaml:"prompt_cache_path"`
 	PromptCacheAll  bool     `yaml:"prompt_cache_all"`
 	PromptCacheRO   bool     `yaml:"prompt_cache_ro"`
-	MirostatETA     float64  `yaml:"mirostat_eta"`
-	MirostatTAU     float64  `yaml:"mirostat_tau"`
-	Mirostat        int      `yaml:"mirostat"`
-	NGPULayers      int      `yaml:"gpu_layers"`
-	MMap            bool     `yaml:"mmap"`
-	MMlock          bool     `yaml:"mmlock"`
-	LowVRAM         bool     `yaml:"low_vram"`
+	MirostatETA     *float64 `yaml:"mirostat_eta"`
+	MirostatTAU     *float64 `yaml:"mirostat_tau"`
+	Mirostat        *int     `yaml:"mirostat"`
+	NGPULayers      *int     `yaml:"gpu_layers"`
+	MMap            *bool    `yaml:"mmap"`
+	MMlock          *bool    `yaml:"mmlock"`
+	LowVRAM         *bool    `yaml:"low_vram"`
 	Grammar         string   `yaml:"grammar"`
 	StopWords       []string `yaml:"stopwords"`
 	Cutstrings      []string `yaml:"cutstrings"`
 	TrimSpace       []string `yaml:"trimspace"`
 	TrimSuffix      []string `yaml:"trimsuffix"`

-	ContextSize  int     `yaml:"context_size"`
-	NUMA         bool    `yaml:"numa"`
-	LoraAdapter  string  `yaml:"lora_adapter"`
-	LoraBase     string  `yaml:"lora_base"`
-	LoraScale    float32 `yaml:"lora_scale"`
-	NoMulMatQ    bool    `yaml:"no_mulmatq"`
-	DraftModel   string  `yaml:"draft_model"`
-	NDraft       int32   `yaml:"n_draft"`
-	Quantization string  `yaml:"quantization"`
-	MMProj       string  `yaml:"mmproj"`
+	ContextSize          *int    `yaml:"context_size"`
+	NUMA                 bool    `yaml:"numa"`
+	LoraAdapter          string  `yaml:"lora_adapter"`
+	LoraBase             string  `yaml:"lora_base"`
+	LoraScale            float32 `yaml:"lora_scale"`
+	NoMulMatQ            bool    `yaml:"no_mulmatq"`
+	DraftModel           string  `yaml:"draft_model"`
+	NDraft               int32   `yaml:"n_draft"`
+	Quantization         string  `yaml:"quantization"`
+	GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
+	TrustRemoteCode      bool    `yaml:"trust_remote_code"`      // vLLM
+	EnforceEager         bool    `yaml:"enforce_eager"`          // vLLM
+	SwapSpace            int     `yaml:"swap_space"`             // vLLM
+	MaxModelLen          int     `yaml:"max_model_len"`          // vLLM
+	MMProj               string  `yaml:"mmproj"`

 	RopeScaling string `yaml:"rope_scaling"`
 	ModelType   string `yaml:"type"`
@@ -159,108 +168,216 @@ type TemplateConfig struct {
 	Functions   string `yaml:"function"`
 }

-type ConfigLoader struct {
-	configs map[string]Config
-	sync.Mutex
-}
-
-func (c *Config) SetFunctionCallString(s string) {
+func (c *BackendConfig) SetFunctionCallString(s string) {
 	c.functionCallString = s
 }

-func (c *Config) SetFunctionCallNameString(s string) {
+func (c *BackendConfig) SetFunctionCallNameString(s string) {
 	c.functionCallNameString = s
 }

-func (c *Config) ShouldUseFunctions() bool {
+func (c *BackendConfig) ShouldUseFunctions() bool {
 	return ((c.functionCallString != "none" || c.functionCallString == "") || c.ShouldCallSpecificFunction())
 }

-func (c *Config) ShouldCallSpecificFunction() bool {
+func (c *BackendConfig) ShouldCallSpecificFunction() bool {
 	return len(c.functionCallNameString) > 0
 }

-func (c *Config) FunctionToCall() string {
+func (c *BackendConfig) FunctionToCall() string {
 	return c.functionCallNameString
 }

-// Load a config file for a model
-func Load(modelName, modelPath string, cm *ConfigLoader, debug bool, threads, ctx int, f16 bool) (*Config, error) {
-	// Load a config file if present after the model name
-	modelConfig := filepath.Join(modelPath, modelName+".yaml")
+func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
+	lo := &LoadOptions{}
+	lo.Apply(opts...)

-	var cfg *Config
+	ctx := lo.ctxSize
+	threads := lo.threads
+	f16 := lo.f16
+	debug := lo.debug
+	defaultTopP := 0.7
+	defaultTopK := 80
+	defaultTemp := 0.9
+	defaultMaxTokens := 2048
+	defaultMirostat := 2
+	defaultMirostatTAU := 5.0
+	defaultMirostatETA := 0.1

-	defaults := func() {
-		cfg = DefaultConfig(modelName)
-		cfg.ContextSize = ctx
-		cfg.Threads = threads
-		cfg.F16 = f16
-		cfg.Debug = debug
+	// Try to offload all GPU layers (if GPU is found)
+	defaultNGPULayers := 99999999
+
+	trueV := true
+	falseV := false
+
+	if cfg.Seed == nil {
+		// No seed configured: pick a random one
+		defaultSeed := int(rand.Int31())
+		cfg.Seed = &defaultSeed
 	}

-	cfgExisting, exists := cm.GetConfig(modelName)
-	if !exists {
+	if cfg.TopK == nil {
+		cfg.TopK = &defaultTopK
+	}
+
+	if cfg.MMap == nil {
+		// MMap is enabled by default
+		cfg.MMap = &trueV
+	}
+
+	if cfg.MMlock == nil {
+		// MMlock is disabled by default
+		cfg.MMlock = &falseV
+	}
+
+	if cfg.TopP == nil {
+		cfg.TopP = &defaultTopP
+	}
+	if cfg.Temperature == nil {
+		cfg.Temperature = &defaultTemp
+	}
+
+	if cfg.Maxtokens == nil {
+		cfg.Maxtokens = &defaultMaxTokens
+	}
+
+	if cfg.Mirostat == nil {
+		cfg.Mirostat = &defaultMirostat
+	}
+
+	if cfg.MirostatETA == nil {
+		cfg.MirostatETA = &defaultMirostatETA
+	}
+
+	if cfg.MirostatTAU == nil {
+		cfg.MirostatTAU = &defaultMirostatTAU
+	}
+	if cfg.NGPULayers == nil {
+		cfg.NGPULayers = &defaultNGPULayers
+	}
+
+	if cfg.LowVRAM == nil {
+		cfg.LowVRAM = &falseV
+	}
+
+	// Values passed from the application level act as defaults for fields the model
+	// config leaves unset; a zero context size or thread count falls back to 1024 or 4.
+	if ctx == 0 {
+		ctx = 1024
+	}
+
+	if cfg.ContextSize == nil {
+		cfg.ContextSize = &ctx
+	}
+
+	if threads == 0 {
+		// Threads can't be 0
+		threads = 4
+	}
+
+	if cfg.Threads == nil {
+		cfg.Threads = &threads
+	}
+
+	if cfg.F16 == nil {
+		cfg.F16 = &f16
+	}
+
+	if cfg.Debug == nil {
+		cfg.Debug = &falseV
+	}
+
+	if debug {
+		cfg.Debug = &trueV
+	}
+}
+
+////// Config Loader ////////
+
+type BackendConfigLoader struct {
+	configs map[string]BackendConfig
+	sync.Mutex
+}
+
+type LoadOptions struct {
+	debug            bool
+	threads, ctxSize int
+	f16              bool
+}
+
+func LoadOptionDebug(debug bool) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.debug = debug
+	}
+}
+
+func LoadOptionThreads(threads int) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.threads = threads
+	}
+}
+
+func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.ctxSize = ctxSize
+	}
+}
+
+func LoadOptionF16(f16 bool) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.f16 = f16
+	}
+}
+
+type ConfigLoaderOption func(*LoadOptions)
+
+func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
+	for _, l := range options {
+		l(lo)
+	}
+}
+
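+// LoadBackendConfigFileByName returns the config for modelName, loading modelName+".yaml" from modelPath when it is not already loaded, and applies defaults.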
+func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
+
+	// Default config carrying only the model name; replaced below if a config is already loaded or found on disk
+	cfg := &BackendConfig{
+		PredictionOptions: schema.PredictionOptions{
+			Model: modelName,
+		},
+	}
+
+	cfgExisting, exists := cl.GetBackendConfig(modelName)
+	if exists {
+		cfg = &cfgExisting
+	} else {
+		// Try loading a model config file
+		modelConfig := filepath.Join(modelPath, modelName+".yaml")
 		if _, err := os.Stat(modelConfig); err == nil {
-			if err := cm.LoadConfig(modelConfig); err != nil {
+			if err := cl.LoadBackendConfig(
+				modelConfig, opts...,
+			); err != nil {
 				return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
 			}
-			cfgExisting, exists = cm.GetConfig(modelName)
+			cfgExisting, exists = cl.GetBackendConfig(modelName)
 			if exists {
 				cfg = &cfgExisting
-			} else {
-				defaults()
 			}
-		} else {
-			defaults()
-		}
-	} else {
-		cfg = &cfgExisting
-	}
-
-	// Set the parameters for the language model prediction
-	//updateConfig(cfg, input)
-
-	// Don't allow 0 as setting
-	if cfg.Threads == 0 {
-		if threads != 0 {
-			cfg.Threads = threads
-		} else {
-			cfg.Threads = 4
 		}
 	}

-	// Enforce debug flag if passed from CLI
-	if debug {
-		cfg.Debug = true
-	}
+	cfg.SetDefaults(opts...)

 	return cfg, nil
 }

-func defaultPredictOptions(modelFile string) PredictionOptions {
-	return PredictionOptions{
-		TopP:        0.7,
-		TopK:        80,
-		Maxtokens:   512,
-		Temperature: 0.9,
-		Model:       modelFile,
+func NewBackendConfigLoader() *BackendConfigLoader {
+	return &BackendConfigLoader{
+		configs: make(map[string]BackendConfig),
 	}
 }
-
-func DefaultConfig(modelFile string) *Config {
-	return &Config{
-		PredictionOptions: defaultPredictOptions(modelFile),
-	}
-}
-
-func NewConfigLoader() *ConfigLoader {
-	return &ConfigLoader{
-		configs: make(map[string]Config),
-	}
-}
-func ReadConfigFile(file string) ([]*Config, error) {
-	c := &[]*Config{}
+func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
+	c := &[]*BackendConfig{}
 	f, err := os.ReadFile(file)
 	if err != nil {
 		return nil, fmt.Errorf("cannot read config file: %w", err)
@@ -269,11 +386,18 @@ func ReadConfigFile(file string) ([]*Config, error) {
 		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
 	}

+	for _, cc := range *c {
+		cc.SetDefaults(opts...)
+	}
+
 	return *c, nil
 }

-func ReadConfig(file string) (*Config, error) {
-	c := &Config{}
+func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
+	lo := &LoadOptions{}
+	lo.Apply(opts...)
+
+	c := &BackendConfig{}
 	f, err := os.ReadFile(file)
 	if err != nil {
 		return nil, fmt.Errorf("cannot read config file: %w", err)
@@ -282,13 +406,14 @@ func ReadConfig(file string) (*Config, error) {
 		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
 	}

+	c.SetDefaults(opts...)
 	return c, nil
 }

-func (cm *ConfigLoader) LoadConfigFile(file string) error {
+func (cm *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
 	cm.Lock()
 	defer cm.Unlock()
-	c, err := ReadConfigFile(file)
+	c, err := ReadBackendConfigFile(file, opts...)
 	if err != nil {
 		return fmt.Errorf("cannot load config file: %w", err)
 	}
@@ -299,49 +424,49 @@ func (cm *ConfigLoader) LoadConfigFile(file string) error {
 	return nil
 }

-func (cm *ConfigLoader) LoadConfig(file string) error {
-	cm.Lock()
-	defer cm.Unlock()
-	c, err := ReadConfig(file)
+func (cl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
+	cl.Lock()
+	defer cl.Unlock()
+	c, err := ReadBackendConfig(file, opts...)
 	if err != nil {
 		return fmt.Errorf("cannot read config file: %w", err)
 	}

-	cm.configs[c.Name] = *c
+	cl.configs[c.Name] = *c
 	return nil
 }

-func (cm *ConfigLoader) GetConfig(m string) (Config, bool) {
-	cm.Lock()
-	defer cm.Unlock()
-	v, exists := cm.configs[m]
+func (cl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
+	cl.Lock()
+	defer cl.Unlock()
+	v, exists := cl.configs[m]
 	return v, exists
 }

-func (cm *ConfigLoader) GetAllConfigs() []Config {
-	cm.Lock()
-	defer cm.Unlock()
-	var res []Config
-	for _, v := range cm.configs {
+func (cl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
+	cl.Lock()
+	defer cl.Unlock()
+	var res []BackendConfig
+	for _, v := range cl.configs {
 		res = append(res, v)
 	}
 	return res
 }

-func (cm *ConfigLoader) ListConfigs() []string {
-	cm.Lock()
-	defer cm.Unlock()
+func (cl *BackendConfigLoader) ListBackendConfigs() []string {
+	cl.Lock()
+	defer cl.Unlock()
 	var res []string
-	for k := range cm.configs {
+	for k := range cl.configs {
 		res = append(res, k)
 	}
 	return res
 }

 // Preload prepare models if they are not local but url or huggingface repositories
-func (cm *ConfigLoader) Preload(modelPath string) error {
-	cm.Lock()
-	defer cm.Unlock()
+func (cl *BackendConfigLoader) Preload(modelPath string) error {
+	cl.Lock()
+	defer cl.Unlock()

 	status := func(fileName, current, total string, percent float64) {
 		utils.DisplayDownloadFunction(fileName, current, total, percent)
@@ -349,7 +474,21 @@ func (cm *ConfigLoader) Preload(modelPath string) error {

 	log.Info().Msgf("Preloading models from %s", modelPath)

-	for i, config := range cm.configs {
+	renderMode := "dark"
+	if os.Getenv("COLOR") != "" {
+		renderMode = os.Getenv("COLOR")
+	}
+
+	glamText := func(t string) {
+		out, err := glamour.Render(t, renderMode)
+		if err == nil && os.Getenv("NO_COLOR") == "" {
+			fmt.Println(out)
+		} else {
+			fmt.Println(t)
+		}
+	}
+
+	for i, config := range cl.configs {

 		// Download files and verify their SHA
 		for _, file := range config.DownloadFiles {
@@ -381,25 +520,29 @@ func (cm *ConfigLoader) Preload(modelPath string) error {
 				}
 			}

-			cc := cm.configs[i]
+			cc := cl.configs[i]
 			c := &cc
 			c.PredictionOptions.Model = md5Name
-			cm.configs[i] = *c
+			cl.configs[i] = *c
 		}
-		if cm.configs[i].Name != "" {
-			log.Info().Msgf("Model name: %s", cm.configs[i].Name)
+		if cl.configs[i].Name != "" {
+			glamText(fmt.Sprintf("**Model name**: _%s_", cl.configs[i].Name))
 		}
-		if cm.configs[i].Description != "" {
-			log.Info().Msgf("Model description: %s", cm.configs[i].Description)
+		if cl.configs[i].Description != "" {
+			//glamText("**Description**")
+			glamText(cl.configs[i].Description)
 		}
-		if cm.configs[i].Usage != "" {
-			log.Info().Msgf("Model usage: \n%s", cm.configs[i].Usage)
+		if cl.configs[i].Usage != "" {
+			//glamText("**Usage**")
+			glamText(cl.configs[i].Usage)
 		}
 	}
 	return nil
 }

-func (cm *ConfigLoader) LoadConfigs(path string) error {
+// LoadBackendConfigsFromPath reads all the configurations of the models from a path
+// (non-recursive)
+func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
 	cm.Lock()
 	defer cm.Unlock()
 	entries, err := os.ReadDir(path)
@@ -419,7 +562,7 @@ func (cm *ConfigLoader) LoadConfigs(path string) error {
 		if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
 			continue
 		}
-		c, err := ReadConfig(filepath.Join(path, file.Name()))
+		c, err := ReadBackendConfig(filepath.Join(path, file.Name()), opts...)
 		if err == nil {
 			cm.configs[c.Name] = *c
 		}
--- a/core/config/config_test.go
+++ b/core/config/config_test.go
@@ -4,8 +4,7 @@ import (
 	"os"

 	. "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
-	"github.com/go-skynet/LocalAI/pkg/model"
+
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
@@ -19,7 +18,7 @@ var _ = Describe("Test cases for config related functions", func() {
 	Context("Test Read configuration functions", func() {
 		configFile = os.Getenv("CONFIG_FILE")
 		It("Test ReadConfigFile", func() {
-			config, err := ReadConfigFile(configFile)
+			config, err := ReadBackendConfigFile(configFile)
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			// two configs in config.yaml
@@ -28,29 +27,26 @@ var _ = Describe("Test cases for config related functions", func() {
 		})

 		It("Test LoadConfigs", func() {
-			cm := NewConfigLoader()
-			opts := options.NewOptions()
-			modelLoader := model.NewModelLoader(os.Getenv("MODELS_PATH"))
-			options.WithModelLoader(modelLoader)(opts)
-
-			err := cm.LoadConfigs(opts.Loader.ModelPath)
+			cm := NewBackendConfigLoader()
+			opts := NewApplicationConfig()
+			err := cm.LoadBackendConfigsFromPath(opts.ModelPath)
 			Expect(err).To(BeNil())
-			Expect(cm.ListConfigs()).ToNot(BeNil())
+			Expect(cm.ListBackendConfigs()).ToNot(BeNil())

 			// config should includes gpt4all models's api.config
-			Expect(cm.ListConfigs()).To(ContainElements("gpt4all"))
+			Expect(cm.ListBackendConfigs()).To(ContainElements("gpt4all"))

 			// config should includes gpt2 models's api.config
-			Expect(cm.ListConfigs()).To(ContainElements("gpt4all-2"))
+			Expect(cm.ListBackendConfigs()).To(ContainElements("gpt4all-2"))

 			// config should includes text-embedding-ada-002 models's api.config
-			Expect(cm.ListConfigs()).To(ContainElements("text-embedding-ada-002"))
+			Expect(cm.ListBackendConfigs()).To(ContainElements("text-embedding-ada-002"))

 			// config should includes rwkv_test models's api.config
-			Expect(cm.ListConfigs()).To(ContainElements("rwkv_test"))
+			Expect(cm.ListBackendConfigs()).To(ContainElements("rwkv_test"))

 			// config should includes whisper-1 models's api.config
-			Expect(cm.ListConfigs()).To(ContainElements("whisper-1"))
+			Expect(cm.ListBackendConfigs()).To(ContainElements("whisper-1"))
 		})
 	})
 })
--- a/core/http/api.go
+++ b/core/http/api.go
@@ -3,122 +3,48 @@ package http
 import (
 	"encoding/json"
 	"errors"
-	"fmt"
 	"os"
 	"strings"

-	"github.com/go-skynet/LocalAI/api/localai"
-	"github.com/go-skynet/LocalAI/api/openai"
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/http/endpoints/elevenlabs"
+	"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
+	"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
+
+	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/go-skynet/LocalAI/core/services"
 	"github.com/go-skynet/LocalAI/internal"
-	"github.com/go-skynet/LocalAI/metrics"
-	"github.com/go-skynet/LocalAI/pkg/assets"
 	"github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/startup"

 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/cors"
 	"github.com/gofiber/fiber/v2/middleware/logger"
 	"github.com/gofiber/fiber/v2/middleware/recover"
-	"github.com/rs/zerolog"
-	"github.com/rs/zerolog/log"
 )

-func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader, error) {
-	options := options.NewOptions(opts...)
+func readAuthHeader(c *fiber.Ctx) string {
+	authHeader := c.Get("Authorization")

-	zerolog.SetGlobalLevel(zerolog.InfoLevel)
-	if options.Debug {
-		zerolog.SetGlobalLevel(zerolog.DebugLevel)
+	// elevenlabs
+	xApiKey := c.Get("xi-api-key")
+	if xApiKey != "" {
+		authHeader = "Bearer " + xApiKey
 	}

-	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
-	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
-
-	startup.PreloadModelsConfigurations(options.ModelLibraryURL, options.Loader.ModelPath, options.ModelsURL...)
-
-	cl := config.NewConfigLoader()
-	if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
-		log.Error().Msgf("error loading config files: %s", err.Error())
+	// anthropic
+	xApiKey = c.Get("x-api-key")
+	if xApiKey != "" {
+		authHeader = "Bearer " + xApiKey
 	}

-	if options.ConfigFile != "" {
-		if err := cl.LoadConfigFile(options.ConfigFile); err != nil {
-			log.Error().Msgf("error loading config file: %s", err.Error())
-		}
-	}
-
-	if err := cl.Preload(options.Loader.ModelPath); err != nil {
-		log.Error().Msgf("error downloading models: %s", err.Error())
-	}
-
-	if options.PreloadJSONModels != "" {
-		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
-			return nil, nil, err
-		}
-	}
-
-	if options.PreloadModelsFromPath != "" {
-		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
-			return nil, nil, err
-		}
-	}
-
-	if options.Debug {
-		for _, v := range cl.ListConfigs() {
-			cfg, _ := cl.GetConfig(v)
-			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
-		}
-	}
-
-	if options.AssetsDestination != "" {
-		// Extract files from the embedded FS
-		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
-		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
-		if err != nil {
-			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
-		}
-	}
-
-	// turn off any process that was started by GRPC if the context is canceled
-	go func() {
-		<-options.Context.Done()
-		log.Debug().Msgf("Context canceled, shutting down")
-		options.Loader.StopAllGRPC()
-	}()
-
-	if options.WatchDog {
-		wd := model.NewWatchDog(
-			options.Loader,
-			options.WatchDogBusyTimeout,
-			options.WatchDogIdleTimeout,
-			options.WatchDogBusy,
-			options.WatchDogIdle)
-		options.Loader.SetWatchDog(wd)
-		go wd.Run()
-		go func() {
-			<-options.Context.Done()
-			log.Debug().Msgf("Context canceled, shutting down")
-			wd.Shutdown()
-		}()
-	}
-
-	return options, cl, nil
+	return authHeader
 }

-func App(opts ...options.AppOption) (*fiber.App, error) {
-
-	options, cl, err := Startup(opts...)
-	if err != nil {
-		return nil, fmt.Errorf("failed basic startup tasks with error %s", err.Error())
-	}
-
+func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*fiber.App, error) {
 	// Return errors as JSON responses
 	app := fiber.New(fiber.Config{
-		BodyLimit:             options.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
-		DisableStartupMessage: options.DisableMessage,
+		BodyLimit:             appConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
+		DisableStartupMessage: appConfig.DisableMessage,
 		// Override default error handler
 		ErrorHandler: func(ctx *fiber.Ctx, err error) error {
 			// Status code defaults to 500
@@ -139,7 +65,7 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		},
 	})

-	if options.Debug {
+	if appConfig.Debug {
 		app.Use(logger.New(logger.Config{
 			Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
 		}))
@@ -147,17 +73,25 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 	// Default middleware config

-	if !options.Debug {
+	if !appConfig.Debug {
 		app.Use(recover.New())
 	}

-	if options.Metrics != nil {
-		app.Use(metrics.APIMiddleware(options.Metrics))
+	metricsService, err := services.NewLocalAIMetricsService()
+	if err != nil {
+		return nil, err
+	}
+
+	if metricsService != nil {
+		app.Use(localai.LocalAIMetricsAPIMiddleware(metricsService))
+		app.Hooks().OnShutdown(func() error {
+			return metricsService.Shutdown()
+		})
 	}

 	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
 	auth := func(c *fiber.Ctx) error {
-		if len(options.ApiKeys) == 0 {
+		if len(appConfig.ApiKeys) == 0 {
 			return c.Next()
 		}

@@ -172,47 +106,48 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 			}

 			// Add file keys to options.ApiKeys
-			options.ApiKeys = append(options.ApiKeys, fileKeys...)
+			appConfig.ApiKeys = append(appConfig.ApiKeys, fileKeys...)
 		}

-		if len(options.ApiKeys) == 0 {
+		if len(appConfig.ApiKeys) == 0 {
 			return c.Next()
 		}

-		authHeader := c.Get("Authorization")
+		authHeader := readAuthHeader(c)
 		if authHeader == "" {
 			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
 		}
+
+		// Only headers of the form "Bearer <token>" are accepted
 		authHeaderParts := strings.Split(authHeader, " ")
 		if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
 			return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
 		}

 		apiKey := authHeaderParts[1]
-		for _, key := range options.ApiKeys {
+		for _, key := range appConfig.ApiKeys {
 			if apiKey == key {
 				return c.Next()
 			}
 		}

 		return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
-
 	}

-	if options.CORS {
+	if appConfig.CORS {
 		var c func(ctx *fiber.Ctx) error
-		if options.CORSAllowOrigins == "" {
+		if appConfig.CORSAllowOrigins == "" {
 			c = cors.New()
 		} else {
-			c = cors.New(cors.Config{AllowOrigins: options.CORSAllowOrigins})
+			c = cors.New(cors.Config{AllowOrigins: appConfig.CORSAllowOrigins})
 		}

 		app.Use(c)
 	}

 	// LocalAI API endpoints
-	galleryService := localai.NewGalleryService(options.Loader.ModelPath)
-	galleryService.Start(options.Context, cl)
+	galleryService := services.NewGalleryService(appConfig.ModelPath)
+	galleryService.Start(appConfig.Context, cl)

 	app.Get("/version", auth, func(c *fiber.Ctx) error {
 		return c.JSON(struct {
@@ -220,69 +155,75 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		}{Version: internal.PrintableVersion()})
 	})

-	// Make sure directories exists
-	os.MkdirAll(options.ImageDir, 0755)
-	os.MkdirAll(options.AudioDir, 0755)
-	os.MkdirAll(options.UploadDir, 0755)
-	os.MkdirAll(options.Loader.ModelPath, 0755)
-
 	// Load upload json
-	openai.LoadUploadConfig(options.UploadDir)
+	openai.LoadUploadConfig(appConfig.UploadDir)

-	modelGalleryService := localai.CreateModelGalleryService(options.Galleries, options.Loader.ModelPath, galleryService)
-	app.Post("/models/apply", auth, modelGalleryService.ApplyModelGalleryEndpoint())
-	app.Get("/models/available", auth, modelGalleryService.ListModelFromGalleryEndpoint())
-	app.Get("/models/galleries", auth, modelGalleryService.ListModelGalleriesEndpoint())
-	app.Post("/models/galleries", auth, modelGalleryService.AddModelGalleryEndpoint())
-	app.Delete("/models/galleries", auth, modelGalleryService.RemoveModelGalleryEndpoint())
-	app.Get("/models/jobs/:uuid", auth, modelGalleryService.GetOpStatusEndpoint())
-	app.Get("/models/jobs", auth, modelGalleryService.GetAllStatusEndpoint())
+	modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
+	app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
+	app.Get("/models/available", auth, modelGalleryEndpointService.ListModelFromGalleryEndpoint())
+	app.Get("/models/galleries", auth, modelGalleryEndpointService.ListModelGalleriesEndpoint())
+	app.Post("/models/galleries", auth, modelGalleryEndpointService.AddModelGalleryEndpoint())
+	app.Delete("/models/galleries", auth, modelGalleryEndpointService.RemoveModelGalleryEndpoint())
+	app.Get("/models/jobs/:uuid", auth, modelGalleryEndpointService.GetOpStatusEndpoint())
+	app.Get("/models/jobs", auth, modelGalleryEndpointService.GetAllStatusEndpoint())
+
+	app.Post("/tts", auth, localai.TTSEndpoint(cl, ml, appConfig))
+
+	// Elevenlabs
+	app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
+
+	// Stores
+	sl := model.NewModelLoader("")
+	app.Post("/stores/set", auth, localai.StoresSetEndpoint(sl, appConfig))
+	app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(sl, appConfig))
+	app.Post("/stores/get", auth, localai.StoresGetEndpoint(sl, appConfig))
+	app.Post("/stores/find", auth, localai.StoresFindEndpoint(sl, appConfig))

 	// openAI compatible API endpoint

 	// chat
-	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, options))
-	app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, options))
+	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))
+	app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, ml, appConfig))

 	// edit
-	app.Post("/v1/edits", auth, openai.EditEndpoint(cl, options))
-	app.Post("/edits", auth, openai.EditEndpoint(cl, options))
+	app.Post("/v1/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
+	app.Post("/edits", auth, openai.EditEndpoint(cl, ml, appConfig))

 	// files
-	app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, options))
-	app.Post("/files", auth, openai.UploadFilesEndpoint(cl, options))
-	app.Get("/v1/files", auth, openai.ListFilesEndpoint(cl, options))
-	app.Get("/files", auth, openai.ListFilesEndpoint(cl, options))
-	app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(cl, options))
-	app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(cl, options))
-	app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, options))
-	app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, options))
-	app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, options))
-	app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, options))
+	app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
+	app.Post("/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
+	app.Get("/v1/files", auth, openai.ListFilesEndpoint(cl, appConfig))
+	app.Get("/files", auth, openai.ListFilesEndpoint(cl, appConfig))
+	app.Get("/v1/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
+	app.Get("/files/:file_id", auth, openai.GetFilesEndpoint(cl, appConfig))
+	app.Delete("/v1/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
+	app.Delete("/files/:file_id", auth, openai.DeleteFilesEndpoint(cl, appConfig))
+	app.Get("/v1/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))
+	app.Get("/files/:file_id/content", auth, openai.GetFilesContentsEndpoint(cl, appConfig))

 	// completion
-	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, options))
-	app.Post("/completions", auth, openai.CompletionEndpoint(cl, options))
-	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, options))
+	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
+	app.Post("/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))
+	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, ml, appConfig))

 	// embeddings
-	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
-	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
-	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
+	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
+	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))
+	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, ml, appConfig))

 	// audio
-	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, options))
-	app.Post("/tts", auth, localai.TTSEndpoint(cl, options))
+	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, ml, appConfig))
+	app.Post("/v1/audio/speech", auth, localai.TTSEndpoint(cl, ml, appConfig))

 	// images
-	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, options))
+	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, ml, appConfig))

-	if options.ImageDir != "" {
-		app.Static("/generated-images", options.ImageDir)
+	if appConfig.ImageDir != "" {
+		app.Static("/generated-images", appConfig.ImageDir)
 	}

-	if options.AudioDir != "" {
-		app.Static("/generated-audio", options.AudioDir)
+	if appConfig.AudioDir != "" {
+		app.Static("/generated-audio", appConfig.AudioDir)
 	}

 	ok := func(c *fiber.Ctx) error {
@@ -294,15 +235,15 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	app.Get("/readyz", ok)

 	// Experimental Backend Statistics Module
-	backendMonitor := localai.NewBackendMonitor(cl, options) // Split out for now
+	backendMonitor := services.NewBackendMonitor(cl, ml, appConfig) // Split out for now
 	app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitor))
 	app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitor))

 	// models
-	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
-	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
+	app.Get("/v1/models", auth, openai.ListModelsEndpoint(cl, ml))
+	app.Get("/models", auth, openai.ListModelsEndpoint(cl, ml))

-	app.Get("/metrics", metrics.MetricsHandler())
+	app.Get("/metrics", localai.LocalAIMetricsEndpoint())

 	return app, nil
 }
--- a/core/http/api_test.go
+++ b/core/http/api_test.go
@@ -13,9 +13,11 @@ import (
 	"path/filepath"
 	"runtime"

+	"github.com/go-skynet/LocalAI/core/config"
 	. "github.com/go-skynet/LocalAI/core/http"
-	"github.com/go-skynet/LocalAI/core/options"
-	"github.com/go-skynet/LocalAI/metrics"
+	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/go-skynet/LocalAI/core/startup"
+
 	"github.com/go-skynet/LocalAI/pkg/downloader"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/model"
@@ -121,31 +123,108 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
 	return
 }

+// postRequestJSON marshals bodyJson, POSTs it to url as application/json and
+// returns an error when the request fails or the status is outside [200, 400).
+func postRequestJSON[B any](url string, bodyJson *B) error {
+	encoded, err := json.Marshal(bodyJson)
+	if err != nil {
+		return err
+	}
+	GinkgoWriter.Printf("POST %s: %s\n", url, string(encoded))
+
+	request, err := http.NewRequest("POST", url, bytes.NewBuffer(encoded))
+	if err != nil {
+		return err
+	}
+	request.Header.Set("Content-Type", "application/json")
+
+	httpClient := &http.Client{}
+	response, err := httpClient.Do(request)
+	if err != nil {
+		return err
+	}
+	defer response.Body.Close()
+
+	// The body is always read so it can be quoted in the error below.
+	raw, err := io.ReadAll(response.Body)
+	if err != nil {
+		return err
+	}
+
+	// Anything from 200 up to (but excluding) 400 counts as success.
+	if status := response.StatusCode; status < 200 || status >= 400 {
+		return fmt.Errorf("unexpected status code: %d, body: %s", status, string(raw))
+	}
+	return nil
+}
+
+// postRequestResponseJSON POSTs reqJson to url as application/json and decodes
+// the JSON reply into respJson; statuses outside [200, 400) yield an error.
+func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *B2) error {
+	encoded, err := json.Marshal(reqJson)
+	if err != nil {
+		return err
+	}
+	GinkgoWriter.Printf("POST %s: %s\n", url, string(encoded))
+
+	request, err := http.NewRequest("POST", url, bytes.NewBuffer(encoded))
+	if err != nil {
+		return err
+	}
+	request.Header.Set("Content-Type", "application/json")
+
+	httpClient := &http.Client{}
+	response, err := httpClient.Do(request)
+	if err != nil {
+		return err
+	}
+	defer response.Body.Close()
+
+	raw, err := io.ReadAll(response.Body)
+	if err != nil {
+		return err
+	}
+
+	// Reject error statuses before attempting to decode the payload.
+	if status := response.StatusCode; status < 200 || status >= 400 {
+		return fmt.Errorf("unexpected status code: %d, body: %s", status, string(raw))
+	}
+	return json.Unmarshal(raw, respJson)
+}
+
 //go:embed backend-assets/*
 var backendAssets embed.FS

 var _ = Describe("API test", func() {

 	var app *fiber.App
-	var modelLoader *model.ModelLoader
 	var client *openai.Client
 	var client2 *openaigo.Client
 	var c context.Context
 	var cancel context.CancelFunc
 	var tmpdir string
+	var modelDir string
+	var bcl *config.BackendConfigLoader
+	var ml *model.ModelLoader
+	var applicationConfig *config.ApplicationConfig

-	commonOpts := []options.AppOption{
-		options.WithDebug(true),
-		options.WithDisableMessage(true),
+	commonOpts := []config.AppOption{
+		config.WithDebug(true),
+		config.WithDisableMessage(true),
 	}

 	Context("API with ephemeral models", func() {
-		BeforeEach(func() {
+
+		BeforeEach(func(sc SpecContext) {
 			var err error
 			tmpdir, err = os.MkdirTemp("", "")
 			Expect(err).ToNot(HaveOccurred())

-			modelLoader = model.NewModelLoader(tmpdir)
+			modelDir = filepath.Join(tmpdir, "models")
+			backendAssetsDir := filepath.Join(tmpdir, "backend-assets")
+			err = os.Mkdir(backendAssetsDir, 0755)
+			Expect(err).ToNot(HaveOccurred())
+
 			c, cancel = context.WithCancel(context.Background())

 			g := []gallery.GalleryModel{
@@ -172,16 +251,18 @@ var _ = Describe("API test", func() {
 				},
 			}

-			metricsService, err := metrics.SetupMetrics()
+			bcl, ml, applicationConfig, err = startup.Startup(
+				append(commonOpts,
+					config.WithContext(c),
+					config.WithGalleries(galleries),
+					config.WithModelPath(modelDir),
+					config.WithBackendAssets(backendAssets),
+					config.WithBackendAssetsOutput(backendAssetsDir))...)
 			Expect(err).ToNot(HaveOccurred())

-			app, err = App(
-				append(commonOpts,
-					options.WithMetrics(metricsService),
-					options.WithContext(c),
-					options.WithGalleries(galleries),
-					options.WithModelLoader(modelLoader), options.WithBackendAssets(backendAssets), options.WithBackendAssetsOutput(tmpdir))...)
+			app, err = App(bcl, ml, applicationConfig)
 			Expect(err).ToNot(HaveOccurred())
+
 			go app.Listen("127.0.0.1:9090")

 			defaultConfig := openai.DefaultConfig("")
@@ -198,15 +279,21 @@ var _ = Describe("API test", func() {
 			}, "2m").ShouldNot(HaveOccurred())
 		})

-		AfterEach(func() {
+		AfterEach(func(sc SpecContext) {
 			cancel()
-			app.Shutdown()
-			os.RemoveAll(tmpdir)
+			if app != nil {
+				err := app.Shutdown()
+				Expect(err).ToNot(HaveOccurred())
+			}
+			err := os.RemoveAll(tmpdir)
+			Expect(err).ToNot(HaveOccurred())
+			_, err = os.ReadDir(tmpdir)
+			Expect(err).To(HaveOccurred())
 		})

 		Context("Applying models", func() {
-			It("applies models from a gallery", func() {

+			It("applies models from a gallery", func() {
 				models := getModels("http://127.0.0.1:9090/models/available")
 				Expect(len(models)).To(Equal(2), fmt.Sprint(models))
 				Expect(models[0].Installed).To(BeFalse(), fmt.Sprint(models))
@@ -228,10 +315,10 @@ var _ = Describe("API test", func() {
 				}, "360s", "10s").Should(Equal(true))
 				Expect(resp["message"]).ToNot(ContainSubstring("error"))

-				dat, err := os.ReadFile(filepath.Join(tmpdir, "bert2.yaml"))
+				dat, err := os.ReadFile(filepath.Join(modelDir, "bert2.yaml"))
 				Expect(err).ToNot(HaveOccurred())

-				_, err = os.ReadFile(filepath.Join(tmpdir, "foo.yaml"))
+				_, err = os.ReadFile(filepath.Join(modelDir, "foo.yaml"))
 				Expect(err).ToNot(HaveOccurred())

 				content := map[string]interface{}{}
@@ -253,6 +340,7 @@ var _ = Describe("API test", func() {
 				}
 			})
 			It("overrides models", func() {
+
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:  "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
 					Name: "bert",
@@ -270,7 +358,7 @@ var _ = Describe("API test", func() {
 					return response["processed"].(bool)
 				}, "360s", "10s").Should(Equal(true))

-				dat, err := os.ReadFile(filepath.Join(tmpdir, "bert.yaml"))
+				dat, err := os.ReadFile(filepath.Join(modelDir, "bert.yaml"))
 				Expect(err).ToNot(HaveOccurred())

 				content := map[string]interface{}{}
@@ -294,7 +382,7 @@ var _ = Describe("API test", func() {
 					return response["processed"].(bool)
 				}, "360s", "10s").Should(Equal(true))

-				dat, err := os.ReadFile(filepath.Join(tmpdir, "bert.yaml"))
+				dat, err := os.ReadFile(filepath.Join(modelDir, "bert.yaml"))
 				Expect(err).ToNot(HaveOccurred())

 				content := map[string]interface{}{}
@@ -368,7 +456,7 @@ var _ = Describe("API test", func() {
 				var res map[string]string
 				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
 				Expect(err).ToNot(HaveOccurred())
-				Expect(res["location"]).To(Equal("San Francisco, California, United States"), fmt.Sprint(res))
+				Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
 				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))

@@ -483,8 +571,11 @@ var _ = Describe("API test", func() {
 			var err error
 			tmpdir, err = os.MkdirTemp("", "")
 			Expect(err).ToNot(HaveOccurred())
+			modelDir = filepath.Join(tmpdir, "models")
+			backendAssetsDir := filepath.Join(tmpdir, "backend-assets")
+			err = os.Mkdir(backendAssetsDir, 0755)
+			Expect(err).ToNot(HaveOccurred())

-			modelLoader = model.NewModelLoader(tmpdir)
 			c, cancel = context.WithCancel(context.Background())

 			galleries := []gallery.Gallery{
@@ -494,21 +585,20 @@ var _ = Describe("API test", func() {
 				},
 			}

-			metricsService, err := metrics.SetupMetrics()
-			Expect(err).ToNot(HaveOccurred())
-
-			app, err = App(
+			bcl, ml, applicationConfig, err = startup.Startup(
 				append(commonOpts,
-					options.WithContext(c),
-					options.WithMetrics(metricsService),
-					options.WithAudioDir(tmpdir),
-					options.WithImageDir(tmpdir),
-					options.WithGalleries(galleries),
-					options.WithModelLoader(modelLoader),
-					options.WithBackendAssets(backendAssets),
-					options.WithBackendAssetsOutput(tmpdir))...,
+					config.WithContext(c),
+					config.WithAudioDir(tmpdir),
+					config.WithImageDir(tmpdir),
+					config.WithGalleries(galleries),
+					config.WithModelPath(modelDir),
+					config.WithBackendAssets(backendAssets),
+					config.WithBackendAssetsOutput(tmpdir))...,
 			)
 			Expect(err).ToNot(HaveOccurred())
+			app, err = App(bcl, ml, applicationConfig)
+			Expect(err).ToNot(HaveOccurred())
+
 			go app.Listen("127.0.0.1:9090")

 			defaultConfig := openai.DefaultConfig("")
@@ -527,8 +617,14 @@ var _ = Describe("API test", func() {

 		AfterEach(func() {
 			cancel()
-			app.Shutdown()
-			os.RemoveAll(tmpdir)
+			if app != nil {
+				err := app.Shutdown()
+				Expect(err).ToNot(HaveOccurred())
+			}
+			err := os.RemoveAll(tmpdir)
+			Expect(err).ToNot(HaveOccurred())
+			_, err = os.ReadDir(tmpdir)
+			Expect(err).To(HaveOccurred())
 		})
 		It("installs and is capable to run tts", Label("tts"), func() {
 			if runtime.GOOS != "linux" {
@@ -599,20 +695,20 @@ var _ = Describe("API test", func() {

 	Context("API query", func() {
 		BeforeEach(func() {
-			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
+			modelPath := os.Getenv("MODELS_PATH")
 			c, cancel = context.WithCancel(context.Background())

-			metricsService, err := metrics.SetupMetrics()
-			Expect(err).ToNot(HaveOccurred())
+			var err error

-			app, err = App(
+			bcl, ml, applicationConfig, err = startup.Startup(
 				append(commonOpts,
-					options.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
-					options.WithContext(c),
-					options.WithModelLoader(modelLoader),
-					options.WithMetrics(metricsService),
+					config.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
+					config.WithContext(c),
+					config.WithModelPath(modelPath),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
+			app, err = App(bcl, ml, applicationConfig)
+			Expect(err).ToNot(HaveOccurred())
 			go app.Listen("127.0.0.1:9090")

 			defaultConfig := openai.DefaultConfig("")
@@ -630,22 +726,25 @@ var _ = Describe("API test", func() {
 		})
 		AfterEach(func() {
 			cancel()
-			app.Shutdown()
+			if app != nil {
+				err := app.Shutdown()
+				Expect(err).ToNot(HaveOccurred())
+			}
 		})
 		It("returns the models list", func() {
 			models, err := client.ListModels(context.TODO())
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
 		})
-		It("can generate completions", func() {
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: testPrompt})
+		It("can generate completions via ggml", func() {
+			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel.ggml", Prompt: testPrompt})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
 		})

-		It("can generate chat completions ", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
+		It("can generate chat completions via ggml", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel.ggml", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
@@ -807,24 +906,96 @@ var _ = Describe("API test", func() {
 				Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
 			})
 		})
+
+		// See tests/integration/stores_test
+		Context("Stores", Label("stores"), func() {
+
+			It("sets, gets, finds and deletes entries", func() {
+				ks := [][]float32{
+					{0.1, 0.2, 0.3},
+					{0.4, 0.5, 0.6},
+					{0.7, 0.8, 0.9},
+				}
+				vs := []string{
+					"test1",
+					"test2",
+					"test3",
+				}
+				setBody := schema.StoresSet{
+					Keys:   ks,
+					Values: vs,
+				}
+
+				url := "http://127.0.0.1:9090/stores/"
+				err := postRequestJSON(url+"set", &setBody)
+				Expect(err).ToNot(HaveOccurred())
+
+				getBody := schema.StoresGet{
+					Keys: ks,
+				}
+				var getRespBody schema.StoresGetResponse
+				err = postRequestResponseJSON(url+"get", &getBody, &getRespBody)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(getRespBody.Keys)).To(Equal(len(ks)))
+
+				for i, v := range getRespBody.Keys {
+					if v[0] == 0.1 {
+						Expect(getRespBody.Values[i]).To(Equal("test1"))
+					} else if v[0] == 0.4 {
+						Expect(getRespBody.Values[i]).To(Equal("test2"))
+					} else {
+						Expect(getRespBody.Values[i]).To(Equal("test3"))
+					}
+				}
+
+				deleteBody := schema.StoresDelete{
+					Keys: [][]float32{
+						{0.1, 0.2, 0.3},
+					},
+				}
+				err = postRequestJSON(url+"delete", &deleteBody)
+				Expect(err).ToNot(HaveOccurred())
+
+				findBody := schema.StoresFind{
+					Key:  []float32{0.1, 0.3, 0.7},
+					Topk: 10,
+				}
+
+				var findRespBody schema.StoresFindResponse
+				err = postRequestResponseJSON(url+"find", &findBody, &findRespBody)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(findRespBody.Keys)).To(Equal(2))
+
+				for i, v := range findRespBody.Keys {
+					if v[0] == 0.4 {
+						Expect(findRespBody.Values[i]).To(Equal("test2"))
+					} else {
+						Expect(findRespBody.Values[i]).To(Equal("test3"))
+					}
+
+					Expect(findRespBody.Similarities[i]).To(BeNumerically(">=", -1))
+					Expect(findRespBody.Similarities[i]).To(BeNumerically("<=", 1))
+				}
+			})
+		})
 	})

 	Context("Config file", func() {
 		BeforeEach(func() {
-			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
+			modelPath := os.Getenv("MODELS_PATH")
 			c, cancel = context.WithCancel(context.Background())

-			metricsService, err := metrics.SetupMetrics()
-			Expect(err).ToNot(HaveOccurred())
-
-			app, err = App(
+			var err error
+			bcl, ml, applicationConfig, err = startup.Startup(
 				append(commonOpts,
-					options.WithContext(c),
-					options.WithMetrics(metricsService),
-					options.WithModelLoader(modelLoader),
-					options.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
+					config.WithContext(c),
+					config.WithModelPath(modelPath),
+					config.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
 			)
 			Expect(err).ToNot(HaveOccurred())
+			app, err = App(bcl, ml, applicationConfig)
+			Expect(err).ToNot(HaveOccurred())
+
 			go app.Listen("127.0.0.1:9090")

 			defaultConfig := openai.DefaultConfig("")
@@ -840,7 +1011,10 @@ var _ = Describe("API test", func() {
 		})
 		AfterEach(func() {
 			cancel()
-			app.Shutdown()
+			if app != nil {
+				err := app.Shutdown()
+				Expect(err).ToNot(HaveOccurred())
+			}
 		})
 		It("can generate chat completions from config file (list1)", func() {
 			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}}})
--- a/core/http/ctx/fiber.go
+++ b/core/http/ctx/fiber.go
--- a/core/http/endpoints/elevenlabs/tts.go
+++ b/core/http/endpoints/elevenlabs/tts.go
@@ -0,0 +1,55 @@
+package elevenlabs
+
+import (
+	"github.com/go-skynet/LocalAI/core/backend"
+	"github.com/go-skynet/LocalAI/core/config"
+	fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
+	"github.com/go-skynet/LocalAI/pkg/model"
+
+	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+)
+
+// TTSEndpoint returns the handler for ElevenLabs-style text-to-speech requests: the voice
+// comes from the "voice-id" route parameter and the audio file is sent as a download.
+func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(schema.ElevenLabsTTSRequest)
+		voiceID := c.Params("voice-id")
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+
+		// Fall back to the raw model ID when the context cannot resolve one.
+		modelFile, err := fiberContext.ModelFromContext(c, ml, input.ModelID, false)
+		if err != nil {
+			modelFile = input.ModelID
+			log.Warn().Msgf("Model not found in context: %s", input.ModelID)
+		}
+
+		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
+			config.LoadOptionDebug(appConfig.Debug),
+			config.LoadOptionThreads(appConfig.Threads),
+			config.LoadOptionContextSize(appConfig.ContextSize),
+			config.LoadOptionF16(appConfig.F16),
+		)
+		if err != nil {
+			// cfg is unusable after a failed load; dereferencing it below could panic.
+			return err
+		}
+
+		// An explicit model ID in the request wins over the configured model.
+		modelFile = cfg.Model
+		if input.ModelID != "" {
+			modelFile = input.ModelID
+		}
+		log.Debug().Msgf("Request for model: %s", modelFile)
+
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
+		if err != nil {
+			return err
+		}
+		return c.Download(filePath)
+	}
+}
--- a/core/http/endpoints/localai/backend_monitor.go
+++ b/core/http/endpoints/localai/backend_monitor.go
@@ -0,0 +1,36 @@
+package localai
+
+import (
+	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/go-skynet/LocalAI/core/services"
+	"github.com/gofiber/fiber/v2"
+)
+
+// BackendMonitorEndpoint returns a handler that parses a BackendMonitorRequest
+// from the body and replies with bm.CheckAndSample's result for its model as JSON.
+func BackendMonitorEndpoint(bm services.BackendMonitor) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var request schema.BackendMonitorRequest
+		if parseErr := c.BodyParser(&request); parseErr != nil {
+			return parseErr
+		}
+
+		sample, sampleErr := bm.CheckAndSample(request.Model)
+		if sampleErr != nil {
+			return sampleErr
+		}
+		return c.JSON(sample)
+	}
+}
+
+// BackendShutdownEndpoint returns a handler that parses a BackendMonitorRequest
+// from the body and forwards its model name to bm.ShutdownModel.
+func BackendShutdownEndpoint(bm services.BackendMonitor) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var request schema.BackendMonitorRequest
+		if parseErr := c.BodyParser(&request); parseErr != nil {
+			return parseErr
+		}
+		return bm.ShutdownModel(request.Model)
+	}
+}
--- a/core/http/endpoints/localai/gallery.go
+++ b/core/http/endpoints/localai/gallery.go
@@ -0,0 +1,146 @@
+package localai
+
+import (
+	"encoding/json"
+	"fmt"
+	"slices"
+
+	"github.com/go-skynet/LocalAI/core/services"
+	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
+	"github.com/rs/zerolog/log"
+)
+
+type ModelGalleryEndpointService struct { // state shared by the model-gallery HTTP handlers
+	galleries      []gallery.Gallery        // registered galleries; modified by the add/remove endpoints
+	modelPath      string                   // passed to gallery.AvailableGalleryModels when listing models
+	galleryApplier *services.GalleryService // receives GalleryOp values on C and is queried for job status
+}
+
+type GalleryModel struct { // JSON body parsed by ApplyModelGalleryEndpoint
+	ID string `json:"id"` // copied into GalleryOp.GalleryName
+	gallery.GalleryModel // embedded; copied into GalleryOp.Req
+}
+
+// CreateModelGalleryEndpointService bundles the gallery list, the model path and
+// the gallery service into the value the gallery endpoints hang off.
+func CreateModelGalleryEndpointService(galleries []gallery.Gallery, modelPath string, galleryApplier *services.GalleryService) ModelGalleryEndpointService {
+	svc := ModelGalleryEndpointService{galleries: galleries, modelPath: modelPath}
+	svc.galleryApplier = galleryApplier
+	return svc
+}
+
+// GetOpStatusEndpoint serves, as JSON, the status of the job named by the "uuid" route parameter.
+func (mgs *ModelGalleryEndpointService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		if st := mgs.galleryApplier.GetStatus(c.Params("uuid")); st != nil {
+			return c.JSON(st)
+		}
+		return fmt.Errorf("could not find any status for ID")
+	}
+}
+
+// GetAllStatusEndpoint serves the result of galleryApplier.GetAllStatus as JSON.
+func (mgs *ModelGalleryEndpointService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
+	handler := func(c *fiber.Ctx) error { return c.JSON(mgs.galleryApplier.GetAllStatus()) }
+	return handler
+}
+
+// ApplyModelGalleryEndpoint returns a handler that sends a gallery.GalleryOp for the
+// requested model to the gallery service and replies with the job UUID and status URL.
+func (mgs *ModelGalleryEndpointService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var request GalleryModel
+		if parseErr := c.BodyParser(&request); parseErr != nil {
+			return parseErr
+		}
+
+		generated, genErr := uuid.NewUUID()
+		if genErr != nil {
+			return genErr
+		}
+		jobID := generated.String()
+		// Hand the operation over to the gallery service through its channel.
+		op := gallery.GalleryOp{Id: jobID, GalleryName: request.ID, Galleries: mgs.galleries}
+		op.Req = request.GalleryModel
+		mgs.galleryApplier.C <- op
+		return c.JSON(struct {
+			ID        string `json:"uuid"`
+			StatusURL string `json:"status"`
+		}{ID: jobID, StatusURL: c.BaseURL() + "/models/jobs/" + jobID})
+	}
+}
+
+// ListModelFromGalleryEndpoint sends, JSON-encoded, the models that gallery.AvailableGalleryModels reports.
+func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
+		available, listErr := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
+		if listErr != nil {
+			return listErr
+		}
+		log.Debug().Msgf("Models found from galleries: %+v", available)
+		for _, entry := range available {
+			log.Debug().Msgf("Model found from galleries: %+v", entry)
+		}
+		encoded, encodeErr := json.Marshal(available)
+		if encodeErr != nil {
+			return encodeErr
+		}
+		return c.Send(encoded)
+	}
+}
+
+// ListModelGalleriesEndpoint sends the registered galleries as JSON; the models they contain are not listed.
+func (mgs *ModelGalleryEndpointService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
+		encoded, encodeErr := json.Marshal(mgs.galleries)
+		if encodeErr != nil {
+			return encodeErr
+		}
+		return c.Send(encoded)
+	}
+}
+
+// AddModelGalleryEndpoint registers the gallery from the request body and replies
+// with the updated gallery list as JSON; an already-registered name is an error.
+func (mgs *ModelGalleryEndpointService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(gallery.Gallery)
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+		sameName := func(g gallery.Gallery) bool { return g.Name == input.Name }
+		if slices.ContainsFunc(mgs.galleries, sameName) {
+			return fmt.Errorf("%s already exists", input.Name)
+		}
+		log.Debug().Msgf("Adding %+v to gallery list", *input)
+		mgs.galleries = append(mgs.galleries, *input) // append before encoding so the reply includes the new gallery
+		dat, err := json.Marshal(mgs.galleries)
+		if err != nil {
+			return err
+		}
+		return c.Send(dat)
+	}
+}
+
+// RemoveModelGalleryEndpoint returns a handler that unregisters every gallery
+// whose name matches the one in the request body and replies with an empty body.
+// A name that is not registered yields an error.
+func (mgs *ModelGalleryEndpointService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var target gallery.Gallery
+		if parseErr := c.BodyParser(&target); parseErr != nil {
+			return parseErr
+		}
+		// One predicate serves both the existence check and the deletion.
+		matches := func(candidate gallery.Gallery) bool { return candidate.Name == target.Name }
+		if !slices.ContainsFunc(mgs.galleries, matches) {
+			return fmt.Errorf("%s is not currently registered", target.Name)
+		}
+		mgs.galleries = slices.DeleteFunc(mgs.galleries, matches)
+		return c.Send(nil)
+	}
+}
--- a/core/http/endpoints/localai/metrics.go
+++ b/core/http/endpoints/localai/metrics.go
@@ -0,0 +1,43 @@
+package localai
+
+import (
+	"time"
+
+	"github.com/go-skynet/LocalAI/core/services"
+	"github.com/gofiber/fiber/v2"
+	"github.com/gofiber/fiber/v2/middleware/adaptor"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+)
+
+// LocalAIMetricsEndpoint adapts the Prometheus promhttp handler into a fiber handler.
+func LocalAIMetricsEndpoint() fiber.Handler {
+	return adaptor.HTTPHandler(promhttp.Handler())
+}
+
+type apiMiddlewareConfig struct { // settings used by LocalAIMetricsAPIMiddleware
+	Filter         func(c *fiber.Ctx) bool         // when it returns true the request is passed on unmeasured
+	metricsService *services.LocalAIMetricsService // receives ObserveAPICall for each measured request
+}
+
+// LocalAIMetricsAPIMiddleware returns a middleware that times each request and
+// reports method, path and elapsed seconds to metrics.ObserveAPICall.
+// Requests whose path is "/metrics" are passed through without being measured.
+func LocalAIMetricsAPIMiddleware(metrics *services.LocalAIMetricsService) fiber.Handler {
+	skipMetricsPath := func(c *fiber.Ctx) bool { return c.Path() == "/metrics" }
+	settings := apiMiddlewareConfig{Filter: skipMetricsPath, metricsService: metrics}
+
+	return func(c *fiber.Ctx) error {
+		if settings.Filter != nil && settings.Filter(c) {
+			return c.Next()
+		}
+
+		// Read path and method before the downstream handlers run.
+		requestPath, requestMethod := c.Path(), c.Method()
+		began := time.Now()
+		nextErr := c.Next()
+
+		seconds := float64(time.Since(began)) / float64(time.Second)
+		settings.metricsService.ObserveAPICall(requestMethod, requestPath, seconds)
+		return nextErr
+	}
+}
--- a/core/http/endpoints/localai/stores.go
+++ b/core/http/endpoints/localai/stores.go
@@ -0,0 +1,121 @@
+package localai
+
+import (
+	"github.com/go-skynet/LocalAI/core/backend"
+	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/store"
+	"github.com/gofiber/fiber/v2"
+)
+
+// StoresSetEndpoint returns a handler that writes the key/value pairs from the
+// request body into the store backend named by the request's Store field.
+// String values are converted to byte slices before being handed to store.SetCols.
+func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var request schema.StoresSet
+		if parseErr := c.BodyParser(&request); parseErr != nil {
+			return parseErr
+		}
+
+		storeBackend, backendErr := backend.StoreBackend(sl, appConfig, request.Store)
+		if backendErr != nil {
+			return backendErr
+		}
+
+		encoded := make([][]byte, 0, len(request.Values))
+		for _, text := range request.Values {
+			encoded = append(encoded, []byte(text))
+		}
+
+		if setErr := store.SetCols(c.Context(), storeBackend, request.Keys, encoded); setErr != nil {
+			return setErr
+		}
+		return c.Send(nil)
+	}
+}
+
+// StoresDeleteEndpoint returns a handler that removes the keys listed in the
+// request body from the store backend named by the request's Store field.
+func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var request schema.StoresDelete
+		if parseErr := c.BodyParser(&request); parseErr != nil {
+			return parseErr
+		}
+
+		storeBackend, backendErr := backend.StoreBackend(sl, appConfig, request.Store)
+		if backendErr != nil {
+			return backendErr
+		}
+		deleteErr := store.DeleteCols(c.Context(), storeBackend, request.Keys)
+		if deleteErr != nil {
+			return deleteErr
+		}
+		return c.Send(nil)
+	}
+}
+
+// StoresGetEndpoint returns a handler that looks up the keys from the request
+// body in the store backend named by the request's Store field and replies with
+// the keys and values returned by store.GetCols, values converted to strings.
+func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var request schema.StoresGet
+		if parseErr := c.BodyParser(&request); parseErr != nil {
+			return parseErr
+		}
+
+		storeBackend, backendErr := backend.StoreBackend(sl, appConfig, request.Store)
+		if backendErr != nil {
+			return backendErr
+		}
+
+		foundKeys, rawValues, getErr := store.GetCols(c.Context(), storeBackend, request.Keys)
+		if getErr != nil {
+			return getErr
+		}
+
+		// make keeps the slice non-nil even when no values were returned.
+		texts := make([]string, 0, len(rawValues))
+		for _, raw := range rawValues {
+			texts = append(texts, string(raw))
+		}
+
+		response := schema.StoresGetResponse{Keys: foundKeys, Values: texts}
+		return c.JSON(response)
+	}
+}
+
+// StoresFindEndpoint returns a handler that runs store.Find for the request's
+// Key and Topk against the store backend named by the request's Store field and
+// replies with the returned keys, values (as strings) and similarities.
+func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var request schema.StoresFind
+		if parseErr := c.BodyParser(&request); parseErr != nil {
+			return parseErr
+		}
+
+		storeBackend, backendErr := backend.StoreBackend(sl, appConfig, request.Store)
+		if backendErr != nil {
+			return backendErr
+		}
+
+		foundKeys, rawValues, scores, findErr := store.Find(c.Context(), storeBackend, request.Key, request.Topk)
+		if findErr != nil {
+			return findErr
+		}
+
+		// Convert the raw byte values to strings for the JSON reply; make keeps
+		// the slice non-nil even when nothing was found.
+		texts := make([]string, 0, len(rawValues))
+		for _, raw := range rawValues {
+			texts = append(texts, string(raw))
+		}
+
+		response := schema.StoresFindResponse{Keys: foundKeys, Values: texts, Similarities: scores}
+		return c.JSON(response)
+	}
+}
--- a/core/http/endpoints/localai/tts.go
+++ b/core/http/endpoints/localai/tts.go
@@ -1,37 +1,39 @@
 package localai

 import (
-	fiberContext "github.com/go-skynet/LocalAI/api/ctx"
 	"github.com/go-skynet/LocalAI/core/backend"
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/rs/zerolog/log"
+	"github.com/go-skynet/LocalAI/core/config"
+	fiberContext "github.com/go-skynet/LocalAI/core/http/ctx"
+	"github.com/go-skynet/LocalAI/pkg/model"

-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/schema"
 	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
 )

-type TTSRequest struct {
-	Model   string `json:"model" yaml:"model"`
-	Input   string `json:"input" yaml:"input"`
-	Backend string `json:"backend" yaml:"backend"`
-}
-
-func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {

-		input := new(TTSRequest)
+		input := new(schema.TTSRequest)

 		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}

-		modelFile, err := fiberContext.ModelFromContext(c, o.Loader, input.Model, false)
+		modelFile, err := fiberContext.ModelFromContext(c, ml, input.Model, false)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
-		cfg, err := config.Load(modelFile, o.Loader.ModelPath, cm, false, 0, 0, false)
+
+		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
+			config.LoadOptionDebug(appConfig.Debug),
+			config.LoadOptionThreads(appConfig.Threads),
+			config.LoadOptionContextSize(appConfig.ContextSize),
+			config.LoadOptionF16(appConfig.F16),
+		)
+
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
@@ -44,7 +46,7 @@ func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			cfg.Backend = input.Backend
 		}

-		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, o.Loader, o, *cfg)
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -9,8 +9,7 @@ import (
 	"time"

 	"github.com/go-skynet/LocalAI/core/backend"
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/core/schema"
 	"github.com/go-skynet/LocalAI/pkg/grammar"
 	model "github.com/go-skynet/LocalAI/pkg/model"
@@ -21,12 +20,12 @@ import (
 	"github.com/valyala/fasthttp"
 )

-func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	emptyMessage := ""
 	id := uuid.New().String()
 	created := int(time.Now().Unix())

-	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+	process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		initialMessage := schema.OpenAIResponse{
 			ID:      id,
 			Created: created,
@@ -36,7 +35,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		}
 		responses <- initialMessage

-		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+		ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			resp := schema.OpenAIResponse{
 				ID:      id,
 				Created: created,
@@ -55,9 +54,9 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		})
 		close(responses)
 	}
-	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		result := ""
-		_, tokenUsage, _ := ComputeChoices(req, prompt, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+		_, tokenUsage, _ := ComputeChoices(req, prompt, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			result += s
 			// TODO: Change generated BNF grammar to be compliant with the schema so we can
 			// stream the result token by token here.
@@ -78,7 +77,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			}
 			responses <- initialMessage

-			result, err := handleQuestion(config, req, o, results[0].arguments, prompt)
+			result, err := handleQuestion(config, req, ml, startupOptions, results[0].arguments, prompt)
 			if err != nil {
 				log.Error().Msgf("error handling question: %s", err.Error())
 				return
@@ -154,12 +153,12 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 	return func(c *fiber.Ctx) error {
 		processFunctions := false
 		funcs := grammar.Functions{}
-		modelFile, input, err := readRequest(c, o, true)
+		modelFile, input, err := readRequest(c, ml, startupOptions, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}

-		config, input, err := mergeRequestWithConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
+		config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, startupOptions.Debug, startupOptions.Threads, startupOptions.ContextSize, startupOptions.F16)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -249,10 +248,13 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					Role:         r,
 					RoleName:     role,
 					Content:      i.StringContent,
+					FunctionCall: i.FunctionCall,
 					FunctionName: i.Name,
+					LastMessage:  messageIndex == (len(input.Messages) - 1),
+					Function:     config.Grammar != "" && (messageIndex == (len(input.Messages) - 1)),
 					MessageIndex: messageIndex,
 				}
-				templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
+				templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
 				if err != nil {
 					log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, config.TemplateConfig.ChatMessage, err)
 				} else {
@@ -320,7 +322,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		templateFile := ""

 		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
+		if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
 			templateFile = config.Model
 		}

@@ -333,7 +335,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		}

 		if templateFile != "" {
-			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
+			templatedInput, err := ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
 				SystemPrompt:         config.SystemPrompt,
 				SuppressSystemPrompt: suppressConfigSystemPrompt,
 				Input:                predInput,
@@ -357,9 +359,9 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			responses := make(chan schema.OpenAIResponse)

 			if !processFunctions {
-				go process(predInput, input, config, o.Loader, responses)
+				go process(predInput, input, config, ml, responses)
 			} else {
-				go processTools(noActionName, predInput, input, config, o.Loader, responses)
+				go processTools(noActionName, predInput, input, config, ml, responses)
 			}

 			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
@@ -413,7 +415,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		// no streaming mode
 		default:
-			result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+			result, tokenUsage, err := ComputeChoices(input, predInput, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
 				if !processFunctions {
 					// no function is called, just reply and use stop as finish reason
 					*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
@@ -425,7 +427,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 				switch {
 				case noActionsToRun:
-					result, err := handleQuestion(config, input, o, results[0].arguments, predInput)
+					result, err := handleQuestion(config, input, ml, startupOptions, results[0].arguments, predInput)
 					if err != nil {
 						log.Error().Msgf("error handling question: %s", err.Error())
 						return
@@ -506,7 +508,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 	}
 }

-func handleQuestion(config *config.Config, input *schema.OpenAIRequest, o *options.Option, args, prompt string) (string, error) {
+func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, args, prompt string) (string, error) {
 	log.Debug().Msgf("nothing to do, computing a reply")

 	// If there is a message that the LLM already sends as part of the JSON reply, use it
@@ -535,7 +537,7 @@ func handleQuestion(config *config.Config, input *schema.OpenAIRequest, o *optio
 		images = append(images, m.StringImages...)
 	}

-	predFunc, err := backend.ModelInference(input.Context, prompt, images, o.Loader, *config, o, nil)
+	predFunc, err := backend.ModelInference(input.Context, prompt, images, ml, *config, o, nil)
 	if err != nil {
 		log.Error().Msgf("inference error: %s", err.Error())
 		return "", err
@@ -565,10 +567,20 @@ func parseFunctionCall(llmresult string, multipleResults bool) []funcCallResults
 		log.Debug().Msgf("Function return: %s %+v", s, ss)

 		for _, s := range ss {
-			func_name := s["function"]
-			args := s["arguments"]
+			func_name, ok := s["function"]
+			if !ok {
+				continue
+			}
+			args, ok := s["arguments"]
+			if !ok {
+				continue
+			}
 			d, _ := json.Marshal(args)
-			results = append(results, funcCallResults{name: func_name.(string), arguments: string(d)})
+			funcName, ok := func_name.(string)
+			if !ok {
+				continue
+			}
+			results = append(results, funcCallResults{name: funcName, arguments: string(d)})
 		}
 	} else {
 		// As we have to change the result before processing, we can't stream the answer token-by-token (yet?)
@@ -579,12 +591,21 @@ func parseFunctionCall(llmresult string, multipleResults bool) []funcCallResults
 		log.Debug().Msgf("Function return: %s %+v", s, ss)

 		// The grammar defines the function name as "function", while OpenAI returns "name"
-		func_name := ss["function"]
+		func_name, ok := ss["function"]
+		if !ok {
+			return results
+		}
 		// Similarly, while arguments here is a map[string]interface{}, OpenAI actually wants a stringified object
-		args := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
+		args, ok := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
+		if !ok {
+			return results
+		}
 		d, _ := json.Marshal(args)
-
-		results = append(results, funcCallResults{name: func_name.(string), arguments: string(d)})
+		funcName, ok := func_name.(string)
+		if !ok {
+			return results
+		}
+		results = append(results, funcCallResults{name: funcName, arguments: string(d)})
 	}

 	return results
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -9,8 +9,8 @@ import (
 	"time"

 	"github.com/go-skynet/LocalAI/core/backend"
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/config"
+
 	"github.com/go-skynet/LocalAI/core/schema"
 	"github.com/go-skynet/LocalAI/pkg/grammar"
 	model "github.com/go-skynet/LocalAI/pkg/model"
@@ -21,12 +21,12 @@ import (
 )

 // https://platform.openai.com/docs/api-reference/completions
-func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	id := uuid.New().String()
 	created := int(time.Now().Unix())

-	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
-		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+	process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+		ComputeChoices(req, s, config, appConfig, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			resp := schema.OpenAIResponse{
 				ID:      id,
 				Created: created,
@@ -53,14 +53,14 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 	}

 	return func(c *fiber.Ctx) error {
-		modelFile, input, err := readRequest(c, o, true)
+		modelFile, input, err := readRequest(c, ml, appConfig, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}

 		log.Debug().Msgf("`input`: %+v", input)

-		config, input, err := mergeRequestWithConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
+		config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -84,7 +84,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		templateFile := ""

 		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
+		if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
 			templateFile = config.Model
 		}

@@ -100,7 +100,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			predInput := config.PromptStrings[0]

 			if templateFile != "" {
-				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
+				templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
 					Input: predInput,
 				})
 				if err == nil {
@@ -111,7 +111,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe

 			responses := make(chan schema.OpenAIResponse)

-			go process(predInput, input, config, o.Loader, responses)
+			go process(predInput, input, config, ml, responses)

 			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {

@@ -153,7 +153,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		for k, i := range config.PromptStrings {
 			if templateFile != "" {
 				// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
+				templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
 					SystemPrompt: config.SystemPrompt,
 					Input:        i,
 				})
@@ -164,7 +164,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			}

 			r, tokenUsage, err := ComputeChoices(
-				input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+				input, i, config, appConfig, ml, func(s string, c *[]schema.Choice) {
 					*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
 				}, nil)
 			if err != nil {
--- a/core/http/endpoints/openai/edit.go
+++ b/core/http/endpoints/openai/edit.go
@@ -6,8 +6,8 @@ import (
 	"time"

 	"github.com/go-skynet/LocalAI/core/backend"
-	config "github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/options"
+	"github.com/go-skynet/LocalAI/core/config"
+
 	"github.com/go-skynet/LocalAI/core/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
@@ -16,14 +16,14 @@ import (
 	"github.com/rs/zerolog/log"
 )

-func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		modelFile, input, err := readRequest(c, o, true)
+		modelFile, input, err := readRequest(c, ml, appConfig, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}

-		config, input, err := mergeRequestWithConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
+		config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -33,7 +33,7 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		templateFile := ""

 		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		if o.Loader.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
+		if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) {
 			templateFile = config.Model
 		}

@@ -46,7 +46,7 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		for _, i := range config.InputStrings {
 			if templateFile != "" {
-				templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
+				templatedInput, err := ml.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
 					Input:        i,
 					Instruction:  input.Instruction,
 					SystemPrompt: config.SystemPrompt,
@@ -57,7 +57,7 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 				}
 			}

-			r, tokenUsage, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+			r, tokenUsage, err := ComputeChoices(input, i, config, appConfig, ml, func(s string, c *[]schema.Choice) {
 				*c = append(*c, schema.Choice{Text: s})
 			}, nil)
 			if err != nil {
--- a/Show More
+++ b/Show More