Compare commits


108 Commits

Author SHA1 Message Date
Sebastian.W
0004ec8be3 fix(autogptq): do not use_triton with qwen-vl (#1985)
* Enhance autogptq backend to support VL models

* update dependencies for autogptq

* remove redundant auto-gptq dependency

* Convert base64 to image_url for Qwen-VL model

* implemented model inference for qwen-vl

* remove user prompt from generated answer

* fixed write image error

* fixed use_triton issue when loading Qwen-VL model

---------

Co-authored-by: Binghua Wu <bingwu@estee.com>
2024-04-11 12:33:58 +02:00
Ettore Di Giacinto
d692b2c32a ci: push latest images for dockerhub (#1984)
Fixes: #1983

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-10 10:31:59 +02:00
LocalAI [bot]
7e2f8bb408 ⬆️ Update ggerganov/whisper.cpp (#1980)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-10 09:08:00 +02:00
LocalAI [bot]
951e39d36c ⬆️ Update ggerganov/llama.cpp (#1979)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-10 09:07:41 +02:00
LocalAI [bot]
aeb3f835ae ⬆️ Update docs version mudler/LocalAI (#1978)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-10 09:07:21 +02:00
Ettore Di Giacinto
cc3d601836 ci: fixup latest image push
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-09 09:49:11 +02:00
Ettore Di Giacinto
2bbb221fb1 tests(petals): temp disable 2024-04-08 21:28:59 +00:00
LocalAI [bot]
195be10050 ⬆️ Update ggerganov/llama.cpp (#1973)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-08 23:26:52 +02:00
fakezeta
a38618db02 fix regression #1971 (#1972)
fixes regression #1971 introduced by intel_extension_for_transformers==1.4
2024-04-08 22:33:51 +02:00
LocalAI [bot]
efcca15d3f ⬆️ Update ggerganov/llama.cpp (#1970)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-08 08:38:47 +02:00
LocalAI [bot]
a153b628c2 ⬆️ Update ggerganov/whisper.cpp (#1969)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-08 08:38:17 +02:00
Ettore Di Giacinto
f36d86ba6d fix(hermes-2-pro-mistral): correct dashes in template to suppress newlines (#1966)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-07 18:23:47 +02:00
Ettore Di Giacinto
74492a81c7 doc(quickstart): fix typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-04-07 11:06:35 +02:00
LocalAI [bot]
ed13782986 ⬆️ Update ggerganov/llama.cpp (#1964)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-07 10:32:10 +02:00
Ettore Di Giacinto
8342553214 fix(llama.cpp): set better defaults for llama.cpp (#1961)
fix(defaults): set better defaults for llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-06 22:56:45 +02:00
LocalAI [bot]
8aa5f5a660 ⬆️ Update ggerganov/llama.cpp (#1960)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-06 19:15:25 +00:00
LocalAI [bot]
b2d9e3f704 ⬆️ Update ggerganov/llama.cpp (#1959)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-05 08:41:55 +02:00
LocalAI [bot]
f744e1f931 ⬆️ Update ggerganov/whisper.cpp (#1958)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-05 08:41:35 +02:00
cryptk
b85dad0286 feat: first pass at improving logging (#1956)
Signed-off-by: Chris Jowett <421501+cryptk@users.noreply.github.com>
2024-04-04 09:24:22 +02:00
LocalAI [bot]
3851b51d98 ⬆️ Update ggerganov/llama.cpp (#1953)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-04 00:27:57 +02:00
Ettore Di Giacinto
ff77d3bc22 fix(seed): generate random seed per-request if -1 is set (#1952)
* fix(seed): generate random seed per-request if -1 is set

Also update ci with new workflows and allow the aio tests to run with an
api key

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(openvino): Add OpenVINO example

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-03 22:25:47 +02:00
Ettore Di Giacinto
93cfec3c32 ci: correctly tag latest and aio images 2024-04-03 11:30:23 +02:00
Ettore Di Giacinto
89560ef87f fix(ci): manually tag latest images (#1948)
fix(ci): manually tag images

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-02 19:25:46 +02:00
Ettore Di Giacinto
9bc209ba73 fix(welcome): stable model list (#1949) 2024-04-02 19:25:32 +02:00
Ettore Di Giacinto
84e0dc3246 fix(hermes-2-pro-mistral): correct stopwords (#1947)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-02 15:38:00 +02:00
LocalAI [bot]
4d4d76114d ⬆️ Update ggerganov/llama.cpp (#1941)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-02 09:16:04 +02:00
cryptk
86bc5f1350 fix: use exec in entrypoint scripts to fix signal handling (#1943) 2024-04-02 09:15:44 +02:00
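
As a rough illustration of the pattern this fix refers to (not the actual entrypoint script): using exec makes the long-running process replace the shell, so signals from "docker stop" reach it directly instead of stopping at the wrapper shell.

#!/bin/bash
# Hypothetical entrypoint sketch: without exec, bash stays as PID 1 and the
# SIGTERM sent by the container runtime may never reach the server process.
set -e
echo "starting server..."
exec /usr/local/bin/server "$@"  # exec replaces the shell with the server
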
Ettore Di Giacinto
e8f02c083f fix(functions): respect when selected from string (#1940)
* fix(functions): respect when selected from string

* fix(toolschoice): decode both string and objects
2024-04-01 19:39:54 +02:00
Ettore Di Giacinto
ebb1fcedea fix(hermes-2-pro-mistral): add stopword for toolcall (#1939)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-04-01 11:48:35 +02:00
LocalAI [bot]
66f90f8dc1 ⬆️ Update ggerganov/llama.cpp (#1937)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-04-01 08:59:23 +02:00
Ettore Di Giacinto
3c778b538a Update phi-2-orange.yaml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-31 13:06:41 +02:00
Ettore Di Giacinto
35290e146b fix(grammar): respect JSONmode and grammar from user input (#1935)
* fix(grammar): Fix JSON mode and custom grammar

* tests(aio): add jsonmode test

* tests(aio): add functioncall test

* fix(aio): use hermes-2-pro-mistral as llm for CPU profile

* add phi-2-orange
2024-03-31 13:04:09 +02:00
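
As a rough illustration of the JSON-mode path exercised by these tests, assuming the OpenAI-compatible response_format field on the chat endpoint (endpoint, port, and model name are placeholders):

# Hypothetical request: the server should apply a JSON grammar so the
# reply is valid JSON rather than free-form text.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "hermes-2-pro-mistral",
    "messages": [{"role": "user", "content": "List three fruits as a JSON object"}],
    "response_format": {"type": "json_object"}
  }'
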
LocalAI [bot]
784657a652 ⬆️ Update ggerganov/llama.cpp (#1934)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-31 00:27:38 +01:00
LocalAI [bot]
831efa8893 ⬆️ Update ggerganov/whisper.cpp (#1933)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-31 00:27:16 +01:00
Ettore Di Giacinto
957f428fd5 fix(tools): correctly render tools response in templates (#1932)
* fix(tools): allow to correctly display both Functions and Tools

* models(hermes-2-pro): correctly display function results
2024-03-30 19:02:07 +01:00
Ettore Di Giacinto
61e5e6bc36 fix(swagger): do not specify a host (#1930)
This way, requests are directed to the host that the client used to
perform the request.
2024-03-30 12:04:41 +01:00
Ettore Di Giacinto
eab4a91a9b fix(aio): correctly detect intel systems (#1931)
Also rename SIZE to PROFILE
2024-03-30 12:04:32 +01:00
LocalAI [bot]
2bba62ca4d ⬆️ Update ggerganov/llama.cpp (#1928)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-29 22:52:01 +00:00
Ettore Di Giacinto
bcdc83b46d Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-29 23:00:06 +01:00
Ettore Di Giacinto
92fbdfd06f feat(swagger): update (#1929) 2024-03-29 22:48:58 +01:00
cryptk
93702e39d4 feat(build): adjust number of parallel make jobs (#1915)
* feat(build): adjust number of parallel make jobs

* fix: update make on MacOS from brew to support --output-sync argument

* fix: cache grpc with version as part of key to improve validity of cache hits

* fix: use gmake for tests-apple to use the updated GNU make version

* fix: actually use the new make version for tests-apple

* feat: parallelize tests-extra

* feat: attempt to cache grpc build for docker images

* fix: don't quote GRPC version

* fix: don't cache go modules, we have limited cache space, better used elsewhere

* fix: release with the same version of go that we test with

* fix: don't fail on exporting cache layers

* fix: remove deprecated BUILD_GRPC docker arg from Makefile
2024-03-29 22:32:40 +01:00
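
The flags referenced in this change are standard GNU make options; a rough local equivalent of what CI now does (build target and job count shown for illustration) is:

# --output-sync=target groups the output of each target, keeping parallel
# build logs readable; it requires GNU make 4.x, hence the brew-installed
# make used for the macOS jobs.
make --jobs=$(nproc) --output-sync=target build
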
LocalAI [bot]
a7fc89c207 ⬆️ Update ggerganov/whisper.cpp (#1927)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-29 22:29:50 +01:00
Ettore Di Giacinto
123a5a2e16 feat(swagger): Add swagger API doc (#1926)
* makefile(build): add minimal and api build target

* feat(swagger): Add swagger
2024-03-29 22:29:33 +01:00
LocalAI [bot]
ab2f403dd0 ⬆️ Update ggerganov/whisper.cpp (#1924)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-29 00:13:59 +01:00
LocalAI [bot]
b9c5e14e2c ⬆️ Update ggerganov/llama.cpp (#1923)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-29 00:13:38 +01:00
Ettore Di Giacinto
bf65ed6eb8 feat(webui): add partials, show backends associated to models (#1922)
* feat(webui): add partials, show backends associated to models

* fix(auth): put assistant and backend under auth
2024-03-28 21:52:52 +01:00
Ettore Di Giacinto
4e79294f97 Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-28 19:52:40 +01:00
Ettore Di Giacinto
8477e8fac3 Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-28 18:28:30 +01:00
Ettore Di Giacinto
13ccd2afef docs(aio-usage): update docs to show examples (#1921)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-03-28 18:16:58 +01:00
Ettore Di Giacinto
23b833d171 Update run-other-models.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-28 12:42:37 +01:00
LocalAI [bot]
07c49ee4b8 ⬆️ Update ggerganov/whisper.cpp (#1914)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-27 22:53:13 +00:00
LocalAI [bot]
07c4bdda7c ⬆️ Update ggerganov/llama.cpp (#1913)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-27 21:57:59 +00:00
Ettore Di Giacinto
2266d8263c Update README.md 2024-03-27 22:48:46 +01:00
Ettore Di Giacinto
160eb48b2b Update quickstart.md 2024-03-27 22:47:59 +01:00
cryptk
0c0efc871c fix(build): better CI logging and correct some build failure modes in Makefile (#1899)
* feat: group make output by target when running parallelized builds in CI

* fix: quote GO_TAGS in makefile to fix handling of whitespace in value

* fix: set CPATH to find opencv2 in its commonly installed location

* fix: add missing go mod dropreplace for go-llama.cpp

* chore: remove opencv symlink from github workflows
2024-03-27 21:12:19 +01:00
Gianluca Boiano
7ef5f3b473 ⬆️ Update M0Rf30/go-tiny-dream (#1911) 2024-03-27 21:12:04 +01:00
Ettore Di Giacinto
66ee4afb95 feat(welcome): add simple welcome page (#1912)
* feat(welcome): add simple welcome page

* feat(api): add 404 handling
2024-03-27 21:10:58 +01:00
Ettore Di Giacinto
93f0b7ae03 update hot topics
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-27 18:17:12 +01:00
fakezeta
8210ffcb6c feat: Token Stream support for Transformer, fix: missing package for OpenVINO (#1908)
* Streaming working

* Small fix for regression on CUDA and XPU

* use pip version of optimum[openvino]

* Update backend/python/transformers/transformers_server.py

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

* Token streaming support

fix optimum[openvino] package in install.sh

* Token Streaming support

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-27 17:50:35 +01:00
fakezeta
e7cbe32601 feat: Openvino runtime for transformer backend and streaming support for Openvino and CUDA (#1892)
* fixes #1775 and #1774

Adds BitsAndBytes quantization and fixes embeddings on CUDA devices

* Manage 4-bit and 8-bit quantization

Manage different BitsAndBytes options with the quantization: parameter in YAML

* fix compilation errors on non-CUDA environments

* OpenVINO draft

First draft of OpenVINO integration in transformer backend

* first working implementation

* Streaming working

* Small fix for regression on CUDA and XPU

* use pip version of optimum[openvino]

* Update backend/python/transformers/transformers_server.py

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-26 23:31:43 +00:00
LocalAI [bot]
b500ceaf73 ⬆️ Update ggerganov/llama.cpp (#1904)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-26 23:21:54 +00:00
LocalAI [bot]
d3c283ac19 ⬆️ Update docs version mudler/LocalAI (#1903)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-26 22:56:42 +01:00
Ettore Di Giacinto
607586e0b7 fix: downgrade torch (#1902)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-26 22:56:02 +01:00
Steven Christou
2d7913b3be feat(assistant): Assistant and AssistantFiles api (#1803)
* Initial implementation of assistants api

* Move load/save configs to utils

* Save assistant and assistantfiles config to disk.

* Add tests for assistant api

* Fix models path spelling mistake.

* Remove personal go.mod information

---------

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-26 18:54:35 +01:00
Sebastian.W
b7ffe66219 Enhance autogptq backend to support VL models (#1860)
* Enhance autogptq backend to support VL models

* update dependencies for autogptq

* remove redundant auto-gptq dependency

* Convert base64 to image_url for Qwen-VL model

* implemented model inference for qwen-vl

* remove user prompt from generated answer

* fixed write image error

---------

Co-authored-by: Binghua Wu <bingwu@estee.com>
2024-03-26 18:48:14 +01:00
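
As a sketch of the request shape this backend change targets, assuming the OpenAI-style vision content format (endpoint, model name, and image URL are placeholders):

# Hypothetical multimodal request: image_url (or base64) content is
# converted into an image the Qwen-VL model can consume.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen-vl",
    "messages": [{
      "role": "user",
      "content": [
        {"type": "text", "text": "What is in this picture?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}
      ]
    }]
  }'
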
Ettore Di Giacinto
e58410fa99 feat(aio): add intel profile (#1901)
* feat(aio): add intel profile

* docs: clarify AIO images features
2024-03-26 18:45:25 +01:00
LocalAI [bot]
1395e505cd ⬆️ Update ggerganov/llama.cpp (#1897)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-26 00:34:10 +01:00
LocalAI [bot]
42a4c86dca ⬆️ Update ggerganov/whisper.cpp (#1896)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-26 00:33:46 +01:00
Ettore Di Giacinto
c9adc5680c fix(aio): make image-gen for GPU functional, update docs (#1895)
* readme: update quickstart

* aio(gpu): fix dreamshaper

* tests(aio): allow to run tests also against an endpoint

* docs: split content

* tests: less verbosity

---------

Co-authored-by: Dave <dave@gray101.com>
2024-03-25 21:04:32 +00:00
Enrico Ros
08c7b17298 Fix NVIDIA VRAM detection on WSL2 environments (#1894)
* NVIDIA VRAM detection on WSL2 environments

More robust single-GPU NVIDIA memory detection, following yesterday's
improved NVIDIA WSL2 detection patch (#1891).

Tested and working on WSL2 and Linux.

Signed-off-by: Enrico Ros <enrico.ros@gmail.com>

* Update aio/entrypoint.sh

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Enrico Ros <enrico.ros@gmail.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-25 18:36:18 +01:00
Enrico Ros
5e12382524 NVIDIA GPU detection support for WSL2 environments (#1891)
This change assumes that "Microsoft Corporation Device 008e" is an NVIDIA
CUDA device. If this is not the case, please update the hardware detection
script accordingly.

Signed-off-by: Enrico Ros <enrico.ros@gmail.com>
Co-authored-by: Dave <dave@gray101.com>
2024-03-25 08:32:40 +01:00
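
A rough sketch of the kind of detection described above (not the actual aio/entrypoint.sh logic): under WSL2 the GPU tends to appear in lspci behind a Microsoft vendor string rather than as an NVIDIA device, while nvidia-smi still reports its memory.

# Illustrative only; assumes lspci and nvidia-smi are on the PATH.
if lspci | grep -qi "Microsoft Corporation Device 008e"; then
  echo "Assuming an NVIDIA CUDA device under WSL2"
  nvidia-smi --query-gpu=memory.total --format=csv,noheader
fi
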
Ettore Di Giacinto
6cf99527f8 docs(aio): Add All-in-One images docs (#1887)
* docs(aio): Add AIO images docs

* add image generation link to quickstart

* while reviewing I noticed this one link was missing, so quickly adding it.

Signed-off-by: Dave <dave@gray101.com>
Co-authored-by: Dave <dave@gray101.com>
2024-03-25 02:01:30 +00:00
LocalAI [bot]
3e293f1465 ⬆️ Update ggerganov/llama.cpp (#1889)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-24 21:12:18 +00:00
LocalAI [bot]
0106c58181 ⬆️ Update ggerganov/llama.cpp (#1885)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-24 14:54:01 +01:00
Ettore Di Giacinto
bd25d8049c fix(watchdog): use ShutdownModel instead of StopModel (#1882)
Fixes #1760
2024-03-23 16:19:57 +01:00
Ettore Di Giacinto
49cec7fd61 ci(aio): add latest tag images (#1884)
Tangentially also fixes #1868
2024-03-23 16:08:32 +01:00
Ettore Di Giacinto
d9456f2a23 ci(aio): publish hipblas and Intel GPU images (#1883)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-03-23 15:54:14 +01:00
Ettore Di Giacinto
8495750cb8 Update release.yml
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-23 15:22:26 +01:00
Ettore Di Giacinto
1f501cc1ef Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-23 10:42:14 +01:00
LocalAI [bot]
a922119c41 ⬆️ Update ggerganov/llama.cpp (#1881)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-23 09:23:28 +01:00
Richard Palethorpe
643d85d2cc feat(stores): Vector store backend (#1795)
Add simple vector store backend

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2024-03-22 21:14:04 +01:00
Ettore Di Giacinto
4b1ee0c170 feat(aio): add tests, update model definitions (#1880) 2024-03-22 21:13:11 +01:00
Ettore Di Giacinto
3bec467a91 feat(models): add phi-2-chat, llava-1.6, bakllava, cerbero (#1879) 2024-03-22 21:12:48 +01:00
Ettore Di Giacinto
600152df23 fix(config): pass by config options, respect defaults (#1878)
This bug had the unpleasant effect of ignoring defaults passed via the
CLI: for instance, threads could only be changed through the model config
file.
2024-03-22 20:55:11 +01:00
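
To illustrate the precedence being restored (flag name and YAML key shown for illustration; consult the docs for the exact option names):

# Hypothetical: a default set on the command line should now apply to all
# models, unless a model's own YAML config overrides it (e.g. threads: 4).
local-ai --threads 8
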
LocalAI [bot]
dd84c29a3d ⬆️ Update ggerganov/whisper.cpp (#1875)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-22 09:14:56 +01:00
LocalAI [bot]
07468c8786 ⬆️ Update ggerganov/llama.cpp (#1874)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-22 09:14:42 +01:00
Ettore Di Giacinto
418ba02025 ci: fix typo
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-22 09:14:17 +01:00
Ettore Di Giacinto
abc9360dc6 feat(aio): entrypoint, update workflows (#1872) 2024-03-21 22:09:04 +01:00
Sebastian
743095b7d8 docs(mac): improve documentation for mac build (#1873)
* docs(mac): Improve documentation for mac build

- added documentation to build from current master
- added troubleshooting information

Signed-off-by: Sebastian <tauven@gmail.com>

* docs(mac): fix typo

Signed-off-by: Sebastian <tauven@gmail.com>

---------

Signed-off-by: Sebastian <tauven@gmail.com>
2024-03-21 22:08:33 +01:00
Ettore Di Giacinto
3cf64d1e7e Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-21 08:57:41 +01:00
Ettore Di Giacinto
e533dcf506 feat(functions/aio): all-in-one images, function template enhancements (#1862)
* feat(startup): allow to specify models from local files

* feat(aio): add Dockerfile, make targets, aio profiles

* feat(template): add Function and LastMessage

* add hermes2-pro-mistral

* update hermes2 definition

* feat(template): add sprig

* feat(template): expose FunctionCall

* feat(aio): switch llm for text
2024-03-21 01:12:20 +01:00
LocalAI [bot]
eeaf8c7ccd ⬆️ Update ggerganov/whisper.cpp (#1867)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-20 22:26:29 +00:00
LocalAI [bot]
7e34dfdae7 ⬆️ Update ggerganov/llama.cpp (#1866)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-20 22:13:29 +00:00
LocalAI [bot]
e4bf51d5bd ⬆️ Update ggerganov/llama.cpp (#1864)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-20 09:05:53 +01:00
LocalAI [bot]
ead61bf9d5 ⬆️ Update ggerganov/llama.cpp (#1857)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-19 00:03:17 +00:00
LocalAI [bot]
b12a205320 ⬆️ Update docs version mudler/LocalAI (#1856)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-19 00:44:45 +01:00
LocalAI [bot]
621541a92f ⬆️ Update ggerganov/whisper.cpp (#1508)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-19 00:44:23 +01:00
Dave
ed5734ae25 test/fix: OSX Test Repair (#1843)
* test with gguf instead of ggml. Updates testPrompt to match? Adds debugging line to Dockerfile that I've found helpful recently.

* fix testPrompt slightly

* Sad Experiment: Test GH runner without metal?

* break apart CGO_LDFLAGS

* switch runner

* upstream llama.cpp disables Metal on Github CI!

* missed a dir from clean-tests

* CGO_LDFLAGS

* tmate failure + NO_ACCELERATE

* whisper.cpp has a metal fix

* do the exact opposite of the name of this branch, but keep it around for unrelated fixes?

* add back newlines

* add tmate to linux for testing

* update fixtures

* timeout for tmate
2024-03-18 19:19:43 +01:00
Ettore Di Giacinto
a046dcac5e fix(config-watcher): start only if config-directory exists (#1854)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-03-18 19:14:48 +01:00
Ettore Di Giacinto
843f93e1ab fix(config): default to debug=false if not set (#1853) 2024-03-18 18:59:39 +01:00
Ettore Di Giacinto
fa9e330fc6 fix(llama.cpp): fix eos without cache (#1852) 2024-03-18 18:59:24 +01:00
Ettore Di Giacinto
b202bfaaa0 deps(whisper.cpp): update, fix cublas build (#1846)
fix(whisper.cpp): Add stubs and -lcuda
2024-03-18 15:56:53 +01:00
LocalAI [bot]
0eb0ac7dd0 ⬆️ Update ggerganov/llama.cpp (#1848)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-18 08:57:58 +01:00
LocalAI [bot]
d2b83d8357 ⬆️ Update docs version mudler/LocalAI (#1847)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-17 23:08:32 +01:00
Ettore Di Giacinto
88b65f63d0 fix(go-llama): use llama-cpp as default (#1849)
* fix(go-llama): use llama-cpp as default

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

* fix(backends): drop obsoleted lines

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2024-03-17 23:08:22 +01:00
cryptk
020ce29cd8 fix(make): allow to parallelize jobs (#1845)
* fix: clean up Makefile dependencies to allow for parallel builds

* refactor: remove old unused backend from Makefile

* fix: finish removing legacy backend, update piper

* fix: I broke llama... I fixed llama

* feat: give the tests and builds a few threads

* fix: ensure libraries are replaced before build, add dropreplace target

* Fix image build workflows
2024-03-17 15:39:20 +01:00
Chakib Benziane
801b481beb fixes #1051: handle openai presence and request penalty parameters (#1817)
* fix request debugging, disable marshalling of context fields

Signed-off-by: blob42 <contact@blob42.xyz>

* merge frequency_penalty request parm with config

Signed-off-by: blob42 <contact@blob42.xyz>

* openai: add presence_penalty parameter

Signed-off-by: blob42 <contact@blob42.xyz>

---------

Signed-off-by: blob42 <contact@blob42.xyz>
2024-03-17 09:43:20 +01:00
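
A hedged example of the parameters this change honors, using the OpenAI-compatible request shape (endpoint and model name are placeholders):

# Hypothetical request: presence_penalty and frequency_penalty from the
# request body are now merged with the model configuration instead of
# being dropped.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "hermes-2-pro-mistral",
    "messages": [{"role": "user", "content": "Write a haiku about autumn"}],
    "presence_penalty": 0.5,
    "frequency_penalty": 0.3
  }'
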
LocalAI [bot]
8967ed1601 ⬆️ Update ggerganov/llama.cpp (#1840)
Signed-off-by: GitHub <noreply@github.com>
Co-authored-by: mudler <mudler@users.noreply.github.com>
2024-03-16 11:25:41 +00:00
133 changed files with 10042 additions and 1149 deletions


@@ -3,4 +3,4 @@ models
examples/chatbot-ui/models
examples/rwkv/models
examples/**/models
Dockerfile
Dockerfile*

.editorconfig (new file, 31 lines)

@@ -0,0 +1,31 @@
root = true
[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
[*.go]
indent_style = tab
[Makefile]
indent_style = tab
[*.proto]
indent_size = 2
[*.py]
indent_size = 4
[*.js]
indent_size = 2
[*.yaml]
indent_size = 2
[*.md]
trim_trailing_whitespace = false

.github/labeler.yml (new file, 19 lines)

@@ -0,0 +1,19 @@
enhancements:
- head-branch: ['^feature', 'feature']
kind/documentation:
- any:
- changed-files:
- any-glob-to-any-file: 'docs/*'
- changed-files:
- any-glob-to-any-file: '*.md'
examples:
- any:
- changed-files:
- any-glob-to-any-file: 'examples/*'
ci:
- any:
- changed-files:
- any-glob-to-any-file: '.github/*'

.github/release.yml (12 changes)

@@ -12,13 +12,23 @@ changelog:
- title: "Bug fixes :bug:"
labels:
- bug
- regression
- title: Exciting New Features 🎉
labels:
- Semver-Minor
- enhancement
- ux
- roadmap
- title: 🧠 Models
labels:
- area/ai-model
- title: 📖 Documentation and examples
labels:
- kind/documentation
- examples
- title: 👒 Dependencies
labels:
- dependencies
- title: Other Changes
labels:
- "*"
- "*"


@@ -22,6 +22,7 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -41,6 +42,7 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -51,6 +53,7 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -59,6 +62,7 @@ jobs:
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -67,6 +71,7 @@ jobs:
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
core-image-build:
uses: ./.github/workflows/image_build.yml
with:
@@ -80,6 +85,7 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -96,6 +102,7 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -104,6 +111,7 @@ jobs:
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -113,4 +121,5 @@ jobs:
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"


@@ -26,6 +26,10 @@ jobs:
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -47,14 +51,16 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -65,6 +71,7 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -75,26 +82,35 @@ jobs:
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-12"
latest-image: 'latest-gpu-nvidia-cuda-12'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
@@ -104,14 +120,19 @@ jobs:
image-type: 'extras'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -120,22 +141,31 @@ jobs:
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f16"
latest-image: 'latest-gpu-intel-f16'
latest-image-aio: 'latest-aio-gpu-intel-f16'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
tag-suffix: '-sycl-f32-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f32"
latest-image: 'latest-gpu-intel-f32'
latest-image-aio: 'latest-aio-gpu-intel-f32'
makeflags: "--jobs=3 --output-sync=target"
# Core images
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
@@ -145,6 +175,7 @@ jobs:
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -153,6 +184,7 @@ jobs:
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -161,6 +193,7 @@ jobs:
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -169,6 +202,7 @@ jobs:
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -177,6 +211,7 @@ jobs:
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -185,6 +220,7 @@ jobs:
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
core-image-build:
uses: ./.github/workflows/image_build.yml
@@ -198,7 +234,11 @@ jobs:
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -209,12 +249,16 @@ jobs:
include:
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-latest: 'auto'
tag-suffix: '-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
aio: "-aio-cpu"
latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -225,6 +269,7 @@ jobs:
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -235,6 +280,7 @@ jobs:
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest'
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -245,6 +291,7 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
@@ -255,3 +302,4 @@ jobs:
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=5 --output-sync=target"


@@ -29,6 +29,14 @@ on:
description: 'Tag latest'
default: ''
type: string
latest-image:
description: 'Tag latest'
default: ''
type: string
latest-image-aio:
description: 'Tag latest'
default: ''
type: string
tag-suffix:
description: 'Tag suffix'
default: ''
@@ -46,6 +54,16 @@ on:
required: true
default: ''
type: string
makeflags:
description: 'Make Flags'
required: false
default: '--jobs=3 --output-sync=target'
type: string
aio:
description: 'AIO Image Name'
required: false
default: ''
type: string
secrets:
dockerUsername:
required: true
@@ -69,6 +87,7 @@ jobs:
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
- name: Release space from worker
if: inputs.runs-on == 'ubuntu-latest'
run: |
@@ -110,6 +129,7 @@ jobs:
sudo rm -rf "/usr/local/share/boost" || true
sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
df -h
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
@@ -125,6 +145,34 @@ jobs:
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }}
- name: Docker meta AIO (quay.io)
if: inputs.aio != ''
id: meta_aio
uses: docker/metadata-action@v5
with:
images: |
quay.io/go-skynet/local-ai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.aio }}
- name: Docker meta AIO (dockerhub)
if: inputs.aio != ''
id: meta_aio_dockerhub
uses: docker/metadata-action@v5
with:
images: |
localai/localai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.aio }}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
@@ -149,6 +197,25 @@ jobs:
username: ${{ secrets.quayUsername }}
password: ${{ secrets.quayPassword }}
- name: Cache GRPC
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
MAKEFLAGS=${{ inputs.makeflags }}
GRPC_VERSION=v1.58.0
context: .
file: ./Dockerfile
cache-from: type=gha
cache-to: type=gha,ignore-error=true
target: grpc
platforms: ${{ inputs.platforms }}
push: false
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Build and push
uses: docker/build-push-action@v5
with:
@@ -160,12 +227,79 @@ jobs:
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }}
MAKEFLAGS=${{ inputs.makeflags }}
context: .
file: ./Dockerfile
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Inspect image
if: github.event_name != 'pull_request'
run: |
docker pull localai/localai:${{ steps.meta.outputs.version }}
docker image inspect localai/localai:${{ steps.meta.outputs.version }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
docker image inspect quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
- name: Build and push AIO image
if: inputs.aio != ''
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BASE_IMAGE=quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
MAKEFLAGS=${{ inputs.makeflags }}
context: .
file: ./Dockerfile.aio
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta_aio.outputs.tags }}
labels: ${{ steps.meta_aio.outputs.labels }}
- name: Build and push AIO image (dockerhub)
if: inputs.aio != ''
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BASE_IMAGE=localai/localai:${{ steps.meta.outputs.version }}
MAKEFLAGS=${{ inputs.makeflags }}
context: .
file: ./Dockerfile.aio
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
- name: Latest tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
run: |
docker pull localai/localai:${{ steps.meta.outputs.version }}
docker tag localai/localai:${{ steps.meta.outputs.version }} localai/localai:${{ inputs.latest-image }}
docker push localai/localai:${{ inputs.latest-image }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
docker tag quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
- name: Latest AIO tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image-aio != '' && github.ref_type == 'tag'
run: |
docker pull localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }}
docker tag localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }} localai/localai:${{ inputs.latest-image-aio }}
docker push localai/localai:${{ inputs.latest-image-aio }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }}
docker tag quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
- name: job summary
run: |
echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
- name: job summary(AIO)
if: inputs.aio != ''
run: |
echo "Built image: ${{ steps.meta_aio.outputs.labels }}" >> $GITHUB_STEP_SUMMARY

.github/workflows/labeler.yml (new file, 12 lines)

@@ -0,0 +1,12 @@
name: "Pull Request Labeler"
on:
- pull_request_target
jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/labeler@v5


@@ -2,6 +2,9 @@ name: Build and Release
on: push
env:
GRPC_VERSION: v1.58.0
permissions:
contents: write
@@ -32,7 +35,8 @@ jobs:
submodules: true
- uses: actions/setup-go@v4
with:
go-version: '>=1.21.0'
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
@@ -54,17 +58,17 @@ jobs:
uses: actions/cache@v3
with:
path: grpc
key: ${{ runner.os }}-grpc
key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make -j12
../.. && sudo make --jobs 5 --output-sync=target
- name: Install gRPC
run: |
cd grpc && cd cmake/build && sudo make -j12 install
cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
- name: Build
id: build
env:
@@ -98,11 +102,11 @@ jobs:
submodules: true
- uses: actions/setup-go@v4
with:
go-version: '>=1.21.0'
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get install -y --no-install-recommends libopencv-dev
sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
- name: Build stablediffusion
run: |
make backend-assets/grpc/stablediffusion
@@ -136,7 +140,8 @@ jobs:
submodules: true
- uses: actions/setup-go@v4
with:
go-version: '>=1.21.0'
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
brew install protobuf grpc

.github/workflows/secscan.yaml (new file, 27 lines)

@@ -0,0 +1,27 @@
name: "Security Scan"
# Run workflow each time code is pushed to your repository and on a schedule.
# The scheduled workflow runs every at 00:00 on Sunday UTC time.
on:
push:
schedule:
- cron: '0 0 * * 0'
jobs:
tests:
runs-on: ubuntu-latest
env:
GO111MODULE: on
steps:
- name: Checkout Source
uses: actions/checkout@v3
- name: Run Gosec Security Scanner
uses: securego/gosec@master
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'
- name: Upload SARIF file
uses: github/codeql-action/upload-sarif@v2
with:
# Path to SARIF file relative to the root of the repository
sarif_file: results.sarif


@@ -33,15 +33,15 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test transformers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/transformers
make -C backend/python/transformers test
make --jobs=5 --output-sync=target -C backend/python/transformers
make --jobs=5 --output-sync=target -C backend/python/transformers test
tests-sentencetransformers:
runs-on: ubuntu-latest
@@ -62,15 +62,15 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test sentencetransformers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/sentencetransformers
make -C backend/python/sentencetransformers test
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
tests-diffusers:
runs-on: ubuntu-latest
@@ -91,15 +91,15 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test diffusers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/diffusers
make -C backend/python/diffusers test
make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test
tests-transformers-musicgen:
@@ -121,46 +121,46 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test transformers-musicgen
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/transformers-musicgen
make -C backend/python/transformers-musicgen test
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
tests-petals:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# tests-petals:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
# sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
# gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
# sudo rm -rfv /usr/bin/conda || true
- name: Test petals
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/petals
make -C backend/python/petals test
# - name: Test petals
# run: |
# export PATH=$PATH:/opt/conda/bin
# make --jobs=5 --output-sync=target -C backend/python/petals
# make --jobs=5 --output-sync=target -C backend/python/petals test
@@ -223,15 +223,15 @@ jobs:
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# sudo apt-get install -y libopencv-dev
# sudo rm -rfv /usr/bin/conda || true
# - name: Test bark
# run: |
# export PATH=$PATH:/opt/conda/bin
# make -C backend/python/bark
# make -C backend/python/bark test
# make --jobs=5 --output-sync=target -C backend/python/bark
# make --jobs=5 --output-sync=target -C backend/python/bark test
# Below tests needs GPU. Commented out for now
@@ -255,13 +255,13 @@ jobs:
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# sudo apt-get install -y libopencv-dev
# sudo rm -rfv /usr/bin/conda || true
# - name: Test vllm
# run: |
# export PATH=$PATH:/opt/conda/bin
# make -C backend/python/vllm
# make -C backend/python/vllm test
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
tests-vallex:
runs-on: ubuntu-latest
steps:
@@ -281,13 +281,13 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
- name: Test vall-e-x
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/vall-e-x
make -C backend/python/vall-e-x test
make --jobs=5 --output-sync=target -C backend/python/vall-e-x
make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
tests-coqui:
runs-on: ubuntu-latest
@@ -313,5 +313,5 @@ jobs:
- name: Test coqui
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/coqui
make -C backend/python/coqui test
make --jobs=5 --output-sync=target -C backend/python/coqui
make --jobs=5 --output-sync=target -C backend/python/coqui test


@@ -9,6 +9,9 @@ on:
tags:
- '*'
env:
GRPC_VERSION: v1.58.0
concurrency:
group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
cancel-in-progress: true
@@ -60,6 +63,7 @@ jobs:
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go-version }}
cache: false
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
@@ -75,7 +79,7 @@ jobs:
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo apt-get install -y libopencv-dev
sudo rm -rfv /usr/bin/conda || true
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
@@ -91,23 +95,79 @@ jobs:
uses: actions/cache@v3
with:
path: grpc
key: ${{ runner.os }}-grpc
key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --jobs 5 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make -j12
../.. && sudo make --jobs 5
- name: Install gRPC
run: |
cd grpc && cd cmake/build && sudo make -j12 install
cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test
run: |
GO_TAGS="stablediffusion tts" make test
GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3
timeout-minutes: 5
tests-aio-container:
runs-on: ubuntu-latest
steps:
- name: Release space from worker
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
df -h
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Build images
run: |
docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=core --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
- name: Test
run: |
LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
make run-e2e-aio
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3
timeout-minutes: 5
tests-apple:
runs-on: macOS-latest
runs-on: macOS-14
strategy:
matrix:
go-version: ['1.21.x']
@@ -120,14 +180,21 @@ jobs:
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go-version }}
cache: false
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
- name: Dependencies
run: |
brew install protobuf grpc
brew install protobuf grpc make
- name: Test
run: |
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
# Used to run the newer GNUMake version from brew that supports --output-sync
export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3
timeout-minutes: 5

.vscode/extensions.json (new file, 5 lines)

@@ -0,0 +1,5 @@
{
"recommendations": [
"golang.go"
]
}


@@ -63,7 +63,9 @@ WORKDIR /build
RUN test -n "$TARGETARCH" \
|| (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
# Extras requirements
###################################
###################################
FROM requirements-core as requirements-extras
RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
@@ -88,13 +90,40 @@ RUN if [ ! -e /usr/bin/python ]; then \
###################################
###################################
FROM ${BASE_IMAGE} as grpc
ARG MAKEFLAGS
ARG GRPC_VERSION=v1.58.0
ENV MAKEFLAGS=${MAKEFLAGS}
WORKDIR /build
RUN apt-get update && \
apt-get install -y g++ cmake git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc
RUN cd grpc && \
mkdir -p cmake/build && \
cd cmake/build && \
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../.. && \
make
###################################
###################################
FROM requirements-${IMAGE_TYPE} as builder
ARG GO_TAGS="stablediffusion tts"
ARG GRPC_BACKENDS
ARG BUILD_GRPC=true
ARG MAKEFLAGS
ENV GRPC_BACKENDS=${GRPC_BACKENDS}
ENV GO_TAGS=${GO_TAGS}
ENV MAKEFLAGS=${MAKEFLAGS}
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
ENV NVIDIA_VISIBLE_DEVICES=all
@@ -103,6 +132,7 @@ WORKDIR /build
COPY . .
COPY .git .
RUN echo "GO_TAGS: $GO_TAGS"
RUN make prepare
# If we are building with clblas support, we need the libraries for the builds
@@ -115,12 +145,9 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
# stablediffusion does not tolerate a newer version of abseil, build it first
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
RUN if [ "${BUILD_GRPC}" = "true" ]; then \
git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && make -j12 install \
; fi
COPY --from=grpc /build/grpc ./grpc/
RUN cd /build/grpc/cmake/build && make install
# Rebuild with defaults backends
RUN make build
@@ -139,10 +166,12 @@ ARG FFMPEG
ARG BUILD_TYPE
ARG TARGETARCH
ARG IMAGE_TYPE=extras
ARG MAKEFLAGS
ENV BUILD_TYPE=${BUILD_TYPE}
ENV REBUILD=false
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
ENV MAKEFLAGS=${MAKEFLAGS}
ARG CUDA_MAJOR_VERSION=11
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
@@ -171,7 +200,7 @@ WORKDIR /build
COPY . .
COPY --from=builder /build/sources ./sources/
COPY --from=builder /build/grpc ./grpc/
COPY --from=grpc /build/grpc ./grpc/
RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
@@ -186,43 +215,43 @@ COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/
## Duplicated from Makefile to avoid having a big layer that's hard to push
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/autogptq \
make -C backend/python/autogptq \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/bark \
make -C backend/python/bark \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/diffusers \
make -C backend/python/diffusers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/vllm \
make -C backend/python/vllm \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/mamba \
make -C backend/python/mamba \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/sentencetransformers \
make -C backend/python/sentencetransformers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/transformers \
make -C backend/python/transformers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/vall-e-x \
make -C backend/python/vall-e-x \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/exllama \
make -C backend/python/exllama \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/exllama2 \
make -C backend/python/exllama2 \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/petals \
make -C backend/python/petals \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/transformers-musicgen \
make -C backend/python/transformers-musicgen \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
make -C backend/python/coqui \
make -C backend/python/coqui \
; fi
# Make sure the models directory exists
@@ -231,6 +260,7 @@ RUN mkdir -p /build/models
# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
VOLUME /build/models
EXPOSE 8080
ENTRYPOINT [ "/build/entrypoint.sh" ]

Dockerfile.aio Normal file

@@ -0,0 +1,8 @@
ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE}
RUN apt-get update && apt-get install -y pciutils && apt-get clean
COPY aio/ /aio
ENTRYPOINT [ "/aio/entrypoint.sh" ]

Makefile

@@ -4,11 +4,8 @@ GOVET=$(GOCMD) vet
BINARY_NAME=local-ai
# llama.cpp versions
GOLLAMA_VERSION?=6a8041ef6b46d4712afc3ae791d1c2d73da0ad1c
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=4755afd1cbd40d93c017e5b98c39796f52345314
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=1b67731e184e27a465b8c5476061294a4af668ea
# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -19,7 +16,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
# whisper.cpp version
WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
WHISPER_CPP_VERSION?=8f253ef3af1c62c04316ba4afa7145fc4d701a8c
# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
@@ -31,13 +28,14 @@ PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
# tinydream version
TINYDREAM_VERSION?=772a9c0d9aaf768290e63cca3c904fe69faf677a
TINYDREAM_VERSION?=22a12a4bc0ac5455856f28f3b771331a551a4293
export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=
CGO_LDFLAGS?=
CGO_LDFLAGS_WHISPER?=
CUDA_LIBPATH?=/usr/local/cuda/lib64/
GO_TAGS?=
BUILD_ID?=git
@@ -72,7 +70,7 @@ UNAME_S := $(shell uname -s)
endif
ifeq ($(OS),Darwin)
CGO_LDFLAGS += -lcblas -framework Accelerate
ifeq ($(OSX_SIGNING_IDENTITY),)
OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
endif
@@ -83,6 +81,12 @@ ifeq ($(OS),Darwin)
# disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
export LLAMA_NO_ACCELERATE=1
endif
ifeq ($(BUILD_TYPE),metal)
# -lcblas removed: it seems to always be listed as a duplicate flag.
CGO_LDFLAGS += -framework Accelerate
endif
endif
@@ -91,10 +95,12 @@ ifeq ($(BUILD_TYPE),openblas)
export WHISPER_OPENBLAS=1
endif
ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1
export WHISPER_CUBLAS=1
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
endif
ifeq ($(BUILD_TYPE),hipblas)
@@ -148,12 +154,12 @@ endif
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
@@ -168,40 +174,41 @@ ifeq ($(BUILD_API_ONLY),true)
GRPC_BACKENDS=
endif
.PHONY: all test build vendor
.PHONY: all test build vendor get-sources prepare-sources prepare
all: help
## GPT4ALL
sources/gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
## go-piper
sources/go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
## BERT embeddings
sources/go-bert:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
## stable diffusion
sources/go-stable-diffusion:
git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
sources/go-bert/libgobert.a: sources/go-bert
$(MAKE) -C sources/go-bert libgobert.a
sources/go-stable-diffusion/libstablediffusion.a:
$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
## go-llama-ggml
sources/go-llama-ggml:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
## tiny-dream
sources/go-tiny-dream:
git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
sources/go-tiny-dream/libtinydream.a:
$(MAKE) -C sources/go-tiny-dream libtinydream.a
## go-piper
sources/go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
## GPT4ALL
sources/gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
## RWKV
sources/go-rwkv:
@@ -211,23 +218,23 @@ sources/go-rwkv:
sources/go-rwkv/librwkv.a: sources/go-rwkv
cd sources/go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
sources/go-bert/libgobert.a: sources/go-bert
$(MAKE) -C sources/go-bert libgobert.a
## stable diffusion
sources/go-stable-diffusion:
git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
backend-assets/gpt4all: sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
mkdir -p backend-assets/gpt4all
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
backend-assets/espeak-ng-data: sources/go-piper
mkdir -p backend-assets/espeak-ng-data
$(MAKE) -C sources/go-piper piper.o
@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
## tiny-dream
sources/go-tiny-dream:
git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
$(MAKE) -C sources/go-tiny-dream libtinydream.a
## whisper
sources/whisper.cpp:
git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
@@ -235,47 +242,35 @@ sources/whisper.cpp:
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && make libwhisper.a
sources/go-llama:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama
cd sources/go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama-ggml:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama/libbinding.a: sources/go-llama
$(MAKE) -C sources/go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main
backend/cpp/llama/llama.cpp:
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
touch $@
get-sources: sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
replace:
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
$(GOCMD) mod edit -dropreplace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace
$(GOCMD) mod download
touch $@
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama clean
$(MAKE) -C sources/go-llama-ggml clean
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C sources/go-rwkv clean
@@ -287,7 +282,6 @@ rebuild: ## Rebuilds the project
$(MAKE) build
prepare: prepare-sources $(OPTIONAL_TARGETS)
touch $@
clean: ## Remove build related file
$(GOCMD) clean -cache
@@ -298,16 +292,27 @@ clean: ## Remove build related file
rm -rf backend-assets
$(MAKE) -C backend/cpp/grpc clean
$(MAKE) -C backend/cpp/llama clean
$(MAKE) dropreplace
clean-tests:
rm -rf test-models
rm -rf test-dir
rm -rf core/http/backend-assets
## Build:
build: backend-assets grpcs prepare ## Build the project
build: prepare backend-assets grpcs ## Build the project
$(info ${GREEN}I local-ai build info:${RESET})
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS=backend-assets/grpc/llama-cpp GO_TAGS=none $(MAKE) build
build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
dist: build
mkdir -p release
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
@@ -319,10 +324,10 @@ osx-signed: build
run: prepare ## run local-ai
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
test-models/testmodel:
test-models/testmodel.ggml:
mkdir test-models
mkdir test-dir
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -334,9 +339,9 @@ prepare-test: grpcs
cp -rf backend-assets core/http
cp tests/models_fixtures/* test-models
test: prepare test-models/testmodel grpcs
test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests'
export GO_TAGS="tts stablediffusion"
export GO_TAGS="tts stablediffusion debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
@@ -350,12 +355,16 @@ prepare-e2e:
mkdir -p $(TEST_DIR)
cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
run-e2e-image:
ls -liah $(abspath ./tests/e2e-fixtures)
docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
run-e2e-aio:
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
test-e2e:
@echo 'Running e2e tests'
BUILD_TYPE=$(BUILD_TYPE) \
@@ -386,6 +395,11 @@ test-stablediffusion: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
test-stores: backend-assets/grpc/local-store
mkdir -p tests/integration/backend-assets/grpc
cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
test-container:
docker build --target requirements -t local-ai-test-container .
docker run -ti --rm --entrypoint /bin/bash -ti -v $(abspath ./):/build local-ai-test-container
@@ -454,39 +468,55 @@ ifeq ($(BUILD_API_ONLY),true)
touch backend-assets/keep
endif
backend-assets/grpc:
backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_binding.a
mkdir -p backend-assets/espeak-ng-data
@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
mkdir -p backend-assets/gpt4all
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
backend-assets/grpc: replace
mkdir -p backend-assets/grpc
backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
# TODO: every binary should have its own folder instead, so can have different implementations
backend-assets/grpc/bert-embeddings: sources/go-bert sources/go-bert/libgobert.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
backend-assets/grpc/langchain-huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
backend/cpp/llama/llama.cpp:
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
## BACKEND CPP LLAMA START
# Sets the variables in case it has to build the gRPC locally.
INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
backend/cpp/llama/grpc-server:
# Conditionally build grpc for the llama backend to use if needed
ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
$(MAKE) -C backend/cpp/grpc build
export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
export PATH="${INSTALLED_PACKAGES}/bin:${PATH}" && \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
LLAMA_VERSION=$(CPPLLAMA_VERSION) \
$(MAKE) -C backend/cpp/llama grpc-server
else
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
endif
## BACKEND CPP LLAMA END
##
backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
# TODO: every binary should have its own folder instead, so can have different metal implementations
@@ -494,49 +524,38 @@ ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
endif
backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
backend-assets/grpc/llama-ggml: sources/go-llama-ggml sources/go-llama-ggml/libbinding.a backend-assets/grpc
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
backend-assets/grpc/langchain-huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
backend-assets/grpc/stablediffusion: backend-assets/grpc
if [ ! -f backend-assets/grpc/stablediffusion ]; then \
$(MAKE) sources/go-stable-diffusion; \
$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-stable-diffusion/ LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion; \
fi
backend-assets/grpc/tinydream: backend-assets/grpc sources/go-tiny-dream/libtinydream.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
backend-assets/grpc/rwkv: sources/go-rwkv sources/go-rwkv/librwkv.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
backend-assets/grpc/local-store: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/local-store ./backend/go/stores/
grpcs: prepare $(GRPC_BACKENDS)
DOCKER_IMAGE?=local-ai
DOCKER_AIO_IMAGE?=local-ai-aio
IMAGE_TYPE?=core
BASE_IMAGE?=ubuntu:22.04
@@ -544,15 +563,28 @@ docker:
docker build \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS=$(GO_TAGS) \
--build-arg GO_TAGS="$(GO_TAGS)" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--build-arg BUILD_TYPE=$(BUILD_TYPE) \
-t $(DOCKER_IMAGE) .
docker-aio:
@echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)"
docker build \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
-t $(DOCKER_AIO_IMAGE) -f Dockerfile.aio .
docker-aio-all:
$(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
$(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
docker-image-intel:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
docker-image-intel-xpu:
@@ -560,4 +592,9 @@ docker-image-intel-xpu:
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
.PHONY: swagger
swagger:
swag init -g core/http/api.go --output swagger


@@ -20,14 +20,14 @@
</a>
</p>
[<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker">](https://hub.docker.com/r/localai/localai)
[<img src="https://img.shields.io/badge/quay.io-images-important.svg?">](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
<p align="center">
<a href="https://hub.docker.com/r/localai/localai" target="blank">
<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
</a>
<a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
<img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
</a>
</p>
<p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank">
@@ -36,20 +36,27 @@
<a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
</a>
</p>
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs, and to generate images and audio (and more), locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU.
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic...) API specifications for local AI inferencing. It allows you to run LLMs, and to generate images and audio (and more), locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU.
## 🔥🔥 Hot topics / Roadmap
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726
- Landing page: https://github.com/mudler/LocalAI/pull/1922
- Openvino support: https://github.com/mudler/LocalAI/pull/1892
- Vector store: https://github.com/mudler/LocalAI/pull/1795
- All-in-one container image: https://github.com/mudler/LocalAI/issues/1855
- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726 / Tools API support: https://github.com/mudler/LocalAI/pull/1715
- Upload file API: https://github.com/mudler/LocalAI/pull/1703
- Tools API support: https://github.com/mudler/LocalAI/pull/1715
- LLaVa 1.6: https://github.com/mudler/LocalAI/pull/1714
- ROCm container images: https://github.com/mudler/LocalAI/pull/1595
- Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
- Deprecation of old backends: https://github.com/mudler/LocalAI/issues/1651
- ROCm container images: https://github.com/mudler/LocalAI/pull/1595 / Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
- Mamba support: https://github.com/mudler/LocalAI/pull/1589
- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
@@ -66,10 +73,14 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)
For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide. For those in a hurry, here's a straightforward one-liner to launch a LocalAI instance with [phi-2](https://huggingface.co/microsoft/phi-2) using `docker`:
For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide.
```
docker run -ti -p 8080:8080 localai/localai:v2.9.0-ffmpeg-core phi-2
For those in a hurry, here's a straightforward one-liner to launch a LocalAI AIO (All-in-One) image using `docker`:
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# or, if you have an Nvidia GPU:
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
```
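Once the container reports ready (the images expose a `/readyz` health endpoint), a quick way to sanity-check the API is the OpenAI-compatible chat endpoint; the AIO images preload a text model aliased as `gpt-4` (see the `aio/*/text-to-text.yaml` files later in this diff). A minimal check might look like:
```bash
# Wait until the health check passes
curl -sf http://localhost:8080/readyz && echo "LocalAI is ready"

# Chat completion against the preloaded model (aliased as gpt-4 in the AIO configs)
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "gpt-4",
  "messages": [{"role": "user", "content": "How are you doing?"}]
}'
```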
## 🚀 [Features](https://localai.io/features/)

aio/cpu/README.md Normal file

@@ -0,0 +1,5 @@
## AIO CPU size
This image is intended for CPU-only use.
Please keep it to C++ backends only, so the base image stays as small as possible (no CUDA, cuDNN, Python, etc.).
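As a rough usage sketch (the tag matches the one referenced in the top-level README; adjust as needed):
```bash
# CPU-only AIO image: no --gpus flag and no CUDA/cuDNN layers required
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
```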

aio/cpu/embeddings.yaml Normal file

@@ -0,0 +1,12 @@
name: text-embedding-ada-002
backend: bert-embeddings
parameters:
model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'

aio/cpu/image-gen.yaml Normal file

@@ -0,0 +1,62 @@
name: stablediffusion
backend: stablediffusion
parameters:
model: stablediffusion_assets
license: "BSD-3"
urls:
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
download_files:
- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
- filename: "stablediffusion_assets/log_sigmas.bin"
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
- filename: "stablediffusion_assets/vocab.txt"
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"step": 25,
"size": "512x512"
}'


@@ -0,0 +1,18 @@
name: whisper-1
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"


@@ -0,0 +1,15 @@
name: tts-1
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
parameters:
model: en-us-amy-low.onnx
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"voice-en-us-amy-low",
"input": "Hi, this is a test."
}'

aio/cpu/text-to-text.yaml Normal file

@@ -0,0 +1,53 @@
name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q2_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

aio/cpu/vision.yaml Normal file

@@ -0,0 +1,31 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
name: gpt-4-vision-preview
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

aio/entrypoint.sh Executable file

@@ -0,0 +1,138 @@
#!/bin/bash
echo "===> LocalAI All-in-One (AIO) container starting..."
GPU_ACCELERATION=false
GPU_VENDOR=""
function check_intel() {
if lspci | grep -E 'VGA|3D' | grep -iq intel; then
echo "Intel GPU detected"
if [ -d /opt/intel ]; then
GPU_ACCELERATION=true
GPU_VENDOR=intel
else
echo "Intel GPU detected, but Intel GPU drivers are not installed. GPU acceleration will not be available."
fi
fi
}
function check_nvidia_wsl() {
if lspci | grep -E 'VGA|3D' | grep -iq "Microsoft Corporation Device 008e"; then
# We assume this WSL2 card is NVIDIA, then check for nvidia-smi
# Make sure the container was run with `--gpus all` as the only required parameter
echo "NVIDIA GPU detected via WSL2"
# nvidia-smi should be installed in the container
if nvidia-smi; then
GPU_ACCELERATION=true
GPU_VENDOR=nvidia
else
echo "NVIDIA GPU detected via WSL2, but nvidia-smi is not installed. GPU acceleration will not be available."
fi
fi
}
function check_amd() {
if lspci | grep -E 'VGA|3D' | grep -iq amd; then
echo "AMD GPU detected"
# Check if ROCm is installed
if [ -d /opt/rocm ]; then
GPU_ACCELERATION=true
GPU_VENDOR=amd
else
echo "AMD GPU detected, but ROCm is not installed. GPU acceleration will not be available."
fi
fi
}
function check_nvidia() {
if lspci | grep -E 'VGA|3D' | grep -iq nvidia; then
echo "NVIDIA GPU detected"
# nvidia-smi should be installed in the container
if nvidia-smi; then
GPU_ACCELERATION=true
GPU_VENDOR=nvidia
else
echo "NVIDIA GPU detected, but nvidia-smi is not installed. GPU acceleration will not be available."
fi
fi
}
function check_metal() {
if system_profiler SPDisplaysDataType | grep -iq 'Metal'; then
echo "Apple Metal supported GPU detected"
GPU_ACCELERATION=true
GPU_VENDOR=apple
fi
}
function detect_gpu() {
case "$(uname -s)" in
Linux)
check_nvidia
check_amd
check_intel
check_nvidia_wsl
;;
Darwin)
check_metal
;;
esac
}
function detect_gpu_size() {
# Attempting to find GPU memory size for NVIDIA GPUs
if [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "nvidia" ]; then
echo "NVIDIA GPU detected. Attempting to find memory size..."
# Using head -n 1 to get the total memory of the 1st NVIDIA GPU detected.
# If handling multiple GPUs is required in the future, this is the place to do it
nvidia_sm=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1)
if [ ! -z "$nvidia_sm" ]; then
echo "Total GPU Memory: $nvidia_sm MiB"
# if bigger than 8GB, use 16GB
#if [ "$nvidia_sm" -gt 8192 ]; then
# GPU_SIZE=gpu-16g
#else
GPU_SIZE=gpu-8g
#fi
else
echo "Unable to determine NVIDIA GPU memory size. Falling back to CPU."
GPU_SIZE=gpu-8g
fi
elif [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "intel" ]; then
GPU_SIZE=intel
# Default to a generic GPU size until we implement GPU size detection for non NVIDIA GPUs
elif [ "$GPU_ACCELERATION" = true ]; then
echo "Non-NVIDIA GPU detected. Specific GPU memory size detection is not implemented."
GPU_SIZE=gpu-8g
# default to cpu if GPU_SIZE is not set
else
echo "GPU acceleration is not enabled or supported. Defaulting to CPU."
GPU_SIZE=cpu
fi
}
function check_vars() {
if [ -z "$MODELS" ]; then
echo "MODELS environment variable is not set. Please set it to a comma-separated list of model YAML files to load."
exit 1
fi
if [ -z "$PROFILE" ]; then
echo "PROFILE environment variable is not set. Please set it to one of the following: cpu, gpu-8g, gpu-16g, apple"
exit 1
fi
}
detect_gpu
detect_gpu_size
PROFILE="${PROFILE:-$GPU_SIZE}" # default to the size detected above (cpu when no usable GPU is found)
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
check_vars
echo "===> Starting LocalAI[$PROFILE] with the following models: $MODELS"
exec /build/entrypoint.sh "$@"
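Because the script only fills in `PROFILE` and `MODELS` when they are unset, both can be overridden from `docker run`; a sketch of pinning the profile and loading a single model config (image tag taken from the README above):
```bash
# Skip auto-detection: force the gpu-8g profile and load only the chat model config
docker run -ti --name local-ai -p 8080:8080 --gpus all \
  -e PROFILE=gpu-8g \
  -e MODELS=/aio/gpu-8g/text-to-text.yaml \
  localai/localai:latest-aio-gpu-nvidia-cuda-12
```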


@@ -0,0 +1,12 @@
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'

aio/gpu-8g/image-gen.yaml Normal file

@@ -0,0 +1,25 @@
name: stablediffusion
parameters:
model: DreamShaper_8_pruned.safetensors
backend: diffusers
step: 25
f16: true
diffusers:
pipeline_type: StableDiffusionPipeline
cuda: true
enable_parameters: "negative_prompt,num_inference_steps"
scheduler_type: "k_dpmpp_2m"
download_files:
- filename: DreamShaper_8_pruned.safetensors
uri: huggingface://Lykon/DreamShaper/DreamShaper_8_pruned.safetensors
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"step": 25,
"size": "512x512"
}'


@@ -0,0 +1,18 @@
name: whisper-1
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"


@@ -0,0 +1,15 @@
name: tts-1
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
parameters:
model: en-us-amy-low.onnx
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"tts-1",
"input": "Hi, this is a test."
}'


@@ -0,0 +1,53 @@
name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

aio/gpu-8g/vision.yaml Normal file

@@ -0,0 +1,35 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
name: gpt-4-vision-preview
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

aio/intel/embeddings.yaml Normal file

@@ -0,0 +1,12 @@
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'

aio/intel/image-gen.yaml Normal file

@@ -0,0 +1,20 @@
name: stablediffusion
parameters:
model: runwayml/stable-diffusion-v1-5
backend: diffusers
step: 25
f16: true
diffusers:
pipeline_type: StableDiffusionPipeline
cuda: true
enable_parameters: "negative_prompt,num_inference_steps"
scheduler_type: "k_dpmpp_2m"
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"step": 25,
"size": "512x512"
}'


@@ -0,0 +1,18 @@
name: whisper-1
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"


@@ -0,0 +1,15 @@
name: tts-1
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
parameters:
model: en-us-amy-low.onnx
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"tts-1",
"input": "Hi, this is a test."
}'


@@ -0,0 +1,53 @@
name: gpt-4
mmap: false
f16: false
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
stopwords:
- <|im_end|>
- "\n</tool_call>"
- <dummy32000>
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'
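The same endpoint can also be driven from Go with the OpenAI-compatible client used by this repository's API tests; a minimal sketch, assuming the github.com/sashabaranov/go-openai package and the default LocalAI address:

package main

import (
	"context"
	"fmt"

	openai "github.com/sashabaranov/go-openai"
)

func main() {
	// Point the OpenAI client at the local instance; the token is ignored
	// unless API keys have been configured for the server.
	cfg := openai.DefaultConfig("sk-local")
	cfg.BaseURL = "http://localhost:8080/v1"
	client := openai.NewClientWithConfig(cfg)

	resp, err := client.CreateChatCompletion(context.Background(), openai.ChatCompletionRequest{
		Model: "gpt-4", // the name declared in the YAML above
		Messages: []openai.ChatCompletionMessage{
			{Role: "user", Content: "How are you doing?"},
		},
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(resp.Choices[0].Message.Content)
}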

aio/intel/vision.yaml (new file, 35 lines)

@@ -0,0 +1,35 @@
backend: llama-cpp
context_size: 4096
mmap: false
f16: false
name: gpt-4-vision-preview
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'


@@ -18,6 +18,48 @@ service Backend {
rpc TTS(TTSRequest) returns (Result) {}
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
rpc Status(HealthMessage) returns (StatusResponse) {}
rpc StoresSet(StoresSetOptions) returns (Result) {}
rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
}
message StoresKey {
repeated float Floats = 1;
}
message StoresValue {
bytes Bytes = 1;
}
message StoresSetOptions {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
}
message StoresDeleteOptions {
repeated StoresKey Keys = 1;
}
message StoresGetOptions {
repeated StoresKey Keys = 1;
}
message StoresGetResult {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
}
message StoresFindOptions {
StoresKey Key = 1;
int32 TopK = 2;
}
message StoresFindResult {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
repeated float Similarities = 3;
}
message HealthMessage {}
@@ -121,7 +163,7 @@ message ModelOptions {
bool NoMulMatQ = 37;
string DraftModel = 39;
string AudioPath = 38;
// vllm
@@ -213,4 +255,4 @@ message StatusResponse {
}
State state = 1;
MemoryUsageData memory = 2;
}
}
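For reference, the new store RPCs are driven like any other generated gRPC client once stubs are produced from this proto. The snippet below is a minimal sketch, assuming protoc-gen-go-grpc output in the pb package referenced elsewhere in this changeset and a store server already listening on localhost:50051; within LocalAI these servers are started internally and exposed through the /stores HTTP endpoints added later in this diff.

package main

import (
	"context"
	"fmt"

	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto" // generated stubs, as imported by the store backend
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	conn, err := grpc.Dial("localhost:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		panic(err)
	}
	defer conn.Close()
	client := pb.NewBackendClient(conn)

	// Store one key/value pair.
	if _, err := client.StoresSet(context.Background(), &pb.StoresSetOptions{
		Keys:   []*pb.StoresKey{{Floats: []float32{0.1, 0.2, 0.3}}},
		Values: []*pb.StoresValue{{Bytes: []byte("test1")}},
	}); err != nil {
		panic(err)
	}

	// Look up the most similar stored keys for a query vector.
	res, err := client.StoresFind(context.Background(), &pb.StoresFindOptions{
		Key:  &pb.StoresKey{Floats: []float32{0.1, 0.2, 0.3}},
		TopK: 1,
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(res.Keys, res.Similarities)
}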


@@ -48,7 +48,7 @@ $(INSTALLED_PACKAGES): grpc_build
$(GRPC_REPO):
git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
cd $(GRPC_REPO)/grpc && git submodule update --init --recursive --depth $(GIT_CLONE_DEPTH)
cd $(GRPC_REPO)/grpc && git submodule update --jobs 2 --init --recursive --depth $(GIT_CLONE_DEPTH)
$(GRPC_BUILD): $(GRPC_REPO)
mkdir -p $(GRPC_BUILD)


@@ -19,6 +19,11 @@ else ifeq ($(BUILD_TYPE),clblas)
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here
else ifeq ($(OS),darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
endif
endif
ifeq ($(BUILD_TYPE),sycl_f16)
@@ -36,7 +41,7 @@ llama.cpp:
fi
cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
llama.cpp/examples/grpc-server:
llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/examples/grpc-server
cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/


@@ -1084,7 +1084,7 @@ struct llama_server_context
slot.has_next_token = false;
}
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
if (result.tok == llama_token_eos(model))
{
slot.stopped_eos = true;
slot.has_next_token = false;


@@ -0,0 +1,14 @@
//go:build debug
// +build debug
package main
import (
"github.com/rs/zerolog/log"
)
func assert(cond bool, msg string) {
if !cond {
log.Fatal().Stack().Msg(msg)
}
}

backend/go/stores/main.go (new file, 26 lines)

@@ -0,0 +1,26 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each store
import (
"flag"
"os"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
flag.Parse()
if err := grpc.StartServer(*addr, NewStore()); err != nil {
panic(err)
}
}


@@ -0,0 +1,7 @@
//go:build !debug
// +build !debug
package main
func assert(cond bool, msg string) {
}

backend/go/stores/store.go (new file, 507 lines)

@@ -0,0 +1,507 @@
package main
// This is a wrapper to satisfy the gRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"container/heap"
"fmt"
"math"
"slices"
"github.com/go-skynet/LocalAI/pkg/grpc/base"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/rs/zerolog/log"
)
type Store struct {
base.SingleThread
// The sorted keys
keys [][]float32
// The sorted values
values [][]byte
// If for every K it holds that ||k||^2 = 1, then we can use the normalized distance functions
// TODO: Should we instead normalize incoming keys that are not?
keysAreNormalized bool
// The first key decides the length of the keys
keyLen int
}
// TODO: Only used for sorting using Go's builtin implementation. The interfaces are columnar because
// that's theoretically best for memory layout and cache locality, but this isn't optimized yet.
type Pair struct {
Key []float32
Value []byte
}
func NewStore() *Store {
return &Store{
keys: make([][]float32, 0),
values: make([][]byte, 0),
keysAreNormalized: true,
keyLen: -1,
}
}
func compareSlices(k1, k2 []float32) int {
assert(len(k1) == len(k2), fmt.Sprintf("compareSlices: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
return slices.Compare(k1, k2)
}
func hasKey(unsortedSlice [][]float32, target []float32) bool {
return slices.ContainsFunc(unsortedSlice, func(k []float32) bool {
return compareSlices(k, target) == 0
})
}
func findInSortedSlice(sortedSlice [][]float32, target []float32) (int, bool) {
return slices.BinarySearchFunc(sortedSlice, target, func(k, t []float32) int {
return compareSlices(k, t)
})
}
func isSortedPairs(kvs []Pair) bool {
for i := 1; i < len(kvs); i++ {
if compareSlices(kvs[i-1].Key, kvs[i].Key) > 0 {
return false
}
}
return true
}
func isSortedKeys(keys [][]float32) bool {
for i := 1; i < len(keys); i++ {
if compareSlices(keys[i-1], keys[i]) > 0 {
return false
}
}
return true
}
func sortIntoKeySlicese(keys []*pb.StoresKey) [][]float32 {
ks := make([][]float32, len(keys))
for i, k := range keys {
ks[i] = k.Floats
}
slices.SortFunc(ks, compareSlices)
assert(len(ks) == len(keys), fmt.Sprintf("len(ks) = %d, len(keys) = %d", len(ks), len(keys)))
assert(isSortedKeys(ks), "keys are not sorted")
return ks
}
func (s *Store) Load(opts *pb.ModelOptions) error {
return nil
}
// Sort the incoming kvs and merge them with the existing sorted kvs
func (s *Store) StoresSet(opts *pb.StoresSetOptions) error {
if len(opts.Keys) == 0 {
return fmt.Errorf("no keys to add")
}
if len(opts.Keys) != len(opts.Values) {
return fmt.Errorf("len(keys) = %d, len(values) = %d", len(opts.Keys), len(opts.Values))
}
if s.keyLen == -1 {
s.keyLen = len(opts.Keys[0].Floats)
} else {
if len(opts.Keys[0].Floats) != s.keyLen {
return fmt.Errorf("Try to add key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
}
}
kvs := make([]Pair, len(opts.Keys))
for i, k := range opts.Keys {
if s.keysAreNormalized && !isNormalized(k.Floats) {
s.keysAreNormalized = false
var sample []float32
if len(s.keys) > 5 {
sample = k.Floats[:5]
} else {
sample = k.Floats
}
log.Debug().Msgf("Key is not normalized: %v", sample)
}
kvs[i] = Pair{
Key: k.Floats,
Value: opts.Values[i].Bytes,
}
}
slices.SortFunc(kvs, func(a, b Pair) int {
return compareSlices(a.Key, b.Key)
})
assert(len(kvs) == len(opts.Keys), fmt.Sprintf("len(kvs) = %d, len(opts.Keys) = %d", len(kvs), len(opts.Keys)))
assert(isSortedPairs(kvs), "keys are not sorted")
l := len(kvs) + len(s.keys)
merge_ks := make([][]float32, 0, l)
merge_vs := make([][]byte, 0, l)
i, j := 0, 0
for {
if i+j >= l {
break
}
if i >= len(kvs) {
merge_ks = append(merge_ks, s.keys[j])
merge_vs = append(merge_vs, s.values[j])
j++
continue
}
if j >= len(s.keys) {
merge_ks = append(merge_ks, kvs[i].Key)
merge_vs = append(merge_vs, kvs[i].Value)
i++
continue
}
c := compareSlices(kvs[i].Key, s.keys[j])
if c < 0 {
merge_ks = append(merge_ks, kvs[i].Key)
merge_vs = append(merge_vs, kvs[i].Value)
i++
} else if c > 0 {
merge_ks = append(merge_ks, s.keys[j])
merge_vs = append(merge_vs, s.values[j])
j++
} else {
merge_ks = append(merge_ks, kvs[i].Key)
merge_vs = append(merge_vs, kvs[i].Value)
i++
j++
}
}
assert(len(merge_ks) == l, fmt.Sprintf("len(merge_ks) = %d, l = %d", len(merge_ks), l))
assert(isSortedKeys(merge_ks), "merge keys are not sorted")
s.keys = merge_ks
s.values = merge_vs
return nil
}
func (s *Store) StoresDelete(opts *pb.StoresDeleteOptions) error {
if len(opts.Keys) == 0 {
return fmt.Errorf("no keys to delete")
}
if s.keyLen == -1 {
s.keyLen = len(opts.Keys[0].Floats)
} else {
if len(opts.Keys[0].Floats) != s.keyLen {
return fmt.Errorf("Trying to delete key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
}
}
ks := sortIntoKeySlicese(opts.Keys)
l := len(s.keys) - len(ks)
merge_ks := make([][]float32, 0, l)
merge_vs := make([][]byte, 0, l)
tail_ks := s.keys
tail_vs := s.values
for _, k := range ks {
j, found := findInSortedSlice(tail_ks, k)
if found {
merge_ks = append(merge_ks, tail_ks[:j]...)
merge_vs = append(merge_vs, tail_vs[:j]...)
tail_ks = tail_ks[j+1:]
tail_vs = tail_vs[j+1:]
} else {
assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: t=%d, %v", len(tail_ks), k))
}
log.Debug().Msgf("Delete: found = %v, t = %d, j = %d, len(merge_ks) = %d, len(merge_vs) = %d", found, len(tail_ks), j, len(merge_ks), len(merge_vs))
}
merge_ks = append(merge_ks, tail_ks...)
merge_vs = append(merge_vs, tail_vs...)
assert(len(merge_ks) <= len(s.keys), fmt.Sprintf("len(merge_ks) = %d, len(s.keys) = %d", len(merge_ks), len(s.keys)))
s.keys = merge_ks
s.values = merge_vs
assert(len(s.keys) >= l, fmt.Sprintf("len(s.keys) = %d, l = %d", len(s.keys), l))
assert(isSortedKeys(s.keys), "keys are not sorted")
assert(func() bool {
for _, k := range ks {
if _, found := findInSortedSlice(s.keys, k); found {
return false
}
}
return true
}(), "Keys to delete still present")
if len(s.keys) != l {
log.Debug().Msgf("Delete: Some keys not found: len(s.keys) = %d, l = %d", len(s.keys), l)
}
return nil
}
func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error) {
pbKeys := make([]*pb.StoresKey, 0, len(opts.Keys))
pbValues := make([]*pb.StoresValue, 0, len(opts.Keys))
ks := sortIntoKeySlicese(opts.Keys)
if len(s.keys) == 0 {
log.Debug().Msgf("Get: No keys in store")
}
if s.keyLen == -1 {
s.keyLen = len(opts.Keys[0].Floats)
} else {
if len(opts.Keys[0].Floats) != s.keyLen {
return pb.StoresGetResult{}, fmt.Errorf("Try to get a key with length %d when existing length is %d", len(opts.Keys[0].Floats), s.keyLen)
}
}
tail_k := s.keys
tail_v := s.values
for i, k := range ks {
j, found := findInSortedSlice(tail_k, k)
if found {
pbKeys = append(pbKeys, &pb.StoresKey{
Floats: k,
})
pbValues = append(pbValues, &pb.StoresValue{
Bytes: tail_v[j],
})
tail_k = tail_k[j+1:]
tail_v = tail_v[j+1:]
} else {
assert(!hasKey(s.keys, k), fmt.Sprintf("Key exists, but was not found: i=%d, %v", i, k))
}
}
if len(pbKeys) != len(opts.Keys) {
log.Debug().Msgf("Get: Some keys not found: len(pbKeys) = %d, len(opts.Keys) = %d, len(s.Keys) = %d", len(pbKeys), len(opts.Keys), len(s.keys))
}
return pb.StoresGetResult{
Keys: pbKeys,
Values: pbValues,
}, nil
}
func isNormalized(k []float32) bool {
var sum float32
for _, v := range k {
sum += v
}
return sum == 1.0
}
// TODO: We could replace this with handwritten SIMD code
func normalizedCosineSimilarity(k1, k2 []float32) float32 {
assert(len(k1) == len(k2), fmt.Sprintf("normalizedCosineSimilarity: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
var dot float32
for i := 0; i < len(k1); i++ {
dot += k1[i] * k2[i]
}
assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
// For unit vectors, 2.0 * (1.0 - dot) would be the squared Euclidean distance
return dot
}
type PriorityItem struct {
Similarity float32
Key []float32
Value []byte
}
type PriorityQueue []*PriorityItem
func (pq PriorityQueue) Len() int { return len(pq) }
func (pq PriorityQueue) Less(i, j int) bool {
// Inverted because the most similar should be at the top
return pq[i].Similarity < pq[j].Similarity
}
func (pq PriorityQueue) Swap(i, j int) {
pq[i], pq[j] = pq[j], pq[i]
}
func (pq *PriorityQueue) Push(x any) {
item := x.(*PriorityItem)
*pq = append(*pq, item)
}
func (pq *PriorityQueue) Pop() any {
old := *pq
n := len(old)
item := old[n-1]
*pq = old[0 : n-1]
return item
}
func (s *Store) StoresFindNormalized(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
tk := opts.Key.Floats
top_ks := make(PriorityQueue, 0, int(opts.TopK))
heap.Init(&top_ks)
for i, k := range s.keys {
sim := normalizedCosineSimilarity(tk, k)
heap.Push(&top_ks, &PriorityItem{
Similarity: sim,
Key: k,
Value: s.values[i],
})
if top_ks.Len() > int(opts.TopK) {
heap.Pop(&top_ks)
}
}
similarities := make([]float32, top_ks.Len())
pbKeys := make([]*pb.StoresKey, top_ks.Len())
pbValues := make([]*pb.StoresValue, top_ks.Len())
for i := top_ks.Len() - 1; i >= 0; i-- {
item := heap.Pop(&top_ks).(*PriorityItem)
similarities[i] = item.Similarity
pbKeys[i] = &pb.StoresKey{
Floats: item.Key,
}
pbValues[i] = &pb.StoresValue{
Bytes: item.Value,
}
}
return pb.StoresFindResult{
Keys: pbKeys,
Values: pbValues,
Similarities: similarities,
}, nil
}
func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
assert(len(k1) == len(k2), fmt.Sprintf("cosineSimilarity: len(k1) = %d, len(k2) = %d", len(k1), len(k2)))
var dot, mag2 float64
for i := 0; i < len(k1); i++ {
dot += float64(k1[i] * k2[i])
mag2 += float64(k2[i] * k2[i])
}
sim := float32(dot / (mag1 * math.Sqrt(mag2)))
assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
return sim
}
func (s *Store) StoresFindFallback(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
tk := opts.Key.Floats
top_ks := make(PriorityQueue, 0, int(opts.TopK))
heap.Init(&top_ks)
var mag1 float64
for _, v := range tk {
mag1 += float64(v * v)
}
mag1 = math.Sqrt(mag1)
for i, k := range s.keys {
dist := cosineSimilarity(tk, k, mag1)
heap.Push(&top_ks, &PriorityItem{
Similarity: dist,
Key: k,
Value: s.values[i],
})
if top_ks.Len() > int(opts.TopK) {
heap.Pop(&top_ks)
}
}
similarities := make([]float32, top_ks.Len())
pbKeys := make([]*pb.StoresKey, top_ks.Len())
pbValues := make([]*pb.StoresValue, top_ks.Len())
for i := top_ks.Len() - 1; i >= 0; i-- {
item := heap.Pop(&top_ks).(*PriorityItem)
similarities[i] = item.Similarity
pbKeys[i] = &pb.StoresKey{
Floats: item.Key,
}
pbValues[i] = &pb.StoresValue{
Bytes: item.Value,
}
}
return pb.StoresFindResult{
Keys: pbKeys,
Values: pbValues,
Similarities: similarities,
}, nil
}
func (s *Store) StoresFind(opts *pb.StoresFindOptions) (pb.StoresFindResult, error) {
tk := opts.Key.Floats
if len(tk) != s.keyLen {
return pb.StoresFindResult{}, fmt.Errorf("Try to find key with length %d when existing length is %d", len(tk), s.keyLen)
}
if opts.TopK < 1 {
return pb.StoresFindResult{}, fmt.Errorf("opts.TopK = %d, must be >= 1", opts.TopK)
}
if s.keyLen == -1 {
s.keyLen = len(opts.Key.Floats)
} else {
if len(opts.Key.Floats) != s.keyLen {
return pb.StoresFindResult{}, fmt.Errorf("Try to add key with length %d when existing length is %d", len(opts.Key.Floats), s.keyLen)
}
}
if s.keysAreNormalized && isNormalized(tk) {
return s.StoresFindNormalized(opts)
} else {
if s.keysAreNormalized {
var sample []float32
if len(s.keys) > 5 {
sample = tk[:5]
} else {
sample = tk
}
log.Debug().Msgf("Trying to compare non-normalized key with normalized keys: %v", sample)
}
return s.StoresFindFallback(opts)
}
}
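The two similarity paths above rely on a small identity: when every key has unit Euclidean norm (the ||k||^2 = 1 condition tracked by keysAreNormalized), cosine similarity reduces to a plain dot product, and the squared Euclidean distance equals 2 * (1 - dot), so ranking by dot product and ranking by distance agree. Below is a small, self-contained illustration of the unit-norm check and this shortcut; it is not part of the changeset.

package main

import (
	"fmt"
	"math"
)

// isUnit reports whether v has (approximately) unit Euclidean norm,
// i.e. whether the sum of squared components is 1.
func isUnit(v []float32) bool {
	var ss float64
	for _, x := range v {
		ss += float64(x) * float64(x)
	}
	return math.Abs(ss-1.0) < 1e-6
}

// dot returns the dot product, which equals the cosine similarity
// when both inputs are unit vectors.
func dot(a, b []float32) float32 {
	var d float32
	for i := range a {
		d += a[i] * b[i]
	}
	return d
}

func main() {
	a := []float32{1, 0, 0}
	b := []float32{float32(math.Sqrt2) / 2, float32(math.Sqrt2) / 2, 0}

	fmt.Println(isUnit(a), isUnit(b)) // true true
	cos := dot(a, b)                  // roughly 0.7071
	dist2 := 2 * (1 - cos)            // squared Euclidean distance between a and b
	fmt.Println(cos, dist2)
}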


@@ -5,12 +5,14 @@ import signal
import sys
import os
import time
import base64
import grpc
import backend_pb2
import backend_pb2_grpc
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextGenerationPipeline
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -28,9 +30,18 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.Device != "":
device = request.Device
tokenizer = AutoTokenizer.from_pretrained(request.Model, use_fast=request.UseFastTokenizer)
# support loading local model files
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
model = AutoGPTQForCausalLM.from_quantized(request.Model,
# support model `Qwen/Qwen-VL-Chat-Int4`
if "qwen-vl" in request.Model.lower():
self.model_name = "Qwen-VL-Chat"
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=request.TrustRemoteCode,
device_map="auto").eval()
else:
model = AutoGPTQForCausalLM.from_quantized(model_path,
model_basename=request.ModelBaseName,
use_safetensors=True,
trust_remote_code=request.TrustRemoteCode,
@@ -55,6 +66,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.TopP != 0.0:
top_p = request.TopP
prompt_images = self.recompile_vl_prompt(request)
compiled_prompt = prompt_images[0]
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
# Implement Predict RPC
pipeline = TextGenerationPipeline(
model=self.model,
@@ -64,10 +80,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
top_p=top_p,
repetition_penalty=penalty,
)
t = pipeline(request.Prompt)[0]["generated_text"]
# Remove prompt from response if present
if request.Prompt in t:
t = t.replace(request.Prompt, "")
t = pipeline(compiled_prompt)[0]["generated_text"]
print(f"generated_text: {t}", file=sys.stderr)
if compiled_prompt in t:
t = t.replace(compiled_prompt, "")
# Housekeeping: remove the image files from the /tmp folder
for img_path in prompt_images[1]:
try:
os.remove(img_path)
except Exception as e:
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
@@ -78,6 +101,24 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# Not implemented yet
return self.Predict(request, context)
def recompile_vl_prompt(self, request):
prompt = request.Prompt
image_paths = []
if "qwen-vl" in self.model_name.lower():
# request.Images is an array of base64-encoded images. Decode each one and save it to the
# /tmp folder with a timestamped filename, collecting the file paths in "image_paths".
# Then rewrite "request.Prompt", replacing each "[img-%d]" placeholder with the corresponding
# image path wrapped in <img></img> tags, and store the result in "prompt".
for i, img in enumerate(request.Images):
timestamp = str(int(time.time() * 1000)) # Generate timestamp
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
with open(img_path, "wb") as f:
f.write(base64.b64decode(img))
image_paths.append(img_path)
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
else:
prompt = request.Prompt
return (prompt, image_paths)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))


@@ -1,3 +1,7 @@
####
# Attention! This file is abandoned.
# Please use the ../common-env/transformers/transformers.yml file to manage dependencies.
###
name: autogptq
channels:
- defaults
@@ -24,12 +28,12 @@ dependencies:
- xz=5.4.2=h5eee18b_0
- zlib=1.2.13=h5eee18b_0
- pip:
- accelerate==0.23.0
- accelerate==0.27.0
- aiohttp==3.8.5
- aiosignal==1.3.1
- async-timeout==4.0.3
- attrs==23.1.0
- auto-gptq==0.4.2
- auto-gptq==0.7.1
- certifi==2023.7.22
- charset-normalizer==3.3.0
- datasets==2.14.5
@@ -59,6 +63,7 @@ dependencies:
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- optimum==1.17.1
- packaging==23.2
- pandas==2.1.1
- peft==0.5.0
@@ -75,9 +80,11 @@ dependencies:
- six==1.16.0
- sympy==1.12
- tokenizers==0.14.0
- torch==2.1.0
- tqdm==4.66.1
- torch==2.2.1
- torchvision==0.17.1
- transformers==4.34.0
- transformers_stream_generator==0.0.5
- triton==2.1.0
- typing-extensions==4.8.0
- tzdata==2023.3


@@ -25,7 +25,7 @@ if [ -d "/opt/intel" ]; then
# Intel GPU: If the directory exists, we assume we are using the intel image
# (no conda env)
# https://github.com/intel/intel-extension-for-pytorch/issues/538
pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed
pip install intel-extension-for-transformers datasets sentencepiece tiktoken neural_speed optimum[openvino]
fi
if [ "$PIP_CACHE_PURGE" = true ] ; then


@@ -24,10 +24,11 @@ dependencies:
- xz=5.4.2=h5eee18b_0
- zlib=1.2.13=h5eee18b_0
- pip:
- accelerate==0.23.0
- accelerate==0.27.0
- aiohttp==3.8.5
- aiosignal==1.3.1
- async-timeout==4.0.3
- auto-gptq==0.7.1
- attrs==23.1.0
- bark==0.1.5
- bitsandbytes==0.43.0
@@ -69,6 +70,7 @@ dependencies:
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- optimum==1.17.1
- packaging==23.2
- pandas
- peft==0.5.0
@@ -88,6 +90,7 @@ dependencies:
- sympy==1.12
- tokenizers
- torch==2.1.2
- torchvision==0.16.2
- torchaudio==2.1.2
- tqdm==4.66.1
- triton==2.1.0
@@ -95,7 +98,6 @@ dependencies:
- tzdata==2023.3
- urllib3==1.26.17
- xxhash==3.4.1
- auto-gptq==0.6.0
- yarl==1.9.2
- soundfile
- langid
@@ -116,5 +118,6 @@ dependencies:
- vocos
- vllm==0.3.2
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
prefix: /opt/conda/envs/transformers


@@ -26,7 +26,8 @@ dependencies:
- pip:
- --pre
- --extra-index-url https://download.pytorch.org/whl/nightly/
- accelerate==0.23.0
- accelerate==0.27.0
- auto-gptq==0.7.1
- aiohttp==3.8.5
- aiosignal==1.3.1
- async-timeout==4.0.3
@@ -82,7 +83,6 @@ dependencies:
- triton==2.1.0
- typing-extensions==4.8.0
- tzdata==2023.3
- auto-gptq==0.6.0
- urllib3==1.26.17
- xxhash==3.4.1
- yarl==1.9.2
@@ -90,6 +90,7 @@ dependencies:
- langid
- wget
- unidecode
- optimum==1.17.1
- pyopenjtalk-prebuilt
- pypinyin
- inflect
@@ -105,5 +106,6 @@ dependencies:
- vocos
- vllm==0.3.2
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
prefix: /opt/conda/envs/transformers


@@ -24,15 +24,17 @@ dependencies:
- xz=5.4.2=h5eee18b_0
- zlib=1.2.13=h5eee18b_0
- pip:
- accelerate==0.23.0
- accelerate==0.27.0
- aiohttp==3.8.5
- aiosignal==1.3.1
- auto-gptq==0.7.1
- async-timeout==4.0.3
- attrs==23.1.0
- bark==0.1.5
- boto3==1.28.61
- botocore==1.31.61
- certifi==2023.7.22
- coloredlogs==15.0.1
- TTS==0.22.0
- charset-normalizer==3.3.0
- datasets==2.14.5
@@ -47,6 +49,7 @@ dependencies:
- funcy==2.0
- grpcio==1.59.0
- huggingface-hub
- humanfriendly==10.0
- idna==3.4
- jinja2==3.1.2
- jmespath==1.0.1
@@ -56,6 +59,10 @@ dependencies:
- multiprocess==0.70.15
- networkx
- numpy==1.26.0
- onnx==1.15.0
- openvino==2024.0.0
- openvino-telemetry==2023.2.1
- optimum[openvino]==1.17.1
- packaging==23.2
- pandas
- peft==0.5.0
@@ -75,12 +82,12 @@ dependencies:
- sympy==1.12
- tokenizers
- torch==2.1.2
- torchvision==0.16.2
- torchaudio==2.1.2
- tqdm==4.66.1
- triton==2.1.0
- typing-extensions==4.8.0
- tzdata==2023.3
- auto-gptq==0.6.0
- urllib3==1.26.17
- xxhash==3.4.1
- yarl==1.9.2
@@ -103,5 +110,6 @@ dependencies:
- vocos
- vllm==0.3.2
- transformers>=4.38.2 # Updated Version
- transformers_stream_generator==0.0.5
- xformers==0.0.23.post1
prefix: /opt/conda/envs/transformers


@@ -8,6 +8,8 @@ import argparse
import signal
import sys
import os
from threading import Thread
import asyncio
import time
import backend_pb2
@@ -17,13 +19,12 @@ import grpc
import torch
import torch.cuda
XPU=os.environ.get("XPU", "0") == "1"
if XPU:
import intel_extension_for_pytorch as ipex
from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModel, set_seed
from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer
else:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig, TextIteratorStreamer
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -81,6 +82,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
compute=torch.bfloat16
self.CUDA = request.CUDA
self.OV=False
device_map="cpu"
@@ -105,23 +107,61 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
bnb_4bit_compute_dtype = None,
load_in_8bit=True,
)
try:
if request.Type == "AutoModelForCausalLM":
if XPU:
if quantization == "xpu_4bit":
import intel_extension_for_pytorch as ipex
from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
device_map="xpu"
compute=torch.float16
if request.Quantization == "xpu_4bit":
xpu_4bit = True
self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
device_map="xpu", load_in_4bit=xpu_4bit)
xpu_8bit = False
elif request.Quantization == "xpu_8bit":
xpu_4bit = False
xpu_8bit = True
else:
xpu_4bit = False
xpu_8bit = False
self.model = AutoModelForCausalLM.from_pretrained(model_name,
trust_remote_code=request.TrustRemoteCode,
use_safetensors=True,
device_map=device_map,
load_in_4bit=xpu_4bit,
load_in_8bit=xpu_8bit,
torch_dtype=compute)
else:
self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
self.model = AutoModelForCausalLM.from_pretrained(model_name,
trust_remote_code=request.TrustRemoteCode,
use_safetensors=True,
quantization_config=quantization,
device_map=device_map,
torch_dtype=compute)
elif request.Type == "OVModelForCausalLM":
from optimum.intel.openvino import OVModelForCausalLM
from openvino.runtime import Core
if "GPU" in Core().available_devices:
device_map="GPU"
else:
device_map="CPU"
self.model = OVModelForCausalLM.from_pretrained(model_name,
compile=True,
device=device_map)
self.OV = True
else:
self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode, use_safetensors=True, quantization_config=quantization, device_map=device_map, torch_dtype=compute)
self.model = AutoModel.from_pretrained(model_name,
trust_remote_code=request.TrustRemoteCode,
use_safetensors=True,
quantization_config=quantization,
device_map=device_map,
torch_dtype=compute)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
self.XPU = False
if XPU:
if XPU and self.OV == False:
self.XPU = True
try:
print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
@@ -130,6 +170,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
print("Not using XPU:", err, file=sys.stderr)
except Exception as err:
print("Error:", err, file=sys.stderr)
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service
# Replace this with your desired response
@@ -167,7 +208,72 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
print("Embeddings:", sentence_embeddings, file=sys.stderr)
return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
def Predict(self, request, context):
async def _predict(self, request, context, streaming=False):
set_seed(request.Seed)
if request.TopP == 0:
request.TopP = 0.9
max_tokens = 200
if request.Tokens > 0:
max_tokens = request.Tokens
inputs = self.tokenizer(request.Prompt, return_tensors="pt")
if self.CUDA:
inputs = inputs.to("cuda")
if XPU and self.OV == False:
inputs = inputs.to("xpu")
streaming = False
if streaming:
streamer=TextIteratorStreamer(self.tokenizer,
skip_prompt=True,
skip_special_tokens=True)
config=dict(inputs,
max_new_tokens=max_tokens,
temperature=request.Temperature,
top_p=request.TopP,
top_k=request.TopK,
do_sample=True,
attention_mask=inputs["attention_mask"],
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.eos_token_id,
streamer=streamer)
thread=Thread(target=self.model.generate, kwargs=config)
thread.start()
generated_text = ""
try:
for new_text in streamer:
generated_text += new_text
yield backend_pb2.Reply(message=bytes(new_text, encoding='utf-8'))
finally:
thread.join()
else:
if XPU and self.OV == False:
outputs = self.model.generate(inputs["input_ids"],
max_new_tokens=max_tokens,
temperature=request.Temperature,
top_p=request.TopP,
top_k=request.TopK,
do_sample=True,
pad_token=self.tokenizer.eos_token_id)
else:
outputs = self.model.generate(inputs["input_ids"],
max_new_tokens=max_tokens,
temperature=request.Temperature,
top_p=request.TopP,
top_k=request.TopK,
do_sample=True,
attention_mask=inputs["attention_mask"],
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.eos_token_id)
generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
if streaming:
return
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
async def Predict(self, request, context):
"""
Generates text based on the given prompt and sampling parameters.
@@ -178,26 +284,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns:
backend_pb2.Reply: The predict result.
"""
set_seed(request.Seed)
if request.TopP == 0:
request.TopP = 0.9
gen = self._predict(request, context, streaming=False)
res = await gen.__anext__()
return res
max_tokens = 200
if request.Tokens > 0:
max_tokens = request.Tokens
inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
if self.CUDA:
inputs = inputs.to("cuda")
if XPU:
inputs = inputs.to("xpu")
outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP, do_sample=True, pad_token_id=self.tokenizer.eos_token_id)
generated_text = self.tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
def PredictStream(self, request, context):
async def PredictStream(self, request, context):
"""
Generates text based on the given prompt and sampling parameters, and streams the results.
@@ -208,31 +299,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns:
backend_pb2.Result: The predict stream result.
"""
yield self.Predict(request, context)
iterations = self._predict(request, context, streaming=True)
try:
async for iteration in iterations:
yield iteration
finally:
await iterations.aclose()
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
async def serve(address):
# Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
# Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address
server.add_insecure_port(address)
server.start()
# Gracefully shutdown the server on SIGTERM or SIGINT
loop = asyncio.get_event_loop()
for sig in (signal.SIGINT, signal.SIGTERM):
loop.add_signal_handler(
sig, lambda: asyncio.ensure_future(server.stop(5))
)
# Start the server
await server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
# Wait for the server to be terminated
await server.wait_for_termination()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
@@ -241,4 +334,4 @@ if __name__ == "__main__":
)
args = parser.parse_args()
serve(args.addr)
asyncio.run(serve(args.addr))


@@ -1,6 +1,7 @@
package backend
import (
"math/rand"
"os"
"path/filepath"
@@ -33,12 +34,20 @@ func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []mode
return opts
}
func getSeed(c config.BackendConfig) int32 {
seed := int32(*c.Seed)
if seed == config.RAND_SEED {
seed = rand.Int31()
}
return seed
}
func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
b := 512
if c.Batch != 0 {
b = c.Batch
}
return &pb.ModelOptions{
CUDA: c.CUDA || c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,
@@ -54,7 +63,7 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
CLIPSkip: int32(c.Diffusers.ClipSkip),
ControlNet: c.Diffusers.ControlNet,
ContextSize: int32(*c.ContextSize),
Seed: int32(*c.Seed),
Seed: getSeed(c),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
DraftModel: c.DraftModel,
@@ -129,13 +138,13 @@ func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOption
NKeep: int32(c.Keep),
Batch: int32(c.Batch),
IgnoreEOS: c.IgnoreEOS,
Seed: int32(*c.Seed),
Seed: getSeed(c),
FrequencyPenalty: float32(c.FrequencyPenalty),
MLock: *c.MMlock,
MMap: *c.MMap,
MainGPU: c.MainGPU,
TensorSplit: c.TensorSplit,
TailFreeSamplingZ: float32(c.TFZ),
TypicalP: float32(c.TypicalP),
TailFreeSamplingZ: float32(*c.TFZ),
TypicalP: float32(*c.TypicalP),
}
}

core/backend/stores.go (new file, 23 lines)

@@ -0,0 +1,23 @@
package backend
import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/model"
)
func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string) (grpc.Backend, error) {
if storeName == "" {
storeName = "default"
}
sc := []model.Option{
model.WithBackendString(model.LocalStoreBackend),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithModel(storeName),
}
return sl.BackendLoader(sc...)
}


@@ -15,11 +15,13 @@ type ApplicationConfig struct {
ConfigFile string
ModelPath string
UploadLimitMB, Threads, ContextSize int
DisableWelcomePage bool
F16 bool
Debug, DisableMessage bool
ImageDir string
AudioDir string
UploadDir string
ConfigsDir string
CORS bool
PreloadJSONModels string
PreloadModelsFromPath string
@@ -104,6 +106,10 @@ var EnableWatchDogBusyCheck = func(o *ApplicationConfig) {
o.WatchDogBusy = true
}
var DisableWelcomePage = func(o *ApplicationConfig) {
o.DisableWelcomePage = true
}
func SetWatchDogBusyTimeout(t time.Duration) AppOption {
return func(o *ApplicationConfig) {
o.WatchDogBusyTimeout = t
@@ -163,7 +169,7 @@ func WithStringGalleries(galls string) AppOption {
}
var galleries []gallery.Gallery
if err := json.Unmarshal([]byte(galls), &galleries); err != nil {
log.Error().Msgf("failed loading galleries: %s", err.Error())
log.Error().Err(err).Msg("failed loading galleries")
}
o.Galleries = append(o.Galleries, galleries...)
}
@@ -252,12 +258,33 @@ func WithUploadDir(uploadDir string) AppOption {
}
}
func WithConfigsDir(configsDir string) AppOption {
return func(o *ApplicationConfig) {
o.ConfigsDir = configsDir
}
}
func WithApiKeys(apiKeys []string) AppOption {
return func(o *ApplicationConfig) {
o.ApiKeys = apiKeys
}
}
// ToConfigLoaderOptions returns a slice of ConfigLoaderOption.
// Options defined at the application level, such as the context size or the number of threads,
// are passed down as defaults for all model configurations.
// If a model config file does not set these options explicitly, the defaults defined here are used.
func (o *ApplicationConfig) ToConfigLoaderOptions() []ConfigLoaderOption {
return []ConfigLoaderOption{
LoadOptionContextSize(o.ContextSize),
LoadOptionDebug(o.Debug),
LoadOptionF16(o.F16),
LoadOptionThreads(o.Threads),
}
}
// func WithMetrics(meter *metrics.Metrics) AppOption {
// return func(o *StartupOptions) {
// o.Metrics = meter


@@ -4,9 +4,9 @@ import (
"errors"
"fmt"
"io/fs"
"math/rand"
"os"
"path/filepath"
"sort"
"strings"
"sync"
@@ -19,6 +19,10 @@ import (
"github.com/charmbracelet/glamour"
)
const (
RAND_SEED = -1
)
type BackendConfig struct {
schema.PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`
@@ -185,17 +189,32 @@ func (c *BackendConfig) ShouldCallSpecificFunction() bool {
}
func (c *BackendConfig) FunctionToCall() string {
return c.functionCallNameString
if c.functionCallNameString != "" &&
c.functionCallNameString != "none" && c.functionCallNameString != "auto" {
return c.functionCallNameString
}
return c.functionCallString
}
func (cfg *BackendConfig) SetDefaults(debug bool, threads, ctx int, f16 bool) {
defaultTopP := 0.7
defaultTopK := 80
func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
lo := &LoadOptions{}
lo.Apply(opts...)
ctx := lo.ctxSize
threads := lo.threads
f16 := lo.f16
debug := lo.debug
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
defaultTopP := 0.95
defaultTopK := 40
defaultTemp := 0.9
defaultMaxTokens := 2048
defaultMirostat := 2
defaultMirostatTAU := 5.0
defaultMirostatETA := 0.1
defaultTypicalP := 1.0
defaultTFZ := 1.0
// Try to offload all GPU layers (if GPU is found)
defaultNGPULayers := 99999999
@@ -205,7 +224,7 @@ func (cfg *BackendConfig) SetDefaults(debug bool, threads, ctx int, f16 bool) {
if cfg.Seed == nil {
// random number generator seed
defaultSeed := int(rand.Int31())
defaultSeed := RAND_SEED
cfg.Seed = &defaultSeed
}
@@ -213,6 +232,14 @@ func (cfg *BackendConfig) SetDefaults(debug bool, threads, ctx int, f16 bool) {
cfg.TopK = &defaultTopK
}
if cfg.TypicalP == nil {
cfg.TypicalP = &defaultTypicalP
}
if cfg.TFZ == nil {
cfg.TFZ = &defaultTFZ
}
if cfg.MMap == nil {
// MMap is enabled by default
cfg.MMap = &trueV
@@ -276,8 +303,12 @@ func (cfg *BackendConfig) SetDefaults(debug bool, threads, ctx int, f16 bool) {
cfg.F16 = &f16
}
if cfg.Debug == nil {
cfg.Debug = &falseV
}
if debug {
cfg.Debug = &debug
cfg.Debug = &trueV
}
}
@@ -329,9 +360,6 @@ func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
// Load a config file for a model
func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
lo := &LoadOptions{}
lo.Apply(opts...)
// Load a config file if present after the model name
cfg := &BackendConfig{
PredictionOptions: schema.PredictionOptions{
@@ -346,7 +374,9 @@ func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
// Try loading a model config file
modelConfig := filepath.Join(modelPath, modelName+".yaml")
if _, err := os.Stat(modelConfig); err == nil {
if err := cl.LoadBackendConfig(modelConfig); err != nil {
if err := cl.LoadBackendConfig(
modelConfig, opts...,
); err != nil {
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
}
cfgExisting, exists = cl.GetBackendConfig(modelName)
@@ -356,7 +386,7 @@ func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
}
}
cfg.SetDefaults(lo.debug, lo.threads, lo.ctxSize, lo.f16)
cfg.SetDefaults(opts...)
return cfg, nil
}
@@ -367,9 +397,6 @@ func NewBackendConfigLoader() *BackendConfigLoader {
}
}
func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
lo := &LoadOptions{}
lo.Apply(opts...)
c := &[]*BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
@@ -380,7 +407,7 @@ func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendC
}
for _, cc := range *c {
cc.SetDefaults(lo.debug, lo.threads, lo.ctxSize, lo.f16)
cc.SetDefaults(opts...)
}
return *c, nil
@@ -399,7 +426,7 @@ func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig,
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
c.SetDefaults(lo.debug, lo.threads, lo.ctxSize, lo.f16)
c.SetDefaults(opts...)
return c, nil
}
@@ -443,6 +470,11 @@ func (cl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
for _, v := range cl.configs {
res = append(res, v)
}
sort.SliceStable(res, func(i, j int) bool {
return res[i].Name < res[j].Name
})
return res
}


@@ -6,6 +6,9 @@ import (
"os"
"strings"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/gofiber/swagger" // swagger handler
"github.com/go-skynet/LocalAI/core/http/endpoints/elevenlabs"
"github.com/go-skynet/LocalAI/core/http/endpoints/localai"
"github.com/go-skynet/LocalAI/core/http/endpoints/openai"
@@ -40,9 +43,23 @@ func readAuthHeader(c *fiber.Ctx) string {
return authHeader
}
// @title LocalAI API
// @version 2.0.0
// @description The LocalAI Rest API.
// @termsOfService
// @contact.name LocalAI
// @contact.url https://localai.io
// @license.name MIT
// @license.url https://raw.githubusercontent.com/mudler/LocalAI/master/LICENSE
// @BasePath /
// @securityDefinitions.apikey BearerAuth
// @in header
// @name Authorization
func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*fiber.App, error) {
// Return errors as JSON responses
app := fiber.New(fiber.Config{
Views: renderEngine(),
BodyLimit: appConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
DisableStartupMessage: appConfig.DisableMessage,
// Override default error handler
@@ -155,8 +172,27 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
}{Version: internal.PrintableVersion()})
})
// Load upload json
openai.LoadUploadConfig(appConfig.UploadDir)
// Make sure directories exists
os.MkdirAll(appConfig.ImageDir, 0755)
os.MkdirAll(appConfig.AudioDir, 0755)
os.MkdirAll(appConfig.UploadDir, 0755)
os.MkdirAll(appConfig.ConfigsDir, 0755)
os.MkdirAll(appConfig.ModelPath, 0755)
// Load config jsons
utils.LoadConfig(appConfig.UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles)
utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants)
utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles)
app.Get("/swagger/*", swagger.HandlerDefault) // default
welcomeRoute(
app,
cl,
ml,
appConfig,
auth,
)
modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
app.Post("/models/apply", auth, modelGalleryEndpointService.ApplyModelGalleryEndpoint())
@@ -172,6 +208,13 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
// Elevenlabs
app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
// Stores
sl := model.NewModelLoader("")
app.Post("/stores/set", auth, localai.StoresSetEndpoint(sl, appConfig))
app.Post("/stores/delete", auth, localai.StoresDeleteEndpoint(sl, appConfig))
app.Post("/stores/get", auth, localai.StoresGetEndpoint(sl, appConfig))
app.Post("/stores/find", auth, localai.StoresFindEndpoint(sl, appConfig))
// openAI compatible API endpoint
// chat
@@ -182,6 +225,26 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
app.Post("/v1/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
app.Post("/edits", auth, openai.EditEndpoint(cl, ml, appConfig))
// assistant
app.Get("/v1/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
app.Get("/assistants", auth, openai.ListAssistantsEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants", auth, openai.CreateAssistantEndpoint(cl, ml, appConfig))
app.Delete("/v1/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id", auth, openai.DeleteAssistantEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id", auth, openai.GetAssistantEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id", auth, openai.ModifyAssistantEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files", auth, openai.ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Post("/v1/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id/files", auth, openai.CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/v1/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id/files/:file_id", auth, openai.DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/v1/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files/:file_id", auth, openai.GetAssistantFileEndpoint(cl, ml, appConfig))
// files
app.Post("/v1/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
app.Post("/files", auth, openai.UploadFilesEndpoint(cl, appConfig))
@@ -229,14 +292,18 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
// Experimental Backend Statistics Module
backendMonitor := services.NewBackendMonitor(cl, ml, appConfig) // Split out for now
app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitor))
app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitor))
app.Get("/backend/monitor", auth, localai.BackendMonitorEndpoint(backendMonitor))
app.Post("/backend/shutdown", auth, localai.BackendShutdownEndpoint(backendMonitor))
// models
app.Get("/v1/models", auth, openai.ListModelsEndpoint(cl, ml))
app.Get("/models", auth, openai.ListModelsEndpoint(cl, ml))
app.Get("/metrics", localai.LocalAIMetricsEndpoint())
app.Get("/metrics", auth, localai.LocalAIMetricsEndpoint())
// Define a custom 404 handler
// Note: keep this at the bottom!
app.Use(notFoundHandler)
return app, nil
}


@@ -15,6 +15,7 @@ import (
"github.com/go-skynet/LocalAI/core/config"
. "github.com/go-skynet/LocalAI/core/http"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/core/startup"
"github.com/go-skynet/LocalAI/pkg/downloader"
@@ -122,6 +123,75 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
return
}
func postRequestJSON[B any](url string, bodyJson *B) error {
payload, err := json.Marshal(bodyJson)
if err != nil {
return err
}
GinkgoWriter.Printf("POST %s: %s\n", url, string(payload))
req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return err
}
if resp.StatusCode < 200 || resp.StatusCode >= 400 {
return fmt.Errorf("unexpected status code: %d, body: %s", resp.StatusCode, string(body))
}
return nil
}
func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *B2) error {
payload, err := json.Marshal(reqJson)
if err != nil {
return err
}
GinkgoWriter.Printf("POST %s: %s\n", url, string(payload))
req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
client := &http.Client{}
resp, err := client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
return err
}
if resp.StatusCode < 200 || resp.StatusCode >= 400 {
return fmt.Errorf("unexpected status code: %d, body: %s", resp.StatusCode, string(body))
}
return json.Unmarshal(body, respJson)
}
//go:embed backend-assets/*
var backendAssets embed.FS
@@ -666,15 +736,15 @@ var _ = Describe("API test", func() {
Expect(err).ToNot(HaveOccurred())
Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
})
It("can generate completions", func() {
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: testPrompt})
It("can generate completions via ggml", func() {
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel.ggml", Prompt: testPrompt})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Text).ToNot(BeEmpty())
})
It("can generate chat completions ", func() {
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
It("can generate chat completions via ggml", func() {
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel.ggml", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: testPrompt}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
@@ -836,6 +906,78 @@ var _ = Describe("API test", func() {
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
})
})
// See tests/integration/stores_test
Context("Stores", Label("stores"), func() {
It("sets, gets, finds and deletes entries", func() {
ks := [][]float32{
{0.1, 0.2, 0.3},
{0.4, 0.5, 0.6},
{0.7, 0.8, 0.9},
}
vs := []string{
"test1",
"test2",
"test3",
}
setBody := schema.StoresSet{
Keys: ks,
Values: vs,
}
url := "http://127.0.0.1:9090/stores/"
err := postRequestJSON(url+"set", &setBody)
Expect(err).ToNot(HaveOccurred())
getBody := schema.StoresGet{
Keys: ks,
}
var getRespBody schema.StoresGetResponse
err = postRequestResponseJSON(url+"get", &getBody, &getRespBody)
Expect(err).ToNot(HaveOccurred())
Expect(len(getRespBody.Keys)).To(Equal(len(ks)))
for i, v := range getRespBody.Keys {
if v[0] == 0.1 {
Expect(getRespBody.Values[i]).To(Equal("test1"))
} else if v[0] == 0.4 {
Expect(getRespBody.Values[i]).To(Equal("test2"))
} else {
Expect(getRespBody.Values[i]).To(Equal("test3"))
}
}
deleteBody := schema.StoresDelete{
Keys: [][]float32{
{0.1, 0.2, 0.3},
},
}
err = postRequestJSON(url+"delete", &deleteBody)
Expect(err).ToNot(HaveOccurred())
findBody := schema.StoresFind{
Key: []float32{0.1, 0.3, 0.7},
Topk: 10,
}
var findRespBody schema.StoresFindResponse
err = postRequestResponseJSON(url+"find", &findBody, &findRespBody)
Expect(err).ToNot(HaveOccurred())
Expect(len(findRespBody.Keys)).To(Equal(2))
for i, v := range findRespBody.Keys {
if v[0] == 0.4 {
Expect(findRespBody.Values[i]).To(Equal("test2"))
} else {
Expect(findRespBody.Values[i]).To(Equal("test3"))
}
Expect(findRespBody.Similarities[i]).To(BeNumerically(">=", -1))
Expect(findRespBody.Similarities[i]).To(BeNumerically("<=", 1))
}
})
})
})
Context("Config file", func() {


@@ -11,6 +11,12 @@ import (
"github.com/rs/zerolog/log"
)
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
// @Summary Generates audio from the input text.
// @Param voice-id path string true "Account ID"
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/text-to-speech/{voice-id} [post]
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {


@@ -0,0 +1,121 @@
package localai
import (
"github.com/go-skynet/LocalAI/core/backend"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/store"
"github.com/gofiber/fiber/v2"
)
func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(schema.StoresSet)
if err := c.BodyParser(input); err != nil {
return err
}
sb, err := backend.StoreBackend(sl, appConfig, input.Store)
if err != nil {
return err
}
vals := make([][]byte, len(input.Values))
for i, v := range input.Values {
vals[i] = []byte(v)
}
err = store.SetCols(c.Context(), sb, input.Keys, vals)
if err != nil {
return err
}
return c.Send(nil)
}
}
func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(schema.StoresDelete)
if err := c.BodyParser(input); err != nil {
return err
}
sb, err := backend.StoreBackend(sl, appConfig, input.Store)
if err != nil {
return err
}
if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
return err
}
return c.Send(nil)
}
}
func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(schema.StoresGet)
if err := c.BodyParser(input); err != nil {
return err
}
sb, err := backend.StoreBackend(sl, appConfig, input.Store)
if err != nil {
return err
}
keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
if err != nil {
return err
}
res := schema.StoresGetResponse{
Keys: keys,
Values: make([]string, len(vals)),
}
for i, v := range vals {
res.Values[i] = string(v)
}
return c.JSON(res)
}
}
func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
input := new(schema.StoresFind)
if err := c.BodyParser(input); err != nil {
return err
}
sb, err := backend.StoreBackend(sl, appConfig, input.Store)
if err != nil {
return err
}
keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
if err != nil {
return err
}
res := schema.StoresFindResponse{
Keys: keys,
Values: make([]string, len(vals)),
Similarities: similarities,
}
for i, v := range vals {
res.Values[i] = string(v)
}
return c.JSON(res)
}
}
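These handlers parse the schema.StoresSet, StoresDelete, StoresGet and StoresFind request types straight from the JSON body, so a Go client can simply marshal the same structs, as the API test earlier in this diff does. A compact sketch, assuming LocalAI listens on the default port 8080 (the integration test uses 9090):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"

	"github.com/go-skynet/LocalAI/core/schema"
)

func post(url string, body any) (*http.Response, error) {
	payload, err := json.Marshal(body)
	if err != nil {
		return nil, err
	}
	return http.Post(url, "application/json", bytes.NewReader(payload))
}

func main() {
	base := "http://127.0.0.1:8080/stores/"

	// Store a couple of embedding-like keys with string payloads.
	if _, err := post(base+"set", schema.StoresSet{
		Keys:   [][]float32{{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}},
		Values: []string{"test1", "test2"},
	}); err != nil {
		panic(err)
	}

	// Find the stored keys closest to a query vector.
	resp, err := post(base+"find", schema.StoresFind{Key: []float32{0.1, 0.3, 0.7}, Topk: 2})
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var found schema.StoresFindResponse
	if err := json.NewDecoder(resp.Body).Decode(&found); err != nil {
		panic(err)
	}
	fmt.Println(found.Keys, found.Values, found.Similarities)
}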


@@ -11,6 +11,11 @@ import (
"github.com/rs/zerolog/log"
)
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
// @Summary Generates audio from the input text.
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/audio/speech [post]
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
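
As a usage illustration only, a small sketch of calling the speech route annotated above. Only the "voice" field is confirmed by the TTSRequest schema excerpt in this changeset; the other field names and all values are placeholders and assumptions, and the endpoint streams back the generated audio as a binary body.

package main

import (
    "bytes"
    "encoding/json"
    "io"
    "net/http"
    "os"
)

func main() {
    // "voice" appears in the TTSRequest schema in this changeset; the remaining
    // field names and all values here are placeholders/assumptions.
    payload := map[string]string{
        "model": "voice-model.onnx",
        "input": "Hello from LocalAI",
        "voice": "amy",
    }
    body, _ := json.Marshal(payload)

    resp, err := http.Post("http://localhost:8080/v1/audio/speech", "application/json", bytes.NewReader(body))
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    // The endpoint answers with the generated audio bytes.
    out, err := os.Create("speech.wav")
    if err != nil {
        panic(err)
    }
    defer out.Close()
    _, _ = io.Copy(out, resp.Body)
}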

View File

@@ -0,0 +1,521 @@
package openai
import (
"fmt"
"net/http"
"sort"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/go-skynet/LocalAI/core/config"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
)
// ToolType defines a type for tool options
type ToolType string
const (
CodeInterpreter ToolType = "code_interpreter"
Retrieval ToolType = "retrieval"
Function ToolType = "function"
MaxCharacterInstructions = 32768
MaxCharacterDescription = 512
MaxCharacterName = 256
MaxToolsSize = 128
MaxFileIdSize = 20
MaxCharacterMetadataKey = 64
MaxCharacterMetadataValue = 512
)
type Tool struct {
Type ToolType `json:"type"`
}
// Assistant represents the structure of an assistant object from the OpenAI API.
type Assistant struct {
ID string `json:"id"` // The unique identifier of the assistant.
Object string `json:"object"` // Object type, which is "assistant".
Created int64 `json:"created"` // The time at which the assistant was created.
Model string `json:"model"` // The model ID used by the assistant.
Name string `json:"name,omitempty"` // The name of the assistant.
Description string `json:"description,omitempty"` // The description of the assistant.
Instructions string `json:"instructions,omitempty"` // The system instructions that the assistant uses.
Tools []Tool `json:"tools,omitempty"` // A list of tools enabled on the assistant.
FileIDs []string `json:"file_ids,omitempty"` // A list of file IDs attached to this assistant.
Metadata map[string]string `json:"metadata,omitempty"` // Set of key-value pairs attached to the assistant.
}
var (
Assistants = []Assistant{} // better to return empty array instead of "null"
AssistantsConfigFile = "assistants.json"
)
type AssistantRequest struct {
Model string `json:"model"`
Name string `json:"name,omitempty"`
Description string `json:"description,omitempty"`
Instructions string `json:"instructions,omitempty"`
Tools []Tool `json:"tools,omitempty"`
FileIDs []string `json:"file_ids,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
}
// CreateAssistantEndpoint is the OpenAI Assistant API endpoint https://platform.openai.com/docs/api-reference/assistants/createAssistant
// @Summary Create an assistant with a model and instructions.
// @Param request body AssistantRequest true "query params"
// @Success 200 {object} Assistant "Response"
// @Router /v1/assistants [post]
func CreateAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
request := new(AssistantRequest)
if err := c.BodyParser(request); err != nil {
log.Warn().Err(err).Msg("Unable to parse AssistantRequest")
return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Cannot parse JSON"})
}
if !modelExists(ml, request.Model) {
log.Warn().Msgf("Model: %s was not found in list of models.", request.Model)
return c.Status(fiber.StatusBadRequest).SendString("Model " + request.Model + " not found")
}
if request.Tools == nil {
request.Tools = []Tool{}
}
if request.FileIDs == nil {
request.FileIDs = []string{}
}
if request.Metadata == nil {
request.Metadata = make(map[string]string)
}
id := "asst_" + strconv.FormatInt(generateRandomID(), 10)
assistant := Assistant{
ID: id,
Object: "assistant",
Created: time.Now().Unix(),
Model: request.Model,
Name: request.Name,
Description: request.Description,
Instructions: request.Instructions,
Tools: request.Tools,
FileIDs: request.FileIDs,
Metadata: request.Metadata,
}
Assistants = append(Assistants, assistant)
utils.SaveConfig(appConfig.ConfigsDir, AssistantsConfigFile, Assistants)
return c.Status(fiber.StatusOK).JSON(assistant)
}
}
var currentId int64 = 0
func generateRandomID() int64 {
atomic.AddInt64(&currentId, 1)
return currentId
}
func ListAssistantsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
// Note: this assignment copies only the slice header, so the sort below also reorders the global Assistants slice in place.
returnAssistants := Assistants
// Parse query parameters
limitQuery := c.Query("limit", "20")
orderQuery := c.Query("order", "desc")
afterQuery := c.Query("after")
beforeQuery := c.Query("before")
// Convert string limit to integer
limit, err := strconv.Atoi(limitQuery)
if err != nil {
return c.Status(http.StatusBadRequest).SendString(fmt.Sprintf("Invalid limit query value: %s", limitQuery))
}
// Sort assistants
sort.SliceStable(returnAssistants, func(i, j int) bool {
if orderQuery == "asc" {
return returnAssistants[i].Created < returnAssistants[j].Created
}
return returnAssistants[i].Created > returnAssistants[j].Created
})
// After and before cursors
if afterQuery != "" {
returnAssistants = filterAssistantsAfterID(returnAssistants, afterQuery)
}
if beforeQuery != "" {
returnAssistants = filterAssistantsBeforeID(returnAssistants, beforeQuery)
}
// Apply limit
if limit < len(returnAssistants) {
returnAssistants = returnAssistants[:limit]
}
return c.JSON(returnAssistants)
}
}
// filterAssistantsBeforeID returns only the assistants whose numeric ID comes before (is lower than) the given ID
// We assume that the assistants are already sorted
func filterAssistantsBeforeID(assistants []Assistant, id string) []Assistant {
idInt, err := strconv.Atoi(id)
if err != nil {
return assistants // Return original slice if invalid id format is provided
}
var filteredAssistants []Assistant
for _, assistant := range assistants {
aid, err := strconv.Atoi(strings.TrimPrefix(assistant.ID, "asst_"))
if err != nil {
continue // Skip if invalid id in assistant
}
if aid < idInt {
filteredAssistants = append(filteredAssistants, assistant)
}
}
return filteredAssistants
}
// filterAssistantsAfterID returns only the assistants whose numeric ID comes after (is higher than) the given ID
// We assume that the assistants are already sorted
func filterAssistantsAfterID(assistants []Assistant, id string) []Assistant {
idInt, err := strconv.Atoi(id)
if err != nil {
return assistants // Return original slice if invalid id format is provided
}
var filteredAssistants []Assistant
for _, assistant := range assistants {
aid, err := strconv.Atoi(strings.TrimPrefix(assistant.ID, "asst_"))
if err != nil {
continue // Skip if invalid id in assistant
}
if aid > idInt {
filteredAssistants = append(filteredAssistants, assistant)
}
}
return filteredAssistants
}
func modelExists(ml *model.ModelLoader, modelName string) (found bool) {
found = false
models, err := ml.ListModels()
if err != nil {
return
}
for _, model := range models {
if model == modelName {
found = true
return
}
}
return
}
func DeleteAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
type DeleteAssistantResponse struct {
ID string `json:"id"`
Object string `json:"object"`
Deleted bool `json:"deleted"`
}
return func(c *fiber.Ctx) error {
assistantID := c.Params("assistant_id")
if assistantID == "" {
return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
}
for i, assistant := range Assistants {
if assistant.ID == assistantID {
Assistants = append(Assistants[:i], Assistants[i+1:]...)
utils.SaveConfig(appConfig.ConfigsDir, AssistantsConfigFile, Assistants)
return c.Status(fiber.StatusOK).JSON(DeleteAssistantResponse{
ID: assistantID,
Object: "assistant.deleted",
Deleted: true,
})
}
}
log.Warn().Msgf("Unable to find assistant %s for deletion", assistantID)
return c.Status(fiber.StatusNotFound).JSON(DeleteAssistantResponse{
ID: assistantID,
Object: "assistant.deleted",
Deleted: false,
})
}
}
func GetAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
assistantID := c.Params("assistant_id")
if assistantID == "" {
return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
}
for _, assistant := range Assistants {
if assistant.ID == assistantID {
return c.Status(fiber.StatusOK).JSON(assistant)
}
}
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
}
}
type AssistantFile struct {
ID string `json:"id"`
Object string `json:"object"`
CreatedAt int64 `json:"created_at"`
AssistantID string `json:"assistant_id"`
}
var (
AssistantFiles []AssistantFile
AssistantsFileConfigFile = "assistantsFile.json"
)
type AssistantFileRequest struct {
FileID string `json:"file_id"`
}
type DeleteAssistantFileResponse struct {
ID string `json:"id"`
Object string `json:"object"`
Deleted bool `json:"deleted"`
}
func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
request := new(AssistantFileRequest)
if err := c.BodyParser(request); err != nil {
return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Cannot parse JSON"})
}
assistantID := c.Params("assistant_id")
if assistantID == "" {
return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
}
for _, assistant := range Assistants {
if assistant.ID == assistantID {
if len(assistant.FileIDs) > MaxFileIdSize {
return c.Status(fiber.StatusBadRequest).SendString(fmt.Sprintf("Max files %d for assistant %s reached.", MaxFileIdSize, assistant.Name))
}
for _, file := range UploadedFiles {
if file.ID == request.FileID {
assistant.FileIDs = append(assistant.FileIDs, request.FileID)
assistantFile := AssistantFile{
ID: file.ID,
Object: "assistant.file",
CreatedAt: time.Now().Unix(),
AssistantID: assistant.ID,
}
AssistantFiles = append(AssistantFiles, assistantFile)
utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
return c.Status(fiber.StatusOK).JSON(assistantFile)
}
}
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find file_id: %s", request.FileID))
}
}
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
}
}
func ListAssistantFilesEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
type ListAssistantFiles struct {
Data []File
Object string
}
return func(c *fiber.Ctx) error {
assistantID := c.Params("assistant_id")
if assistantID == "" {
return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
}
limitQuery := c.Query("limit", "20")
order := c.Query("order", "desc")
limit, err := strconv.Atoi(limitQuery)
if err != nil || limit < 1 || limit > 100 {
limit = 20 // Default to 20 if there's an error or the limit is out of bounds
}
// Sort files by CreatedAt depending on the order query parameter
if order == "asc" {
sort.Slice(AssistantFiles, func(i, j int) bool {
return AssistantFiles[i].CreatedAt < AssistantFiles[j].CreatedAt
})
} else { // default to "desc"
sort.Slice(AssistantFiles, func(i, j int) bool {
return AssistantFiles[i].CreatedAt > AssistantFiles[j].CreatedAt
})
}
// Limit the number of files returned
var limitedFiles []AssistantFile
hasMore := false
if len(AssistantFiles) > limit {
hasMore = true
limitedFiles = AssistantFiles[:limit]
} else {
limitedFiles = AssistantFiles
}
response := map[string]interface{}{
"object": "list",
"data": limitedFiles,
"first_id": func() string {
if len(limitedFiles) > 0 {
return limitedFiles[0].ID
}
return ""
}(),
"last_id": func() string {
if len(limitedFiles) > 0 {
return limitedFiles[len(limitedFiles)-1].ID
}
return ""
}(),
"has_more": hasMore,
}
return c.Status(fiber.StatusOK).JSON(response)
}
}
func ModifyAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
request := new(AssistantRequest)
if err := c.BodyParser(request); err != nil {
log.Warn().Err(err).Msg("Unable to parse AssistantRequest")
return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Cannot parse JSON"})
}
assistantID := c.Params("assistant_id")
if assistantID == "" {
return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
}
for i, assistant := range Assistants {
if assistant.ID == assistantID {
newAssistant := Assistant{
ID: assistantID,
Object: assistant.Object,
Created: assistant.Created,
Model: request.Model,
Name: request.Name,
Description: request.Description,
Instructions: request.Instructions,
Tools: request.Tools,
FileIDs: request.FileIDs, // todo: should probably verify fileids exist
Metadata: request.Metadata,
}
// Remove old one and replace with new one
Assistants = append(Assistants[:i], Assistants[i+1:]...)
Assistants = append(Assistants, newAssistant)
utils.SaveConfig(appConfig.ConfigsDir, AssistantsConfigFile, Assistants)
return c.Status(fiber.StatusOK).JSON(newAssistant)
}
}
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
}
}
func DeleteAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
assistantID := c.Params("assistant_id")
fileId := c.Params("file_id")
if assistantID == "" || fileId == "" {
return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id and file_id are required")
}
// First remove file from assistant
for i, assistant := range Assistants {
if assistant.ID == assistantID {
for j, fID := range assistant.FileIDs {
if fID == fileId {
Assistants[i].FileIDs = append(Assistants[i].FileIDs[:j], Assistants[i].FileIDs[j+1:]...)
// Check if the file exists in the assistantFiles slice
for i, assistantFile := range AssistantFiles {
if assistantFile.ID == fileId {
// Remove the file from the assistantFiles slice
AssistantFiles = append(AssistantFiles[:i], AssistantFiles[i+1:]...)
utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
return c.Status(fiber.StatusOK).JSON(DeleteAssistantFileResponse{
ID: fileId,
Object: "assistant.file.deleted",
Deleted: true,
})
}
}
}
}
log.Warn().Msgf("Unable to locate file_id: %s in assistants: %s. Continuing to delete assistant file.", fileId, assistantID)
for i, assistantFile := range AssistantFiles {
if assistantFile.AssistantID == assistantID {
AssistantFiles = append(AssistantFiles[:i], AssistantFiles[i+1:]...)
utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
return c.Status(fiber.StatusNotFound).JSON(DeleteAssistantFileResponse{
ID: fileId,
Object: "assistant.file.deleted",
Deleted: true,
})
}
}
}
}
log.Warn().Msgf("Unable to find assistant: %s", assistantID)
return c.Status(fiber.StatusNotFound).JSON(DeleteAssistantFileResponse{
ID: fileId,
Object: "assistant.file.deleted",
Deleted: false,
})
}
}
func GetAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
assistantID := c.Params("assistant_id")
fileId := c.Params("file_id")
if assistantID == "" || fileId == "" {
return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id and file_id are required")
}
for _, assistantFile := range AssistantFiles {
if assistantFile.AssistantID == assistantID {
if assistantFile.ID == fileId {
return c.Status(fiber.StatusOK).JSON(assistantFile)
}
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId))
}
}
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID))
}
}
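
To show the intended flow end to end, here is a hedged client sketch that creates an assistant against the /v1/assistants route declared in the Swagger annotations above; the base URL and the concrete field values are placeholders, and the request body mirrors AssistantRequest from this file.

package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

func main() {
    // Mirrors AssistantRequest from the handler above; values are placeholders.
    reqBody := map[string]any{
        "model":        "ggml-gpt4all-j",
        "name":         "homework-helper",
        "instructions": "You are a computer science teacher answering student questions",
        "tools":        []map[string]string{{"type": "function"}},
    }
    body, _ := json.Marshal(reqBody)

    resp, err := http.Post("http://localhost:8080/v1/assistants", "application/json", bytes.NewReader(body))
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    var created struct {
        ID      string `json:"id"`
        Object  string `json:"object"`
        Created int64  `json:"created"`
    }
    if err := json.NewDecoder(resp.Body).Decode(&created); err != nil {
        panic(err)
    }
    fmt.Printf("created %s (%s)\n", created.ID, created.Object)
}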

View File

@@ -0,0 +1,456 @@
package openai
import (
"encoding/json"
"fmt"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/stretchr/testify/assert"
"io"
"io/ioutil"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
)
var configsDir string = "/tmp/localai/configs"
type MockLoader struct {
models []string
}
func tearDown() func() {
return func() {
UploadedFiles = []File{}
Assistants = []Assistant{}
AssistantFiles = []AssistantFile{}
_ = os.Remove(filepath.Join(configsDir, AssistantsConfigFile))
_ = os.Remove(filepath.Join(configsDir, AssistantsFileConfigFile))
}
}
func TestAssistantEndpoints(t *testing.T) {
// Preparing the mocked objects
cl := &config.BackendConfigLoader{}
//configsDir := "/tmp/localai/configs"
modelPath := "/tmp/localai/model"
var ml = model.NewModelLoader(modelPath)
appConfig := &config.ApplicationConfig{
ConfigsDir: configsDir,
UploadLimitMB: 10,
UploadDir: "test_dir",
ModelPath: modelPath,
}
_ = os.RemoveAll(appConfig.ConfigsDir)
_ = os.MkdirAll(appConfig.ConfigsDir, 0755)
_ = os.MkdirAll(modelPath, 0755)
os.Create(filepath.Join(modelPath, "ggml-gpt4all-j"))
app := fiber.New(fiber.Config{
BodyLimit: 20 * 1024 * 1024, // sets the limit to 20MB.
})
// Create a Test Server
app.Get("/assistants", ListAssistantsEndpoint(cl, ml, appConfig))
app.Post("/assistants", CreateAssistantEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id", DeleteAssistantEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id", GetAssistantEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id", ModifyAssistantEndpoint(cl, ml, appConfig))
app.Post("/files", UploadFilesEndpoint(cl, appConfig))
app.Get("/assistants/:assistant_id/files", ListAssistantFilesEndpoint(cl, ml, appConfig))
app.Post("/assistants/:assistant_id/files", CreateAssistantFileEndpoint(cl, ml, appConfig))
app.Delete("/assistants/:assistant_id/files/:file_id", DeleteAssistantFileEndpoint(cl, ml, appConfig))
app.Get("/assistants/:assistant_id/files/:file_id", GetAssistantFileEndpoint(cl, ml, appConfig))
t.Run("CreateAssistantEndpoint", func(t *testing.T) {
t.Cleanup(tearDown())
ar := &AssistantRequest{
Model: "ggml-gpt4all-j",
Name: "3.5-turbo",
Description: "Test Assistant",
Instructions: "You are computer science teacher answering student questions",
Tools: []Tool{{Type: Function}},
FileIDs: nil,
Metadata: nil,
}
resultAssistant, resp, err := createAssistant(app, *ar)
assert.NoError(t, err)
assert.Equal(t, fiber.StatusOK, resp.StatusCode)
assert.Equal(t, 1, len(Assistants))
//t.Cleanup(cleanupAllAssistants(t, app, []string{resultAssistant.ID}))
assert.Equal(t, ar.Name, resultAssistant.Name)
assert.Equal(t, ar.Model, resultAssistant.Model)
assert.Equal(t, ar.Tools, resultAssistant.Tools)
assert.Equal(t, ar.Description, resultAssistant.Description)
assert.Equal(t, ar.Instructions, resultAssistant.Instructions)
assert.Equal(t, ar.FileIDs, resultAssistant.FileIDs)
assert.Equal(t, ar.Metadata, resultAssistant.Metadata)
})
t.Run("ListAssistantsEndpoint", func(t *testing.T) {
var ids []string
var resultAssistant []Assistant
for i := 0; i < 4; i++ {
ar := &AssistantRequest{
Model: "ggml-gpt4all-j",
Name: fmt.Sprintf("3.5-turbo-%d", i),
Description: fmt.Sprintf("Test Assistant - %d", i),
Instructions: fmt.Sprintf("You are computer science teacher answering student questions - %d", i),
Tools: []Tool{{Type: Function}},
FileIDs: []string{"fid-1234"},
Metadata: map[string]string{"meta": "data"},
}
//var err error
ra, _, err := createAssistant(app, *ar)
// Because we create the assistants so quickly, they would otherwise all end up with the same created time.
time.Sleep(time.Second)
resultAssistant = append(resultAssistant, ra)
assert.NoError(t, err)
ids = append(ids, resultAssistant[i].ID)
}
t.Cleanup(cleanupAllAssistants(t, app, ids))
tests := []struct {
name string
reqURL string
expectedStatus int
expectedResult []Assistant
expectedStringResult string
}{
{
name: "Valid Usage - limit only",
reqURL: "/assistants?limit=2",
expectedStatus: http.StatusOK,
expectedResult: Assistants[:2], // Expecting the first two assistants
},
{
name: "Valid Usage - order asc",
reqURL: "/assistants?order=asc",
expectedStatus: http.StatusOK,
expectedResult: Assistants, // Expecting all assistants in ascending order
},
{
name: "Valid Usage - order desc",
reqURL: "/assistants?order=desc",
expectedStatus: http.StatusOK,
expectedResult: []Assistant{Assistants[3], Assistants[2], Assistants[1], Assistants[0]}, // Expecting all assistants in descending order
},
{
name: "Valid Usage - after specific ID",
reqURL: "/assistants?after=2",
expectedStatus: http.StatusOK,
// Note this is correct because it's put in descending order already
expectedResult: Assistants[:3], // Expecting assistants after (excluding) ID 2
},
{
name: "Valid Usage - before specific ID",
reqURL: "/assistants?before=4",
expectedStatus: http.StatusOK,
expectedResult: Assistants[2:], // Expecting assistants before (excluding) ID 3.
},
{
name: "Invalid Usage - non-integer limit",
reqURL: "/assistants?limit=two",
expectedStatus: http.StatusBadRequest,
expectedStringResult: "Invalid limit query value: two",
},
{
name: "Invalid Usage - non-existing id in after",
reqURL: "/assistants?after=100",
expectedStatus: http.StatusOK,
expectedResult: []Assistant(nil), // Expecting empty list as there are no IDs above 100
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
request := httptest.NewRequest(http.MethodGet, tt.reqURL, nil)
response, err := app.Test(request)
assert.NoError(t, err)
assert.Equal(t, tt.expectedStatus, response.StatusCode)
if tt.expectedStatus != fiber.StatusOK {
all, _ := ioutil.ReadAll(response.Body)
assert.Equal(t, tt.expectedStringResult, string(all))
} else {
var result []Assistant
err = json.NewDecoder(response.Body).Decode(&result)
assert.NoError(t, err)
assert.Equal(t, tt.expectedResult, result)
}
})
}
})
t.Run("DeleteAssistantEndpoint", func(t *testing.T) {
ar := &AssistantRequest{
Model: "ggml-gpt4all-j",
Name: "3.5-turbo",
Description: "Test Assistant",
Instructions: "You are computer science teacher answering student questions",
Tools: []Tool{{Type: Function}},
FileIDs: nil,
Metadata: nil,
}
resultAssistant, _, err := createAssistant(app, *ar)
assert.NoError(t, err)
target := fmt.Sprintf("/assistants/%s", resultAssistant.ID)
deleteReq := httptest.NewRequest(http.MethodDelete, target, nil)
_, err = app.Test(deleteReq)
assert.NoError(t, err)
assert.Equal(t, 0, len(Assistants))
})
t.Run("GetAssistantEndpoint", func(t *testing.T) {
ar := &AssistantRequest{
Model: "ggml-gpt4all-j",
Name: "3.5-turbo",
Description: "Test Assistant",
Instructions: "You are computer science teacher answering student questions",
Tools: []Tool{{Type: Function}},
FileIDs: nil,
Metadata: nil,
}
resultAssistant, _, err := createAssistant(app, *ar)
assert.NoError(t, err)
t.Cleanup(cleanupAllAssistants(t, app, []string{resultAssistant.ID}))
target := fmt.Sprintf("/assistants/%s", resultAssistant.ID)
request := httptest.NewRequest(http.MethodGet, target, nil)
response, err := app.Test(request)
assert.NoError(t, err)
var getAssistant Assistant
err = json.NewDecoder(response.Body).Decode(&getAssistant)
assert.NoError(t, err)
assert.Equal(t, resultAssistant.ID, getAssistant.ID)
})
t.Run("ModifyAssistantEndpoint", func(t *testing.T) {
ar := &AssistantRequest{
Model: "ggml-gpt4all-j",
Name: "3.5-turbo",
Description: "Test Assistant",
Instructions: "You are computer science teacher answering student questions",
Tools: []Tool{{Type: Function}},
FileIDs: nil,
Metadata: nil,
}
resultAssistant, _, err := createAssistant(app, *ar)
assert.NoError(t, err)
modifiedAr := &AssistantRequest{
Model: "ggml-gpt4all-j",
Name: "4.0-turbo",
Description: "Modified Test Assistant",
Instructions: "You are math teacher answering student questions",
Tools: []Tool{{Type: CodeInterpreter}},
FileIDs: nil,
Metadata: nil,
}
modifiedArJson, err := json.Marshal(modifiedAr)
assert.NoError(t, err)
target := fmt.Sprintf("/assistants/%s", resultAssistant.ID)
request := httptest.NewRequest(http.MethodPost, target, strings.NewReader(string(modifiedArJson)))
request.Header.Set(fiber.HeaderContentType, "application/json")
modifyResponse, err := app.Test(request)
assert.NoError(t, err)
var getAssistant Assistant
err = json.NewDecoder(modifyResponse.Body).Decode(&getAssistant)
t.Cleanup(cleanupAllAssistants(t, app, []string{getAssistant.ID}))
assert.Equal(t, resultAssistant.ID, getAssistant.ID) // IDs should match even if contents change
assert.Equal(t, modifiedAr.Tools, getAssistant.Tools)
assert.Equal(t, modifiedAr.Name, getAssistant.Name)
assert.Equal(t, modifiedAr.Instructions, getAssistant.Instructions)
assert.Equal(t, modifiedAr.Description, getAssistant.Description)
})
t.Run("CreateAssistantFileEndpoint", func(t *testing.T) {
t.Cleanup(tearDown())
file, assistant, err := createFileAndAssistant(t, app, appConfig)
assert.NoError(t, err)
afr := AssistantFileRequest{FileID: file.ID}
af, _, err := createAssistantFile(app, afr, assistant.ID)
assert.NoError(t, err)
assert.Equal(t, assistant.ID, af.AssistantID)
})
t.Run("ListAssistantFilesEndpoint", func(t *testing.T) {
t.Cleanup(tearDown())
file, assistant, err := createFileAndAssistant(t, app, appConfig)
assert.NoError(t, err)
afr := AssistantFileRequest{FileID: file.ID}
af, _, err := createAssistantFile(app, afr, assistant.ID)
assert.NoError(t, err)
assert.Equal(t, assistant.ID, af.AssistantID)
})
t.Run("GetAssistantFileEndpoint", func(t *testing.T) {
t.Cleanup(tearDown())
file, assistant, err := createFileAndAssistant(t, app, appConfig)
assert.NoError(t, err)
afr := AssistantFileRequest{FileID: file.ID}
af, _, err := createAssistantFile(app, afr, assistant.ID)
assert.NoError(t, err)
t.Cleanup(cleanupAssistantFile(t, app, af.ID, af.AssistantID))
target := fmt.Sprintf("/assistants/%s/files/%s", assistant.ID, file.ID)
request := httptest.NewRequest(http.MethodGet, target, nil)
response, err := app.Test(request)
assert.NoError(t, err)
var assistantFile AssistantFile
err = json.NewDecoder(response.Body).Decode(&assistantFile)
assert.NoError(t, err)
assert.Equal(t, af.ID, assistantFile.ID)
assert.Equal(t, af.AssistantID, assistantFile.AssistantID)
})
t.Run("DeleteAssistantFileEndpoint", func(t *testing.T) {
t.Cleanup(tearDown())
file, assistant, err := createFileAndAssistant(t, app, appConfig)
assert.NoError(t, err)
afr := AssistantFileRequest{FileID: file.ID}
af, _, err := createAssistantFile(app, afr, assistant.ID)
assert.NoError(t, err)
cleanupAssistantFile(t, app, af.ID, af.AssistantID)()
assert.Empty(t, AssistantFiles)
})
}
func createFileAndAssistant(t *testing.T, app *fiber.App, o *config.ApplicationConfig) (File, Assistant, error) {
ar := &AssistantRequest{
Model: "ggml-gpt4all-j",
Name: "3.5-turbo",
Description: "Test Assistant",
Instructions: "You are computer science teacher answering student questions",
Tools: []Tool{{Type: Function}},
FileIDs: nil,
Metadata: nil,
}
assistant, _, err := createAssistant(app, *ar)
if err != nil {
return File{}, Assistant{}, err
}
t.Cleanup(cleanupAllAssistants(t, app, []string{assistant.ID}))
file := CallFilesUploadEndpointWithCleanup(t, app, "test.txt", "file", "fine-tune", 5, o)
t.Cleanup(func() {
_, err := CallFilesDeleteEndpoint(t, app, file.ID)
assert.NoError(t, err)
})
return file, assistant, nil
}
func createAssistantFile(app *fiber.App, afr AssistantFileRequest, assistantId string) (AssistantFile, *http.Response, error) {
afrJson, err := json.Marshal(afr)
if err != nil {
return AssistantFile{}, nil, err
}
target := fmt.Sprintf("/assistants/%s/files", assistantId)
request := httptest.NewRequest(http.MethodPost, target, strings.NewReader(string(afrJson)))
request.Header.Set(fiber.HeaderContentType, "application/json")
request.Header.Set("OpenAi-Beta", "assistants=v1")
resp, err := app.Test(request)
if err != nil {
return AssistantFile{}, resp, err
}
var assistantFile AssistantFile
all, err := ioutil.ReadAll(resp.Body)
err = json.NewDecoder(strings.NewReader(string(all))).Decode(&assistantFile)
if err != nil {
return AssistantFile{}, resp, err
}
return assistantFile, resp, nil
}
func createAssistant(app *fiber.App, ar AssistantRequest) (Assistant, *http.Response, error) {
assistant, err := json.Marshal(ar)
if err != nil {
return Assistant{}, nil, err
}
request := httptest.NewRequest(http.MethodPost, "/assistants", strings.NewReader(string(assistant)))
request.Header.Set(fiber.HeaderContentType, "application/json")
request.Header.Set("OpenAi-Beta", "assistants=v1")
resp, err := app.Test(request)
if err != nil {
return Assistant{}, resp, err
}
bodyString, err := io.ReadAll(resp.Body)
if err != nil {
return Assistant{}, resp, err
}
var resultAssistant Assistant
err = json.Unmarshal(bodyString, &resultAssistant)
return resultAssistant, resp, err
}
func cleanupAllAssistants(t *testing.T, app *fiber.App, ids []string) func() {
return func() {
for _, assistant := range ids {
target := fmt.Sprintf("/assistants/%s", assistant)
deleteReq := httptest.NewRequest(http.MethodDelete, target, nil)
_, err := app.Test(deleteReq)
if err != nil {
t.Fatalf("Failed to delete assistant %s: %v", assistant, err)
}
}
}
}
func cleanupAssistantFile(t *testing.T, app *fiber.App, fileId, assistantId string) func() {
return func() {
target := fmt.Sprintf("/assistants/%s/files/%s", assistantId, fileId)
request := httptest.NewRequest(http.MethodDelete, target, nil)
request.Header.Set(fiber.HeaderContentType, "application/json")
request.Header.Set("OpenAi-Beta", "assistants=v1")
resp, err := app.Test(request)
assert.NoError(t, err)
var dafr DeleteAssistantFileResponse
err = json.NewDecoder(resp.Body).Decode(&dafr)
assert.NoError(t, err)
assert.True(t, dafr.Deleted)
}
}

View File

@@ -20,6 +20,11 @@ import (
"github.com/valyala/fasthttp"
)
// ChatEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/chat/create
// @Summary Generate a chat completions for a given prompt and model.
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/chat/completions [post]
func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
emptyMessage := ""
id := uuid.New().String()
@@ -79,7 +84,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
result, err := handleQuestion(config, req, ml, startupOptions, results[0].arguments, prompt)
if err != nil {
log.Error().Msgf("error handling question: %s", err.Error())
log.Error().Err(err).Msg("error handling question")
return
}
@@ -180,6 +185,8 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
input.Grammar = grammar.JSONBNF
}
config.Grammar = input.Grammar
// process functions if we have any defined or if we have a function call string
if len(input.Functions) > 0 && config.ShouldUseFunctions() {
log.Debug().Msgf("Response needs to process functions")
@@ -231,7 +238,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
// if function call, we might want to customize the role so we can display better that the "assistant called a json action"
// if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request
if i.FunctionCall != nil && i.Role == "assistant" {
if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" {
roleFn := "assistant_function_call"
r := config.Roles[roleFn]
if r != "" {
@@ -241,6 +248,11 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
r := config.Roles[role]
contentExists := i.Content != nil && i.StringContent != ""
fcall := i.FunctionCall
if len(i.ToolCalls) > 0 {
fcall = i.ToolCalls
}
// First attempt to populate content via a chat message specific template
if config.TemplateConfig.ChatMessage != "" {
chatMessageData := model.ChatMessageTemplateData{
@@ -248,12 +260,15 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
Role: r,
RoleName: role,
Content: i.StringContent,
FunctionCall: fcall,
FunctionName: i.Name,
LastMessage: messageIndex == (len(input.Messages) - 1),
Function: config.Grammar != "" && (messageIndex == (len(input.Messages) - 1)),
MessageIndex: messageIndex,
}
templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
if err != nil {
log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, config.TemplateConfig.ChatMessage, err)
log.Error().Err(err).Interface("message", chatMessageData).Str("template", config.TemplateConfig.ChatMessage).Msg("error processing message with template, skipping")
} else {
if templatedChatMessage == "" {
log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
@@ -263,35 +278,49 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
content = templatedChatMessage
}
}
marshalAnyRole := func(f any) {
j, err := json.Marshal(f)
if err == nil {
if contentExists {
content += "\n" + fmt.Sprint(r, " ", string(j))
} else {
content = fmt.Sprint(r, " ", string(j))
}
}
}
marshalAny := func(f any) {
j, err := json.Marshal(f)
if err == nil {
if contentExists {
content += "\n" + string(j)
} else {
content = string(j)
}
}
}
// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
if content == "" {
if r != "" {
if contentExists {
content = fmt.Sprint(r, i.StringContent)
}
if i.FunctionCall != nil {
j, err := json.Marshal(i.FunctionCall)
if err == nil {
if contentExists {
content += "\n" + fmt.Sprint(r, " ", string(j))
} else {
content = fmt.Sprint(r, " ", string(j))
}
}
marshalAnyRole(i.FunctionCall)
}
if i.ToolCalls != nil {
marshalAnyRole(i.ToolCalls)
}
} else {
if contentExists {
content = fmt.Sprint(i.StringContent)
}
if i.FunctionCall != nil {
j, err := json.Marshal(i.FunctionCall)
if err == nil {
if contentExists {
content += "\n" + string(j)
} else {
content = string(j)
}
}
marshalAny(i.FunctionCall)
}
if i.ToolCalls != nil {
marshalAny(i.ToolCalls)
}
}
// Special Handling: System. We care if it was printed at all, not the r branch, so check separately
@@ -426,7 +455,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
case noActionsToRun:
result, err := handleQuestion(config, input, ml, startupOptions, results[0].arguments, predInput)
if err != nil {
log.Error().Msgf("error handling question: %s", err.Error())
log.Error().Err(err).Msg("error handling question")
return
}
*c = append(*c, schema.Choice{
@@ -536,13 +565,13 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
predFunc, err := backend.ModelInference(input.Context, prompt, images, ml, *config, o, nil)
if err != nil {
log.Error().Msgf("inference error: %s", err.Error())
log.Error().Err(err).Msg("model inference failed")
return "", err
}
prediction, err := predFunc()
if err != nil {
log.Error().Msgf("inference error: %s", err.Error())
log.Error().Err(err).Msg("prediction failed")
return "", err
}
return backend.Finetune(*config, prompt, prediction.Response), nil
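
To make the fallback path above easier to follow in isolation, this standalone sketch (not LocalAI code) reproduces the marshalAnyRole idea: when no chat-message template applies, a function or tool call is JSON-marshalled and appended to the message content, prefixed with the configured role string when one exists. The toolCall type here is a simplified stand-in for the real schema.

package main

import (
    "encoding/json"
    "fmt"
)

type toolCall struct {
    Name      string `json:"name"`
    Arguments string `json:"arguments"`
}

// appendMarshalled mimics the marshalAnyRole helper: JSON-encode the value and
// append it to the existing content, prefixing the role when one is configured.
func appendMarshalled(content, role string, v any) string {
    j, err := json.Marshal(v)
    if err != nil {
        return content
    }
    piece := string(j)
    if role != "" {
        piece = fmt.Sprint(role, " ", piece)
    }
    if content != "" {
        return content + "\n" + piece
    }
    return piece
}

func main() {
    calls := []toolCall{{Name: "get_weather", Arguments: `{"city":"Rome"}`}}
    out := appendMarshalled("", "assistant_function_call", calls)
    fmt.Println(out)
}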

View File

@@ -20,7 +20,11 @@ import (
"github.com/valyala/fasthttp"
)
// https://platform.openai.com/docs/api-reference/completions
// CompletionEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/completions
// @Summary Generate completions for a given prompt and model.
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/completions [post]
func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
id := uuid.New().String()
created := int(time.Now().Unix())
@@ -69,6 +73,8 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
input.Grammar = grammar.JSONBNF
}
config.Grammar = input.Grammar
log.Debug().Msgf("Parameter Config: %+v", config)
if input.Stream {

View File

@@ -16,7 +16,11 @@ import (
"github.com/rs/zerolog/log"
)
// https://platform.openai.com/docs/api-reference/embeddings
// EmbeddingsEndpoint is the OpenAI Embeddings API endpoint https://platform.openai.com/docs/api-reference/embeddings
// @Summary Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms.
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/embeddings [post]
func EmbeddingsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
model, input, err := readRequest(c, ml, appConfig, true)

View File

@@ -1,23 +1,22 @@
package openai
import (
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"sync/atomic"
"time"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/pkg/utils"
"github.com/gofiber/fiber/v2"
"github.com/rs/zerolog/log"
)
var uploadedFiles []File
var UploadedFiles []File
const uploadedFilesFile = "uploadedFiles.json"
const UploadedFilesFile = "uploadedFiles.json"
// File represents the structure of a file object from the OpenAI API.
type File struct {
@@ -29,38 +28,6 @@ type File struct {
Purpose string `json:"purpose"` // The purpose of the file (e.g., "fine-tune", "classifications", etc.)
}
func saveUploadConfig(uploadDir string) {
file, err := json.MarshalIndent(uploadedFiles, "", " ")
if err != nil {
log.Error().Msgf("Failed to JSON marshal the uploadedFiles: %s", err)
}
err = os.WriteFile(filepath.Join(uploadDir, uploadedFilesFile), file, 0644)
if err != nil {
log.Error().Msgf("Failed to save uploadedFiles to file: %s", err)
}
}
func LoadUploadConfig(uploadPath string) {
uploadFilePath := filepath.Join(uploadPath, uploadedFilesFile)
_, err := os.Stat(uploadFilePath)
if os.IsNotExist(err) {
log.Debug().Msgf("No uploadedFiles file found at %s", uploadFilePath)
return
}
file, err := os.ReadFile(uploadFilePath)
if err != nil {
log.Error().Msgf("Failed to read file: %s", err)
} else {
err = json.Unmarshal(file, &uploadedFiles)
if err != nil {
log.Error().Msgf("Failed to JSON unmarshal the file into uploadedFiles: %s", err)
}
}
}
// UploadFilesEndpoint https://platform.openai.com/docs/api-reference/files/create
func UploadFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
@@ -95,7 +62,7 @@ func UploadFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
}
f := File{
ID: fmt.Sprintf("file-%d", time.Now().Unix()),
ID: fmt.Sprintf("file-%d", getNextFileId()),
Object: "file",
Bytes: int(file.Size),
CreatedAt: time.Now(),
@@ -103,12 +70,19 @@ func UploadFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
Purpose: purpose,
}
uploadedFiles = append(uploadedFiles, f)
saveUploadConfig(appConfig.UploadDir)
UploadedFiles = append(UploadedFiles, f)
utils.SaveConfig(appConfig.UploadDir, UploadedFilesFile, UploadedFiles)
return c.Status(fiber.StatusOK).JSON(f)
}
}
var currentFileId int64 = 0
func getNextFileId() int64 {
atomic.AddInt64(&currentFileId, 1)
return currentFileId
}
// ListFilesEndpoint https://platform.openai.com/docs/api-reference/files/list
func ListFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
type ListFiles struct {
@@ -121,9 +95,9 @@ func ListFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Applica
purpose := c.Query("purpose")
if purpose == "" {
listFiles.Data = uploadedFiles
listFiles.Data = UploadedFiles
} else {
for _, f := range uploadedFiles {
for _, f := range UploadedFiles {
if purpose == f.Purpose {
listFiles.Data = append(listFiles.Data, f)
}
@@ -140,7 +114,7 @@ func getFileFromRequest(c *fiber.Ctx) (*File, error) {
return nil, fmt.Errorf("file_id parameter is required")
}
for _, f := range uploadedFiles {
for _, f := range UploadedFiles {
if id == f.ID {
return &f, nil
}
@@ -184,14 +158,14 @@ func DeleteFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
}
// Remove upload from list
for i, f := range uploadedFiles {
for i, f := range UploadedFiles {
if f.ID == file.ID {
uploadedFiles = append(uploadedFiles[:i], uploadedFiles[i+1:]...)
UploadedFiles = append(UploadedFiles[:i], UploadedFiles[i+1:]...)
break
}
}
saveUploadConfig(appConfig.UploadDir)
utils.SaveConfig(appConfig.UploadDir, UploadedFilesFile, UploadedFiles)
return c.JSON(DeleteStatus{
Id: file.ID,
Object: "file",

View File

@@ -11,6 +11,8 @@ import (
"path/filepath"
"strings"
"github.com/rs/zerolog/log"
"github.com/go-skynet/LocalAI/core/config"
utils2 "github.com/go-skynet/LocalAI/pkg/utils"
@@ -73,6 +75,7 @@ func TestUploadFileExceedSizeLimit(t *testing.T) {
app.Get("/files/:file_id/content", GetFilesContentsEndpoint(loader, option))
t.Run("UploadFilesEndpoint file size exceeds limit", func(t *testing.T) {
t.Cleanup(tearDown())
resp, err := CallFilesUploadEndpoint(t, app, "foo.txt", "file", "fine-tune", 11, option)
assert.NoError(t, err)
@@ -80,46 +83,54 @@ func TestUploadFileExceedSizeLimit(t *testing.T) {
assert.Contains(t, bodyToString(resp, t), "exceeds upload limit")
})
t.Run("UploadFilesEndpoint purpose not defined", func(t *testing.T) {
t.Cleanup(tearDown())
resp, _ := CallFilesUploadEndpoint(t, app, "foo.txt", "file", "", 5, option)
assert.Equal(t, fiber.StatusBadRequest, resp.StatusCode)
assert.Contains(t, bodyToString(resp, t), "Purpose is not defined")
})
t.Run("UploadFilesEndpoint file already exists", func(t *testing.T) {
t.Cleanup(tearDown())
f1 := CallFilesUploadEndpointWithCleanup(t, app, "foo.txt", "file", "fine-tune", 5, option)
resp, err := CallFilesUploadEndpoint(t, app, "foo.txt", "file", "fine-tune", 5, option)
fmt.Println(f1)
fmt.Printf("ERror: %v", err)
fmt.Printf("ERror: %v\n", err)
fmt.Printf("resp: %+v\n", resp)
assert.Equal(t, fiber.StatusBadRequest, resp.StatusCode)
assert.Contains(t, bodyToString(resp, t), "File already exists")
})
t.Run("UploadFilesEndpoint file uploaded successfully", func(t *testing.T) {
t.Cleanup(tearDown())
file := CallFilesUploadEndpointWithCleanup(t, app, "test.txt", "file", "fine-tune", 5, option)
// Check if file exists in the disk
filePath := filepath.Join(option.UploadDir, utils2.SanitizeFileName("test.txt"))
testName := strings.Split(t.Name(), "/")[1]
fileName := testName + "-test.txt"
filePath := filepath.Join(option.UploadDir, utils2.SanitizeFileName(fileName))
_, err := os.Stat(filePath)
assert.False(t, os.IsNotExist(err))
assert.Equal(t, file.Bytes, 5242880)
assert.NotEmpty(t, file.CreatedAt)
assert.Equal(t, file.Filename, "test.txt")
assert.Equal(t, file.Filename, fileName)
assert.Equal(t, file.Purpose, "fine-tune")
})
t.Run("ListFilesEndpoint without purpose parameter", func(t *testing.T) {
t.Cleanup(tearDown())
resp, err := CallListFilesEndpoint(t, app, "")
assert.NoError(t, err)
assert.Equal(t, 200, resp.StatusCode)
listFiles := responseToListFile(t, resp)
if len(listFiles.Data) != len(uploadedFiles) {
t.Errorf("Expected %v files, got %v files", len(uploadedFiles), len(listFiles.Data))
if len(listFiles.Data) != len(UploadedFiles) {
t.Errorf("Expected %v files, got %v files", len(UploadedFiles), len(listFiles.Data))
}
})
t.Run("ListFilesEndpoint with valid purpose parameter", func(t *testing.T) {
t.Cleanup(tearDown())
_ = CallFilesUploadEndpointWithCleanup(t, app, "test.txt", "file", "fine-tune", 5, option)
resp, err := CallListFilesEndpoint(t, app, "fine-tune")
@@ -131,6 +142,7 @@ func TestUploadFileExceedSizeLimit(t *testing.T) {
}
})
t.Run("ListFilesEndpoint with invalid query parameter", func(t *testing.T) {
t.Cleanup(tearDown())
resp, err := CallListFilesEndpoint(t, app, "not-so-fine-tune")
assert.NoError(t, err)
assert.Equal(t, 200, resp.StatusCode)
@@ -142,6 +154,7 @@ func TestUploadFileExceedSizeLimit(t *testing.T) {
}
})
t.Run("GetFilesContentsEndpoint get file content", func(t *testing.T) {
t.Cleanup(tearDown())
req := httptest.NewRequest("GET", "/files", nil)
resp, _ := app.Test(req)
assert.Equal(t, 200, resp.StatusCode)
@@ -175,8 +188,10 @@ func CallFilesContentEndpoint(t *testing.T, app *fiber.App, fileId string) (*htt
}
func CallFilesUploadEndpoint(t *testing.T, app *fiber.App, fileName, tag, purpose string, fileSize int, appConfig *config.ApplicationConfig) (*http.Response, error) {
testName := strings.Split(t.Name(), "/")[1]
// Create a file that exceeds the limit
file := createTestFile(t, fileName, fileSize, appConfig)
file := createTestFile(t, testName+"-"+fileName, fileSize, appConfig)
// Creating a new HTTP Request
body, writer := newMultipartFile(file.Name(), tag, purpose)
@@ -188,7 +203,8 @@ func CallFilesUploadEndpoint(t *testing.T, app *fiber.App, fileName, tag, purpos
func CallFilesUploadEndpointWithCleanup(t *testing.T, app *fiber.App, fileName, tag, purpose string, fileSize int, appConfig *config.ApplicationConfig) File {
// Create a file that exceeds the limit
file := createTestFile(t, fileName, fileSize, appConfig)
testName := strings.Split(t.Name(), "/")[1]
file := createTestFile(t, testName+"-"+fileName, fileSize, appConfig)
// Creating a new HTTP Request
body, writer := newMultipartFile(file.Name(), tag, purpose)
@@ -199,11 +215,12 @@ func CallFilesUploadEndpointWithCleanup(t *testing.T, app *fiber.App, fileName,
assert.NoError(t, err)
f := responseToFile(t, resp)
id := f.ID
t.Cleanup(func() {
_, err := CallFilesDeleteEndpoint(t, app, id)
assert.NoError(t, err)
})
//id := f.ID
//t.Cleanup(func() {
// _, err := CallFilesDeleteEndpoint(t, app, id)
// assert.NoError(t, err)
// assert.Empty(t, UploadedFiles)
//})
return f
@@ -240,7 +257,8 @@ func createTestFile(t *testing.T, name string, sizeMB int, option *config.Applic
t.Fatalf("Error MKDIR: %v", err)
}
file, _ := os.Create(name)
file, err := os.Create(name)
assert.NoError(t, err)
file.WriteString(strings.Repeat("a", sizeMB*1024*1024)) // sizeMB MB File
t.Cleanup(func() {
@@ -280,7 +298,7 @@ func responseToListFile(t *testing.T, resp *http.Response) ListFiles {
err := json.NewDecoder(strings.NewReader(responseToString)).Decode(&listFiles)
if err != nil {
fmt.Printf("Failed to decode response: %s", err)
log.Error().Err(err).Msg("failed to decode response")
}
return listFiles

View File

@@ -44,7 +44,7 @@ func downloadFile(url string) (string, error) {
return out.Name(), err
}
// https://platform.openai.com/docs/api-reference/images/create
//
/*
*
@@ -59,6 +59,11 @@ func downloadFile(url string) (string, error) {
*
*/
// ImageEndpoint is the OpenAI Image generation API endpoint https://platform.openai.com/docs/api-reference/images/create
// @Summary Creates an image given a prompt.
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/images/generations [post]
func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
m, input, err := readRequest(c, ml, appConfig, false)

View File

@@ -146,7 +146,14 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
if input.ToolsChoice != nil {
var toolChoice grammar.Tool
json.Unmarshal([]byte(input.ToolsChoice.(string)), &toolChoice)
switch content := input.ToolsChoice.(type) {
case string:
_ = json.Unmarshal([]byte(content), &toolChoice)
case map[string]interface{}:
dat, _ := json.Marshal(content)
_ = json.Unmarshal(dat, &toolChoice)
}
input.FunctionCall = map[string]interface{}{
"name": toolChoice.Function.Name,
}
@@ -185,6 +192,14 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
config.RepeatPenalty = input.RepeatPenalty
}
if input.FrequencyPenalty != 0 {
config.FrequencyPenalty = input.FrequencyPenalty
}
if input.PresencePenalty != 0 {
config.PresencePenalty = input.PresencePenalty
}
if input.Keep != 0 {
config.Keep = input.Keep
}
@@ -201,7 +216,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
config.Seed = input.Seed
}
if input.TypicalP != 0 {
if input.TypicalP != nil {
config.TypicalP = input.TypicalP
}
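
The tool_choice handling added above accepts either a JSON string or an already-decoded object and funnels both into the same structure. Below is a standalone sketch of that normalization, with a simplified tool type standing in for grammar.Tool (an assumption about its relevant fields).

package main

import (
    "encoding/json"
    "fmt"
)

// tool mirrors the subset of grammar.Tool used here (an assumption about its shape).
type tool struct {
    Type     string `json:"type"`
    Function struct {
        Name string `json:"name"`
    } `json:"function"`
}

// normalizeToolChoice mirrors the switch above: tool_choice may arrive either as a
// JSON string or as an already-decoded object, and both end up in the same struct.
func normalizeToolChoice(raw any) (tool, error) {
    var tc tool
    switch v := raw.(type) {
    case string:
        err := json.Unmarshal([]byte(v), &tc)
        return tc, err
    case map[string]interface{}:
        dat, err := json.Marshal(v)
        if err != nil {
            return tc, err
        }
        err = json.Unmarshal(dat, &tc)
        return tc, err
    default:
        return tc, fmt.Errorf("unsupported tool_choice type %T", raw)
    }
}

func main() {
    asString := `{"type":"function","function":{"name":"get_weather"}}`
    asObject := map[string]interface{}{"type": "function", "function": map[string]interface{}{"name": "get_weather"}}

    for _, raw := range []any{asString, asObject} {
        tc, err := normalizeToolChoice(raw)
        fmt.Println(tc.Function.Name, err)
    }
}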

View File

@@ -16,7 +16,13 @@ import (
"github.com/rs/zerolog/log"
)
// https://platform.openai.com/docs/api-reference/audio/create
// TranscriptEndpoint is the OpenAI Whisper API endpoint https://platform.openai.com/docs/api-reference/audio/create
// @Summary Transcribes audio into the input language.
// @accept multipart/form-data
// @Param model formData string true "model"
// @Param file formData file true "file"
// @Success 200 {object} map[string]string "Response"
// @Router /v1/audio/transcriptions [post]
func TranscriptEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
m, input, err := readRequest(c, ml, appConfig, false)

core/http/render.go (new file)
View File

@@ -0,0 +1,80 @@
package http
import (
"embed"
"fmt"
"html/template"
"net/http"
"github.com/Masterminds/sprig/v3"
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/go-skynet/LocalAI/internal"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
fiberhtml "github.com/gofiber/template/html/v2"
"github.com/russross/blackfriday"
)
//go:embed views/*
var viewsfs embed.FS
func notFoundHandler(c *fiber.Ctx) error {
// Check if the request accepts JSON
if string(c.Context().Request.Header.ContentType()) == "application/json" || len(c.Accepts("html")) == 0 {
// The client expects a JSON response
c.Status(fiber.StatusNotFound).JSON(schema.ErrorResponse{
Error: &schema.APIError{Message: "Resource not found", Code: fiber.StatusNotFound},
})
} else {
// The client expects an HTML response
c.Status(fiber.StatusNotFound).Render("views/404", fiber.Map{})
}
return nil
}
func welcomeRoute(
app *fiber.App,
cl *config.BackendConfigLoader,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
auth func(*fiber.Ctx) error,
) {
if appConfig.DisableWelcomePage {
return
}
models, _ := ml.ListModels()
backendConfigs := cl.GetAllBackendConfigs()
app.Get("/", auth, func(c *fiber.Ctx) error {
summary := fiber.Map{
"Title": "LocalAI API - " + internal.PrintableVersion(),
"Version": internal.PrintableVersion(),
"Models": models,
"ModelsConfig": backendConfigs,
"ApplicationConfig": appConfig,
}
if string(c.Context().Request.Header.ContentType()) == "application/json" || len(c.Accepts("html")) == 0 {
// The client expects a JSON response
return c.Status(fiber.StatusOK).JSON(summary)
} else {
// Render index
return c.Render("views/index", summary)
}
})
}
func renderEngine() *fiberhtml.Engine {
engine := fiberhtml.NewFileSystem(http.FS(viewsfs), ".html")
engine.AddFuncMap(sprig.FuncMap())
engine.AddFunc("MDToHTML", markDowner)
return engine
}
func markDowner(args ...interface{}) template.HTML {
s := blackfriday.MarkdownCommon([]byte(fmt.Sprintf("%s", args...)))
return template.HTML(s)
}
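
For context, a minimal sketch of how renderEngine and notFoundHandler are typically attached to the Fiber app. The actual wiring lives elsewhere in core/http and is not shown in this excerpt, so treat this as an assumption about the surrounding package rather than the real registration code.

// Sketch only (assumed to live in the same package as render.go above).
func newHTTPApp() *fiber.App {
    app := fiber.New(fiber.Config{
        Views: renderEngine(), // serve the embedded views/* templates
    })

    // ... API routes and welcomeRoute(...) would be registered here ...

    // Anything that falls through the registered routes gets the HTML/JSON 404 handler.
    app.Use(notFoundHandler)
    return app
}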

core/http/views/404.html (new file)
View File

@@ -0,0 +1,33 @@
<!DOCTYPE html>
<html lang="en">
{{template "views/partials/head" .}}
<body class="bg-black text-white">
<div class="flex flex-col min-h-screen">
{{template "views/partials/navbar" .}}
<div class="container mx-auto px-4 flex-grow">
<div class="header text-center py-12">
<h1 class="text-5xl font-bold">Welcome to your LocalAI instance!</h1>
<div class="mt-6">
<!-- <a href="/" aria-label="HomePage" alt="HomePage">
<img class="mx-auto w-1/4 h-auto" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo">
</a>
-->
</div>
<p class="mt-4 text-lg">The FOSS alternative to OpenAI, Claude, ...</p>
<a href="https://localai.io" target="_blank" class="mt-4 inline-block bg-blue-500 text-white py-2 px-4 rounded transition duration-300 ease-in-out hover:bg-blue-700"><i class="fas fa-book-reader pr-2"></i>Documentation</a>
</div>
<div class="models mt-12">
<h2 class="text-center text-3xl font-semibold">Nothing found!</h2>
</div>
</div>
{{template "views/partials/footer" .}}
</div>
</body>
</html>

View File

@@ -0,0 +1,52 @@
<!DOCTYPE html>
<html lang="en">
{{template "views/partials/head" .}}
<body class="bg-gray-900 text-gray-200">
<div class="flex flex-col min-h-screen">
{{template "views/partials/navbar" .}}
<div class="container mx-auto px-4 flex-grow">
<div class="header text-center py-12">
<h1 class="text-5xl font-bold text-gray-100">Welcome to <i>your</i> LocalAI instance!</h1>
<div class="mt-6">
<!-- Logo can be uncommented and updated with a valid URL -->
</div>
<p class="mt-4 text-lg">The FOSS alternative to OpenAI, Claude, ...</p>
<a href="https://localai.io" target="_blank" class="mt-4 inline-block bg-blue-500 text-white py-2 px-4 rounded-lg shadow transition duration-300 ease-in-out hover:bg-blue-700 hover:shadow-lg">
<i class="fas fa-book-reader pr-2"></i>Documentation
</a>
</div>
<div class="models mt-12">
<h2 class="text-center text-3xl font-semibold text-gray-100">Installed models</h2>
<p class="text-center mt-4 text-xl">We have {{len .ModelsConfig}} pre-loaded models available.</p>
<ul class="mt-8 space-y-4">
{{ range .ModelsConfig }}
<li class="bg-gray-800 border border-gray-700 p-4 rounded-lg">
<div class="flex justify-between items-center">
<p class="font-bold text-white flex items-center"><i class="fas fa-brain pr-2"></i>{{.Name}}</p>
{{ if .Backend }}
<!-- Badge for Backend -->
<span class="inline-block bg-blue-500 text-white py-1 px-3 rounded-full text-xs">
{{.Backend}}
</span>
{{ else }}
<span class="inline-block bg-yellow-500 text-white py-1 px-3 rounded-full text-xs">
auto
</span>
{{ end }}
</div>
<!-- Additional details can go here -->
</li>
{{ end }}
</ul>
</div>
</div>
{{template "views/partials/footer" .}}
</div>
</body>
</html>

View File

@@ -0,0 +1,4 @@
<footer class="text-center py-8">
LocalAI Version {{.Version}}<br>
<a href='https://localai.io' class="text-blue-400 hover:text-blue-600" target="_blank">LocalAI</a> © 2023-2024 <a href='https://mudler.pm' class="text-blue-400 hover:text-blue-600" target="_blank">Ettore Di Giacinto</a>
</footer>

View File

@@ -0,0 +1,13 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{.Title}}</title>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Roboto:wght@400;500&display=swap" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.1.1/css/all.min.css">
<style>
body {
font-family: 'Inter', sans-serif;
}
</style>
</head>

View File

@@ -0,0 +1,16 @@
<nav class="bg-gray-800 shadow-lg">
<div class="container mx-auto px-4 py-4">
<div class="flex items-center justify-between">
<div class="flex items-center">
<!-- Logo Image: Replace 'logo_url_here' with your actual logo URL -->
<a href="/" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
<a href="/" class="text-white text-xl font-bold">LocalAI</a>
</div>
<div>
<a href="/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-home pr-2"></i>Home</a>
<a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
<a href="/swagger/" class="text-gray-400 hover:text-white px-3 py-2 rounded"><i class="fas fa-code pr-2"></i> API</a>
</div>
</div>
</div>
</nav>

View File

@@ -20,3 +20,40 @@ type TTSRequest struct {
Voice string `json:"voice" yaml:"voice"`
Backend string `json:"backend" yaml:"backend"`
}
type StoresSet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}
type StoresDelete struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys"`
}
type StoresGet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys" yaml:"keys"`
}
type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}
type StoresFind struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
}
type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}

View File

@@ -108,7 +108,7 @@ type ChatCompletionResponseFormat struct {
type OpenAIRequest struct {
PredictionOptions
Context context.Context `json:"-"`
Context context.Context `json:"-"`
Cancel context.CancelFunc `json:"-"`
// whisper

View File

@@ -24,11 +24,12 @@ type PredictionOptions struct {
RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
Keep int `json:"n_keep" yaml:"n_keep"`
FrequencyPenalty float64 `json:"frequency_penalty" yaml:"frequency_penalty"`
TFZ float64 `json:"tfz" yaml:"tfz"`
FrequencyPenalty float64 `json:"frequency_penalty" yaml:"frequency_penalty"`
PresencePenalty float64 `json:"presence_penalty" yaml:"presence_penalty"`
TFZ *float64 `json:"tfz" yaml:"tfz"`
TypicalP float64 `json:"typical_p" yaml:"typical_p"`
Seed *int `json:"seed" yaml:"seed"`
TypicalP *float64 `json:"typical_p" yaml:"typical_p"`
Seed *int `json:"seed" yaml:"seed"`
NegativePrompt string `json:"negative_prompt" yaml:"negative_prompt"`
RopeFreqBase float32 `json:"rope_freq_base" yaml:"rope_freq_base"`

View File

@@ -63,7 +63,7 @@ func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*schema.Backe
pid, err := bm.modelLoader.GetGRPCPID(backend)
if err != nil {
log.Error().Msgf("model %s : failed to find pid %+v", model, err)
log.Error().Err(err).Str("model", model).Msg("failed to find GRPC pid")
return nil, err
}
@@ -71,26 +71,26 @@ func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*schema.Backe
backendProcess, err := gopsutil.NewProcess(int32(pid))
if err != nil {
log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
log.Error().Err(err).Str("model", model).Int("pid", pid).Msg("error getting process info")
return nil, err
}
memInfo, err := backendProcess.MemoryInfo()
if err != nil {
log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
log.Error().Err(err).Str("model", model).Int("pid", pid).Msg("error getting memory info")
return nil, err
}
memPercent, err := backendProcess.MemoryPercent()
if err != nil {
log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
log.Error().Err(err).Str("model", model).Int("pid", pid).Msg("error getting memory percent")
return nil, err
}
cpuPercent, err := backendProcess.CPUPercent()
if err != nil {
log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
log.Error().Err(err).Str("model", model).Int("pid", pid).Msg("error getting cpu percent")
return nil, err
}

View File

@@ -85,7 +85,7 @@ func WatchConfigDirectory(configDir string, appConfig *config.ApplicationConfig)
if !ok {
return
}
log.Error().Msgf("WatchConfigDirectory goroutine error: %+v", err)
log.Error().Err(err).Msg("error encountered while watching config directory")
}
}
}()

View File

@@ -58,18 +58,20 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode
cl := config.NewBackendConfigLoader()
ml := model.NewModelLoader(options.ModelPath)
if err := cl.LoadBackendConfigsFromPath(options.ModelPath); err != nil {
log.Error().Msgf("error loading config files: %s", err.Error())
configLoaderOpts := options.ToConfigLoaderOptions()
if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil {
log.Error().Err(err).Msg("error loading config files")
}
if options.ConfigFile != "" {
if err := cl.LoadBackendConfigFile(options.ConfigFile); err != nil {
log.Error().Msgf("error loading config file: %s", err.Error())
if err := cl.LoadBackendConfigFile(options.ConfigFile, configLoaderOpts...); err != nil {
log.Error().Err(err).Msg("error loading config file")
}
}
if err := cl.Preload(options.ModelPath); err != nil {
log.Error().Msgf("error downloading models: %s", err.Error())
log.Error().Err(err).Msg("error downloading models")
}
if options.PreloadJSONModels != "" {

View File

@@ -0,0 +1,97 @@
+++
disableToc = false
title = "💾 Stores"
weight = 18
url = '/stores'
+++
Stores are an experimental feature to help with querying data using similarity search. It is
a low-level API that consists of only `get`, `set`, `delete` and `find`.
For example, if you have an embedding of some text and want to find text with similar embeddings,
you can create embeddings for chunks of all your text and then compare them against the embedding of
the text you are searching for.
An embedding here means a vector of numbers that represents some information about the text. The
embeddings are created by an A.I. model such as BERT, or by a more traditional method such as word
frequency.
Previously you would have to integrate with an external vector database or library directly.
With the stores feature you can now do it through the LocalAI API.
Note, however, that a similarity search on embeddings is just one way to do retrieval. A higher-level
API can take this into account, so this may not be the best place to start.
## API overview
There is an internal gRPC API and an external-facing HTTP JSON API. We'll only discuss the external HTTP API here;
it mirrors the gRPC API. Consult `pkg/store/client` for internal usage.
Everything is in columnar format: instead of getting an array of objects, each with a key and a value,
you get two separate arrays of keys and values.
Keys are arrays of floating point numbers with a maximum width of 32 bits. Values are strings (in gRPC they are bytes).
The key vectors must all be the same length, and it's best for search performance if they are normalized. When
adding keys, the store detects whether they are normalized and what length they are.
All endpoints accept a `store` field which specifies which store to operate on. Presently, stores are created
on the fly, and there is only one store backend, so no configuration is required.
## Set
To set some keys you can do
```
curl -X POST http://localhost:8080/stores/set \
-H "Content-Type: application/json" \
-d '{"keys": [[0.1, 0.2], [0.3, 0.4]], "values": ["foo", "bar"]}'
```
Setting the same keys again will update their values.
On success 200 OK is returned with no body.
## Get
To get some keys you can do
```
curl -X POST http://localhost:8080/stores/get \
-H "Content-Type: application/json" \
-d '{"keys": [[0.1, 0.2]]}'
```
Both the keys and values are returned, e.g.: `{"keys":[[0.1,0.2]],"values":["foo"]}`
The order of the keys is not preserved! If a key does not exist then nothing is returned.
## Delete
To delete keys and values you can do
```
curl -X POST http://localhost:8080/stores/delete \
-H "Content-Type: application/json" \
-d '{"keys": [[0.1, 0.2]]}'
```
If a key doesn't exist then it is ignored.
On success 200 OK is returned with no body.
## Find
To do a similarity search you can do
```
curl -X POST http://localhost:8080/stores/find \
-H "Content-Type: application/json" \
-d '{"topk": 2, "key": [0.2, 0.1]}'
```
`topk` limits the number of results returned. The response has the same shape as `get`,
except that it also includes an array of `similarities`, where `1.0` is the maximum similarity.
Results are returned in order from most similar to least similar.
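For illustration, running the `find` request above against the keys from the `set` example might return something like the following (the similarity numbers are made up for the example):
```
curl -s -X POST http://localhost:8080/stores/find \
     -H "Content-Type: application/json" \
     -d '{"topk": 2, "key": [0.2, 0.1]}'
# illustrative output (actual numbers depend on the stored keys):
# {"keys":[[0.3,0.4],[0.1,0.2]],"values":["bar","foo"],"similarities":[0.89,0.80]}
```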

View File

@@ -304,6 +304,7 @@ The backend will automatically download the required files in order to run the m
| Type | Description |
| --- | --- |
| `AutoModelForCausalLM` | `AutoModelForCausalLM` is a model that can be used to generate sequences. |
| `OVModelForCausalLM` | for OpenVINO models |
| N/A | Defaults to `AutoModel` |
@@ -324,4 +325,35 @@ curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d
"prompt": "Hello, my name is",
"temperature": 0.1, "top_p": 0.1
}'
```
#### Examples
##### OpenVINO
A model configuration file for an OpenVINO Starling model:
```yaml
name: starling-openvino
backend: transformers
parameters:
model: fakezeta/Starling-LM-7B-beta-openvino-int8
context_size: 8192
threads: 6
f16: true
type: OVModelForCausalLM
stopwords:
- <|end_of_turn|>
- <|endoftext|>
prompt_cache_path: "cache"
prompt_cache_all: true
template:
chat_message: |
{{if eq .RoleName "system"}}{{.Content}}<|end_of_turn|>{{end}}{{if eq .RoleName "assistant"}}<|end_of_turn|>GPT4 Correct Assistant: {{.Content}}<|end_of_turn|>{{end}}{{if eq .RoleName "user"}}GPT4 Correct User: {{.Content}}{{end}}
chat: |
{{.Input}}<|end_of_turn|>GPT4 Correct Assistant:
completion: |
{{.Input}}
```
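Assuming the configuration above has been placed in your models directory (the file name is arbitrary, e.g. `starling-openvino.yaml`), a chat completion request against it could look like this:
```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "starling-openvino",
  "messages": [{"role": "user", "content": "How are you doing?"}],
  "temperature": 0.1
}'
```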

View File

@@ -15,19 +15,7 @@ LocalAI's extensible architecture allows you to add your own backends, which can
In some cases you might want to re-build LocalAI from source (for instance to leverage Apple Silicon acceleration), or to build a custom container image with your own backends. This section contains instructions on how to build LocalAI from source.
#### Container image
Requirements:
- Docker or podman, or a container engine
In order to build the `LocalAI` container image locally you can use `docker`, for example:
```
# build the image
docker build -t localai .
docker run localai
```
#### Build LocalAI locally
@@ -45,6 +33,8 @@ To install the dependencies follow the instructions below:
{{< tabs tabTotal="3" >}}
{{% tab tabName="Apple" %}}
Install `xcode` from the App Store
```bash
brew install abseil cmake go grpc protobuf wget
```
@@ -109,12 +99,35 @@ docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS
{{% /alert %}}
#### Container image
Requirements:
- Docker or podman, or a container engine
In order to build the `LocalAI` container image locally you can use `docker`, for example:
```
# build the image
docker build -t localai .
docker run localai
```
There are some build arguments that can be used to customize the build:
| Variable | Default | Description |
| ---------------------| ------- | ----------- |
| `IMAGE_TYPE` | `extras` | Build type. Available: `core`, `extras` |
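For example, to build the lighter `core` image (a sketch; adjust the tag to your needs):
```
# build a smaller image without the extra Python-based backends
docker build --build-arg IMAGE_TYPE=core -t localai:core .
docker run localai:core
```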
### Example: Build on mac
Building on Mac (M1 or M2) works, but you may need to install some prerequisites using `brew`.
Building on Mac (M1, M2 or M3) works, but you may need to install some prerequisites using `brew`.
The following has been tested by one Mac user and found to work. Note that this doesn't use Docker to run the server:
Install `xcode` from the App Store (needed for MetalKit)
```
# install build dependencies
brew install abseil cmake go grpc protobuf wget
@@ -146,8 +159,20 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso
}'
```
### Build with Image generation support
#### Troubleshooting on Mac
If you encounter errors regarding a missing utility metal, install `Xcode` from the App Store.
If completions are slow, ensure that `gpu-layers` in your model yaml matches the number of layers from the model in use (or simply use a high number such as 256).
If you get a compile error: `error: only virtual member functions can be marked 'final'`, reinstall all the necessary brew packages, clean the build, and try again:
```
# reinstall build dependencies
brew reinstall abseil cmake go grpc protobuf wget
make clean
make build
```
**Requirements**: OpenCV, Gomp
@@ -239,13 +264,12 @@ make BUILD_TYPE=sycl_f32 build # for float32
#### Metal (Apple Silicon)
```
make BUILD_TYPE=metal build
make build
# Set `gpu_layers: 1` to your YAML model config file and `f16: true`
# Note: only models quantized with q4_0 are supported!
# correct build type is automatically used on mac (BUILD_TYPE=metal)
# Set `gpu_layers: 256` (or equal to the number of model layers) to your YAML model config file and `f16: true`
```
### Windows compatibility
Make sure to give enough resources to the running container. See https://github.com/go-skynet/LocalAI/issues/2

View File

@@ -10,17 +10,8 @@ icon = "rocket_launch"
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run [LLMs]({{%relref "docs/features/text-generation" %}}), generate images and audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families and architectures.
## Installation Methods
LocalAI is available as a container image and binary, compatible with various container engines like Docker, Podman, and Kubernetes. Container images are published on [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest) and [Docker Hub](https://hub.docker.com/r/localai/localai). Binaries can be downloaded from [GitHub](https://github.com/mudler/LocalAI/releases).
{{% alert icon="💡" %}}
**Hardware Requirements:** The hardware requirements for LocalAI vary based on the model size and quantization method used. For performance benchmarks with different backends, such as `llama.cpp`, visit [this link](https://github.com/ggerganov/llama.cpp#memorydisk-requirements). The `rwkv` backend is noted for its lower resource consumption.
{{% /alert %}}
## Prerequisites
Before you begin, ensure you have a container engine installed if you are not using the binaries. Suitable options include Docker or Podman. For installation instructions, refer to the following guides:
@@ -29,171 +20,286 @@ Before you begin, ensure you have a container engine installed if you are not us
- [Install Podman (Linux)](https://podman.io/getting-started/installation)
- [Install Docker engine (Servers)](https://docs.docker.com/engine/install/#get-started)
## Running Models
> _Do you already have a model file? Skip to [Run models manually]({{%relref "docs/getting-started/manual" %}})_.
LocalAI allows one-click runs with popular models. It downloads the model and starts the API with the model loaded.
There are different categories of models: [LLMs]({{%relref "docs/features/text-generation" %}}), [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) , [Embeddings]({{%relref "docs/features/embeddings" %}}), [Audio to Text]({{%relref "docs/features/audio-to-text" %}}), and [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) depending on the backend being used and the model architecture.
{{% alert icon="💡" %}}
To customize the models, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}). For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI/tree/master/examples/configurations); the configurations for the models below are available [here](https://github.com/mudler/LocalAI/tree/master/embedded/models).
**Hardware Requirements:** The hardware requirements for LocalAI vary based on the model size and quantization method used. For performance benchmarks with different backends, such as `llama.cpp`, visit [this link](https://github.com/ggerganov/llama.cpp#memorydisk-requirements). The `rwkv` backend is noted for its lower resource consumption.
{{% /alert %}}
{{< tabs tabTotal="3" >}}
{{% tab tabName="CPU-only" %}}
## Running LocalAI with All-in-One (AIO) Images
> 💡 Don't need GPU acceleration? Use the CPU images, which are lighter and do not have Nvidia dependencies.
> _Do you already have a model file? Skip to [Run models manually]({{%relref "docs/getting-started/manual" %}}) or [Run other models]({{%relref "docs/getting-started/run-other-models" %}}) to use an already-configured model_.
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
| 🌋 [llava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | GPU-only |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) (with transformers) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) (with llama.cpp) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core codellama-7b-gguf``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 11)" %}}
LocalAI's All-in-One (AIO) images are pre-configured with a set of models and backends to fully leverage almost the entire LocalAI feature set.
These images are available for both CPU and GPU environments. The AIO images are designed to be easy to use and require no configuration.
Using the AIO images is suggested if you don't want to configure the models yourself. If you want to run specific models, you can use the [manual method]({{%relref "docs/getting-started/manual" %}}).
The AIO images come pre-configured with the following features:
- Text to Speech (TTS)
- Speech to Text
- Function calling
- Large Language Models (LLM) for text generation
- Image generation
- Embedding server
> To know which version of CUDA you have available, check with `nvidia-smi` or `nvcc --version`; see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
| 🌋 [llava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda11 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core codellama-7b-gguf``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 12)" %}}
> To know which version of CUDA you have available, check with `nvidia-smi` or `nvcc --version`; see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
| 🌋 [llava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda12 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core codellama-7b-gguf``` |
{{% /tab %}}
{{< /tabs >}}
{{% alert icon="💡" %}}
**Tip** You can specify multiple models to start an instance with those models already loaded, for example to have both llava and phi-2 configured:
Start the image with Docker:
```bash
docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava phi-2
docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
# For Nvidia GPUs:
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-11
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-nvidia-cuda-12
```
Or with a docker-compose file:
```yaml
version: "3.9"
services:
api:
image: localai/localai:latest-aio-cpu
# For a specific version:
# image: localai/localai:{{< version >}}-aio-cpu
    # For Nvidia GPUs, uncomment one of the following (cuda11 or cuda12):
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-11
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-12
# image: localai/localai:latest-aio-gpu-nvidia-cuda-11
# image: localai/localai:latest-aio-gpu-nvidia-cuda-12
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
interval: 1m
timeout: 20m
retries: 5
ports:
- 8080:8080
environment:
- DEBUG=true
# ...
volumes:
- ./models:/build/models:cached
    # uncomment the following section if running with Nvidia GPUs
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: 1
# capabilities: [gpu]
```
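Assuming you saved the file above as `docker-compose.yaml`, you can then start LocalAI with, for example:
```bash
# start the stack in the background
docker compose up -d
# follow the logs while the models are downloaded on the first start
docker compose logs -f
```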
For a list of all the container-images available, see [Container images]({{%relref "docs/reference/container-images" %}}). To learn more about All-in-one images instead, see [All-in-one Images]({{%relref "docs/reference/aio-images" %}}).
{{% alert icon="💡" %}}
**Models caching**: The **AIO** image will download the needed models on the first run if not already present and store those in `/build/models` inside the container. The AIO models will be automatically updated with new versions of AIO images.
You can change the directory inside the container by specifying a `MODELS_PATH` environment variable (or `--models-path`).
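For example, a hypothetical run that keeps models under `/models` inside the container instead of the default path could look like:
```bash
docker run -p 8080:8080 --name local-ai -ti \
  -e MODELS_PATH=/models \
  -v $PWD/models:/models \
  localai/localai:latest-aio-cpu
```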
If you want to use a named model or a local directory, you can mount it as a volume to `/build/models`:
```bash
docker run -p 8080:8080 --name local-ai -ti -v $PWD/models:/build/models localai/localai:latest-aio-cpu
```
or associate a volume:
```bash
docker volume create localai-models
docker run -p 8080:8080 --name local-ai -ti -v localai-models:/build/models localai/localai:latest-aio-cpu
```
{{% /alert %}}
## Container images
## Try it out
LocalAI provides a variety of images to support different environments. These images are available on [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags) and [Docker Hub](https://hub.docker.com/r/localai/localai).
LocalAI does not ship a web UI by default, but you can use 3rd-party projects to interact with it (see also [Integrations]({{%relref "docs/integrations" %}})). You can also test out the API endpoints using `curl`; you can find a few examples below.
For GPU acceleration support on Nvidia graphics cards, use the Nvidia/CUDA images; if you don't have a GPU, use the CPU images. If you have AMD or Apple Silicon, see the [build section]({{%relref "docs/getting-started/build" %}}).
### Text Generation
Creates a model response for the given chat conversation. [OpenAI documentation](https://platform.openai.com/docs/api-reference/chat/create).
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}] }'
```
</details>
### GPT Vision
Understand images.
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4-vision-preview",
"messages": [
{
"role": "user", "content": [
{"type":"text", "text": "What is in the image?"},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
}
]
}
],
"temperature": 0.9
}'
```
</details>
### Function calling
Call functions
<details>
```bash
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [
{
"role": "user",
"content": "What is the weather like in Boston?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["location"]
}
}
}
],
"tool_choice": "auto"
}'
```
</details>
### Image Generation
Creates an image given a prompt. [OpenAI documentation](https://platform.openai.com/docs/api-reference/images/create).
<details>
```bash
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" -d '{
"prompt": "A cute baby sea otter",
"size": "256x256"
}'
```
</details>
### Text to speech
Generates audio from the input text. [OpenAI documentation](https://platform.openai.com/docs/api-reference/audio/createSpeech).
<details>
```bash
curl http://localhost:8080/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"model": "tts-1",
"input": "The quick brown fox jumped over the lazy dog.",
"voice": "alloy"
}' \
--output speech.mp3
```
</details>
### Audio Transcription
Transcribes audio into the input language. [OpenAI Documentation](https://platform.openai.com/docs/api-reference/audio/createTranscription).
<details>
First, download a sample audio file to transcribe:
```bash
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
```
Send the example audio file to the transcriptions endpoint:
```bash
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper-1"
```
</details>
### Embeddings Generation
Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. [OpenAI Embeddings](https://platform.openai.com/docs/api-reference/embeddings).
<details>
```bash
curl http://localhost:8080/embeddings \
-X POST -H "Content-Type: application/json" \
-d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}'
```
</details>
{{% alert icon="💡" %}}
**Available Image Types**:
Don't use the model file as `model` in the request unless you want to handle the prompt template yourself.
- Images ending with `-core` are smaller images without pre-downloaded Python dependencies. Use these images if you plan to use the `llama.cpp`, `stablediffusion-ncn`, `tinydream` or `rwkv` backends - if you are not sure which one to use, do **not** use these images.
- FFMpeg is **not** included in the default images due to [its licensing](https://www.ffmpeg.org/legal.html). If you need FFMpeg, use the images ending with `-ffmpeg`. Note that `ffmpeg` is required when using LocalAI's `audio-to-text` features.
- If you are using old or outdated CPUs and no GPUs, you might need to set `REBUILD` to `true` as an environment variable, along with options to disable the flags which your CPU does not support (see the sketch after this note). However, note that inference will be slow. See also [flagset compatibility]({{%relref "docs/getting-started/build#cpu-flagset-compatibility" %}}).
Use model names as you would with OpenAI, as in the examples below - for instance `gpt-4-vision-preview` or `gpt-4`.
{{% /alert %}}
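As a sketch, rebuilding for an older CPU could look like the following. The exact flags to disable depend on your CPU, so treat the values below as placeholders and consult the flagset compatibility page linked above:
```bash
docker run -p 8080:8080 --name local-ai -ti \
  -e REBUILD=true \
  -e CMAKE_ARGS="-DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DLLAMA_F16C=OFF" \
  localai/localai:latest
# the CMake flags above are examples only: disable whatever your CPU lacks
```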
{{< tabs tabTotal="3" >}}
{{% tab tabName="Vanilla / CPU Images" %}}
| Description | Quay | Docker Hub |
| --- | --- |-----------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master` | `localai/localai:master` |
| Latest tag | `quay.io/go-skynet/local-ai:latest` | `localai/localai:latest` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}` | `localai/localai:{{< version >}}` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg` | `localai/localai:{{< version >}}-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg-core` | `localai/localai:{{< version >}}-ffmpeg-core` |
{{% /tab %}}
{{% tab tabName="GPU Images CUDA 11" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-cublas-cuda11` | `localai/localai:master-cublas-cuda11` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-cublas-cuda11` | `localai/localai:latest-cublas-cuda11` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11` | `localai/localai:{{< version >}}-cublas-cuda11` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-ffmpeg` | `localai/localai:{{< version >}}-cublas-cuda11-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-ffmpeg-core` | `localai/localai:{{< version >}}-cublas-cuda11-ffmpeg-core` |
{{% /tab %}}
{{% tab tabName="GPU Images CUDA 12" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-cublas-cuda12` | `localai/localai:master-cublas-cuda12` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-cublas-cuda12` | `localai/localai:latest-cublas-cuda12` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12` | `localai/localai:{{< version >}}-cublas-cuda12` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-ffmpeg` | `localai/localai:{{< version >}}-cublas-cuda12-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-ffmpeg-core` | `localai/localai:{{< version >}}-cublas-cuda12-ffmpeg-core` |
{{% /tab %}}
{{< /tabs >}}
## What's next?
There is much more to explore! Run any model from Hugging Face, generate video, and clone voices with LocalAI - check out the [features]({{%relref "docs/features" %}}) section for a full overview.
Explore further resources and community contributions:
- [Community How to's](https://io.midori-ai.xyz/howtos/)
- [Build LocalAI and the container image]({{%relref "docs/getting-started/build" %}})
- [Run models manually]({{%relref "docs/getting-started/manual" %}})
- [Run other models]({{%relref "docs/getting-started/run-other-models" %}})
- [Container images]({{%relref "docs/reference/container-images" %}})
- [All-in-one Images]({{%relref "docs/reference/aio-images" %}})
- [Examples](https://github.com/mudler/LocalAI/tree/master/examples#examples)
[![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)](https://github.com/mudler/LocalAI/tree/master/examples#examples)

View File

@@ -0,0 +1,126 @@
+++
disableToc = false
title = "Run other Models"
weight = 3
icon = "rocket_launch"
+++
## Running other models
> _Do you already have a model file? Skip to [Run models manually]({{%relref "docs/getting-started/manual" %}})_.
To load models into LocalAI, you can either [configure models manually]({{%relref "docs/getting-started/manual" %}}) or configure LocalAI to pull and configure the models from external sources, like Hugging Face.
To do that, you can point LocalAI to the URL of a YAML configuration file; however, LocalAI also has some popular model configurations embedded in the binary. Below you can find a list of the model configurations that LocalAI has pre-built; see [Model customization]({{%relref "docs/getting-started/customize-model" %}}) for how to configure models from URLs.
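For example - assuming you have a model configuration file published at a reachable URL (the address below is hypothetical) - you could pass it when starting the container:
```bash
docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core \
  https://example.com/configs/mymodel.yaml
```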
There are different categories of models: [LLMs]({{%relref "docs/features/text-generation" %}}), [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) , [Embeddings]({{%relref "docs/features/embeddings" %}}), [Audio to Text]({{%relref "docs/features/audio-to-text" %}}), and [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) depending on the backend being used and the model architecture.
{{% alert icon="💡" %}}
To customize the models, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}). For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI/tree/master/examples/configurations); the configurations for the models below are available [here](https://github.com/mudler/LocalAI/tree/master/embedded/models).
{{% /alert %}}
{{< tabs tabTotal="3" >}}
{{% tab tabName="CPU-only" %}}
> 💡 Don't need GPU acceleration? Use the CPU images, which are lighter and do not have Nvidia dependencies.
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | GPU-only |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) (with transformers) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) (with llama.cpp) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 11)" %}}
> To know which version of CUDA you have available, check with `nvidia-smi` or `nvcc --version`; see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda11 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 12)" %}}
> To know which version of CUDA you have available, check with `nvidia-smi` or `nvcc --version`; see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda12 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{< /tabs >}}
{{% alert icon="💡" %}}
**Tip** You can specify multiple models to start an instance with those models already loaded, for example to have both llava and phi-2 configured:
```bash
docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava phi-2
```
{{% /alert %}}

View File

@@ -31,14 +31,14 @@ icon = "info"
</a>
</p>
[<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker">](https://hub.docker.com/r/localai/localai)
[<img src="https://img.shields.io/badge/quay.io-images-important.svg?">](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
> 💡 Get help - [❓FAQ](https://localai.io/faq/) [❓How tos](https://io.midori-ai.xyz/howtos/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [💭Discord](https://discord.gg/uJAeKSAGDy)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that is compatible with the OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images and audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families and architectures. It does not require a GPU. It is maintained by [mudler](https://github.com/mudler).
<p align="center">
<a href="https://hub.docker.com/r/localai/localai" target="blank">
<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
</a>
<a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
<img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
</a>
</p>
<p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank">
@@ -47,6 +47,34 @@ icon = "info"
<a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
</a>
</p>
> 💡 Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [💭Discord](https://discord.gg/uJAeKSAGDy)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that is compatible with the OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images and audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families and architectures. It does not require a GPU. It is maintained by [mudler](https://github.com/mudler).
## Start LocalAI
Start the image with Docker to get a functional clone of OpenAI 🚀:
```bash
docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
# Do you have an Nvidia GPU? Use one of these instead
# CUDA 11
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-cuda-11
# CUDA 12
# docker run -p 8080:8080 --gpus all --name local-ai -ti localai/localai:latest-aio-gpu-cuda-12
```
See the [💻 Quickstart](https://localai.io/basics/getting_started/) for all the options and ways you can run LocalAI!
## What is LocalAI?
In a nutshell:
@@ -61,8 +89,7 @@ LocalAI is focused on making the AI accessible to anyone. Any contribution, feed
Note that this started as a fun weekend project by [mudler](https://github.com/mudler) to create the necessary pieces for a full AI assistant like `ChatGPT`. The community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
## 🚀 Features
### 🚀 Features
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
@@ -73,6 +100,7 @@ Note that this started just as a fun weekend project by [mudler](https://github.
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
- 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
- 🆕 [Vision API](https://localai.io/features/gpt-vision/)
- 💾 [Stores](https://localai.io/features/stores)
## Contribute and help


@@ -0,0 +1,52 @@
+++
disableToc = false
title = "All-In-One images"
weight = 26
+++
All-In-One images come pre-configured with a set of models and backends to fully leverage almost the entire LocalAI feature set. These images are available for both CPU and GPU environments. The AIO images are designed to be easy to use and require no configuration. The model configurations can be found [here](https://github.com/mudler/LocalAI/tree/master/aio), separated by size.
In the AIO images, models are configured with the names of OpenAI models; however, they are actually backed by Open Source models. The mapping is shown in the table below, followed by an example request:
| Category | Model name | Real model |
| --- | --- | --- |
| Text generation | `gpt-4` | `phi-2` (CPU) or `hermes-2-pro-mistral` (GPU) |
| Multimodal | `gpt-4-vision-preview` | `bakllava` (CPU) or `llava-1.6-mistral` (GPU) |
| Image generation | `stablediffusion` | `stablediffusion` (CPU) or `dreamshaper-8` (GPU) |
| Audio transcription | `whisper-1` | `whisper` with the `whisper-base` model |
| Text to Audio | `tts-1` | the `en-us-amy-low.onnx` model with `rhasspy` |
| Embeddings | `text-embedding-ada-002` | |
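As a quick illustration of this mapping, a request that asks for `gpt-4` is transparently served by the corresponding open model (`phi-2` on CPU, `hermes-2-pro-mistral` on GPU). This is a minimal sketch that assumes an AIO container is already running on port 8080, as described in the Usage section below:
```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "gpt-4",
  "messages": [{"role": "user", "content": "How are you doing?"}]
}'
```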
## Usage
Select the image (CPU or GPU) and start the container with Docker:
```bash
# CPU example
docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
```
LocalAI will automatically download all the required models, and the API will be available at [localhost:8080](http://localhost:8080/v1/models).
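Once the downloads complete, you can verify which models were installed by querying the models endpoint (assuming the default port mapping shown above):
```bash
curl http://localhost:8080/v1/models
```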
## Available images
| Description | Quay | Docker Hub |
| --- | --- |-----------------------------------------------|
| Latest images for CPU | `quay.io/go-skynet/local-ai:latest-aio-cpu` | `localai/localai:latest-aio-cpu` |
| Versioned image (e.g. for CPU) | `quay.io/go-skynet/local-ai:{{< version >}}-aio-cpu` | `localai/localai:{{< version >}}-aio-cpu` |
| Latest images for Nvidia GPU (CUDA11) | `quay.io/go-skynet/local-ai:latest-aio-gpu-nvidia-cuda-11` | `localai/localai:latest-aio-gpu-nvidia-cuda-11` |
| Latest images for Nvidia GPU (CUDA12) | `quay.io/go-skynet/local-ai:latest-aio-gpu-nvidia-cuda-12` | `localai/localai:latest-aio-gpu-nvidia-cuda-12` |
| Latest images for AMD GPU | `quay.io/go-skynet/local-ai:latest-aio-gpu-hipblas` | `localai/localai:latest-aio-gpu-hipblas` |
| Latest images for Intel GPU (sycl f16) | `quay.io/go-skynet/local-ai:latest-aio-gpu-intel-f16` | `localai/localai:latest-aio-gpu-intel-f16` |
| Latest images for Intel GPU (sycl f32) | `quay.io/go-skynet/local-ai:latest-aio-gpu-intel-f32` | `localai/localai:latest-aio-gpu-intel-f32` |
## Available environment variables
The AIO images inherit the same environment variables as the base images and the environment of LocalAI (which you can inspect by calling `--help`). In addition, they support the following extra environment variables, available only in the container image (an example invocation follows the table):
| Variable | Default | Description |
| ---------------------| ------- | ----------- |
| `PROFILE` | Auto-detected | The size of the model to use. Available: `cpu`, `gpu-8g` |
| `MODELS` | Auto-detected | A list of model YAML configuration file URIs/URLs (see also [running models]({{%relref "docs/getting-started/run-other-models" %}})) |
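For example, to force the CPU profile and point the container at an additional model configuration (a minimal sketch; the YAML URL below is purely illustrative, substitute a real configuration file you want to load):
```bash
docker run -p 8080:8080 --name local-ai -ti \
  -e PROFILE=cpu \
  -e MODELS="https://example.com/my-model.yaml" \
  localai/localai:latest-aio-cpu
```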


@@ -0,0 +1,103 @@
+++
disableToc = false
title = "Available Container images"
weight = 25
+++
LocalAI provides a variety of images to support different environments. These images are available on [quay.io](https://quay.io/repository/go-skynet/local-ai?tab=tags) and [Docker Hub](https://hub.docker.com/r/localai/localai).
> _For an All-in-One image with a pre-configured set of models and backends, see the [AIO Images]({{%relref "docs/reference/aio-images" %}})._
For GPU acceleration on Nvidia graphics cards, use the Nvidia/CUDA images; if you don't have a GPU, use the CPU images. If you have an AMD GPU or Apple Silicon, see the [build section]({{%relref "docs/getting-started/build" %}}).
{{% alert icon="💡" %}}
**Available Image Types**:
- Images ending with `-core` are smaller images without pre-downloaded Python dependencies. Use these images if you plan to use the `llama.cpp`, `stablediffusion-ncn`, `tinydream` or `rwkv` backends - if you are not sure which one to use, do **not** use these images.
- Images containing the `aio` tag are all-in-one images with all the features enabled, and come with an opinionated set of configurations.
- FFMpeg is **not** included in the default images due to [its licensing](https://www.ffmpeg.org/legal.html). If you need FFMpeg, use the images ending with `-ffmpeg`. Note that `ffmpeg` is required for LocalAI's `audio-to-text` features.
- If you are using an older CPU without a GPU, you might need to set the `REBUILD` environment variable to `true`, along with options to disable the CPU flags your processor does not support (see the example after this note); however, note that inference will be slow. See also [flagset compatibility]({{%relref "docs/getting-started/build#cpu-flagset-compatibility" %}}).
{{% /alert %}}
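For instance, a rebuild on an older CPU might look like the sketch below. The `CMAKE_ARGS` flags are only illustrative assumptions for disabling unsupported instruction sets; check the [flagset compatibility]({{%relref "docs/getting-started/build#cpu-flagset-compatibility" %}}) page for the flags that apply to your CPU and LocalAI version:
```bash
# Rebuild the backends inside the container, disabling CPU instruction sets
# that older processors lack (flag names are illustrative; verify them
# against the flagset compatibility documentation).
docker run -p 8080:8080 --name local-ai -ti \
  -e REBUILD=true \
  -e CMAKE_ARGS="-DLLAMA_AVX2=OFF -DLLAMA_AVX512=OFF -DLLAMA_FMA=OFF -DLLAMA_F16C=OFF" \
  localai/localai:{{< version >}}
```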
{{< tabs tabTotal="6" >}}
{{% tab tabName="Vanilla / CPU Images" %}}
| Description | Quay | Docker Hub |
| --- | --- |-----------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master` | `localai/localai:master` |
| Latest tag | `quay.io/go-skynet/local-ai:latest` | `localai/localai:latest` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}` | `localai/localai:{{< version >}}` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg` | `localai/localai:{{< version >}}-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg-core` | `localai/localai:{{< version >}}-ffmpeg-core` |
{{% /tab %}}
{{% tab tabName="GPU Images CUDA 11" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-cublas-cuda11` | `localai/localai:master-cublas-cuda11` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-cublas-cuda11` | `localai/localai:latest-cublas-cuda11` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11` | `localai/localai:{{< version >}}-cublas-cuda11` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-ffmpeg` | `localai/localai:{{< version >}}-cublas-cuda11-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-ffmpeg-core` | `localai/localai:{{< version >}}-cublas-cuda11-ffmpeg-core` |
{{% /tab %}}
{{% tab tabName="GPU Images CUDA 12" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-cublas-cuda12` | `localai/localai:master-cublas-cuda12` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-cublas-cuda12` | `localai/localai:latest-cublas-cuda12` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12` | `localai/localai:{{< version >}}-cublas-cuda12` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-ffmpeg` | `localai/localai:{{< version >}}-cublas-cuda12-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-ffmpeg-core` | `localai/localai:{{< version >}}-cublas-cuda12-ffmpeg-core` |
{{% /tab %}}
{{% tab tabName="Intel GPU (sycl f16)" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-sycl-f16` | `localai/localai:master-sycl-f16` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-sycl-f16` | `localai/localai:latest-sycl-f16` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f16` | `localai/localai:{{< version >}}-sycl-f16` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f16-ffmpeg` | `localai/localai:{{< version >}}-sycl-f16-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f16-ffmpeg-core` | `localai/localai:{{< version >}}-sycl-f16-ffmpeg-core` |
{{% /tab %}}
{{% tab tabName="Intel GPU (sycl f32)" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-sycl-f32` | `localai/localai:master-sycl-f32` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-sycl-f32` | `localai/localai:latest-sycl-f32` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f32` | `localai/localai:{{< version >}}-sycl-f32` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f32-ffmpeg` | `localai/localai:{{< version >}}-sycl-f32-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f32-ffmpeg-core` | `localai/localai:{{< version >}}-sycl-f32-ffmpeg-core` |
{{% /tab %}}
{{% tab tabName="AMD GPU" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-hipblas` | `localai/localai:master-hipblas` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-hipblas` | `localai/localai:latest-hipblas` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-hipblas` | `localai/localai:{{< version >}}-hipblas` |
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-hipblas-ffmpeg` | `localai/localai:{{< version >}}-hipblas-ffmpeg` |
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-hipblas-ffmpeg-core` | `localai/localai:{{< version >}}-hipblas-ffmpeg-core` |
{{% /tab %}}
{{< /tabs >}}
## See Also
- [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}})
- [AIO Images]({{%relref "docs/reference/aio-images" %}})


@@ -1,3 +1,3 @@
{
"version": "v2.9.0"
"version": "v2.12.1"
}


@@ -0,0 +1,40 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: bakllava
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "bakllava",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'


@@ -0,0 +1,24 @@
backend: llama
context_size: 8192
f16: false
gpu_layers: 90
name: cerbero
mmap: false
parameters:
model: huggingface://galatolo/cerbero-7b-gguf/ggml-model-Q8_0.gguf
top_k: 80
temperature: 0.2
top_p: 0.7
template:
completion: "{{.Input}}"
chat: "Questa è una conversazione tra un umano ed un assistente AI.\n{{.Input}}\n[|Assistente|] "
roles:
user: "[|Umano|] "
system: "[|Umano|] "
assistant: "[|Assistente|] "
stopwords:
- "[|Umano|]"
trimsuffix:
- "\n"


@@ -0,0 +1,53 @@
name: hermes-2-pro-mistral
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}<tool_call>{{end}}
{{- if eq .RoleName "tool" }}<tool_result>{{end }}
{{- if .Content}}
{{.Content}}
{{- end }}
{{- if .FunctionCall}}{{toJson .FunctionCall}}{{end }}
{{- if .FunctionCall }}</tool_call>{{end }}
{{- if eq .RoleName "tool" }}</tool_result>{{end }}
<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "hermes-2-pro-mistral",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'


@@ -0,0 +1,33 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.5
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.5-7b-mmproj-Q8_0.gguf
parameters:
model: llava-v1.5-7b-Q4_K.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.5-7b-Q4_K.gguf
uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-Q4_K.gguf
- filename: llava-v1.5-7b-mmproj-Q8_0.gguf
uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-mmproj-Q8_0.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.5",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'


@@ -0,0 +1,33 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.6-mistral
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.6-mistral-7b.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q6_K.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.6-mistral",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'


@@ -0,0 +1,37 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.6-vicuna
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: mmproj-vicuna7b-f16.gguf
parameters:
model: vicuna-7b-q5_k.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: vicuna-7b-q5_k.gguf
uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/vicuna-7b-q5_k.gguf
- filename: mmproj-vicuna7b-f16.gguf
uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/mmproj-vicuna7b-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.6-vicuna",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'


@@ -0,0 +1,25 @@
name: phi-2-chat
mmap: true
parameters:
model: huggingface://l3utterfly/phi-2-layla-v1-chatml-gguf/phi-2-layla-v1-chatml-Q8_0.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2-chat",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'


@@ -0,0 +1,30 @@
name: phi-2-orange
mmap: true
parameters:
model: huggingface://l3utterfly/phi-2-orange-GGUF/phi-2-orange.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
description: |
This model is a chatbot that can be used for general conversation.
[Model card](https://huggingface.co/TheBloke/phi-2-orange-GGUF)
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2-orange",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

Some files were not shown because too many files have changed in this diff.