fixup: create piper libdir also when not built

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Merge branch 'master' into enable_gpu
2026-07-07 14:56:58 -04:00 · 2023-11-12 22:17:11 +01:00 · 2023-11-11 19:20:36 +01:00 · 2023-11-11 18:40:48 +01:00 · 2023-11-11 18:40:26 +01:00 · 2023-11-11 13:14:59 +01:00
191 changed files with 32377 additions and 962 deletions
--- a/.env
+++ b/.env
@@ -23,6 +23,12 @@ MODELS_PATH=/models
 ## Enable debug mode
 # DEBUG=true

+## Disables COMPEL (Diffusers)
+# COMPEL=0
+
+## Enable/Disable single backend (useful if only one GPU is available)
+# SINGLE_ACTIVE_BACKEND=true
+
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -44,3 +50,23 @@ MODELS_PATH=/models

 ## Specify a default upload limit in MB (whisper)
 # UPLOAD_LIMIT
+
+## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
+# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+
+### Advanced settings ###
+### Those are not really used by LocalAI, but from components in the stack ###
+##
+### Preload libraries
+# LD_PRELOAD=
+
+### Huggingface cache for models
+# HUGGINGFACE_HUB_CACHE=/usr/local/huggingface
+
+### Python backends GRPC max workers
+### Default number of workers for GRPC Python backends.
+### This actually controls wether a backend can process multiple requests or not.
+# PYTHON_GRPC_MAX_WORKERS=1
+
+### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
+# LLAMACPP_PARALLEL=1
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -8,16 +8,24 @@ This PR fixes #
 **[Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)**
 - [ ] Yes, I signed my commits.
 
-
 <!--
 Thank you for contributing to LocalAI! 

-Contributing Conventions:
+Contributing Conventions
+-------------------------

-1. Include descriptive PR titles with [<component-name>] prepended.
-2. Build and test your changes before submitting a PR. 
+The draft above helps to give a quick overview of your PR.
+
+Remember to remove this comment and to at least:
+
+1. Include descriptive PR titles with [<component-name>] prepended. We use [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/).
+2. Build and test your changes before submitting a PR (`make build`). 
 3. Sign your commits
+4. **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below).
+5. **X/Twitter handle:** we announce bigger features on X/Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out!

 By following the community's contribution conventions upfront, the review process will 
 be accelerated and your PR merged more quickly.
+
+If no one reviews your PR within a few days, please @-mention @mudler.
 -->
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,6 +12,9 @@ jobs:
          - repository: "go-skynet/go-llama.cpp"
            variable: "GOLLAMA_VERSION"
            branch: "master"
+          - repository: "ggerganov/llama.cpp"
+            variable: "CPPLLAMA_VERSION"
+            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
@@ -41,7 +44,7 @@ jobs:
            branch: "master"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -14,15 +14,21 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  docker:
+  image-build:
    strategy:
      matrix:
        include:
          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
          - build-type: 'cublas'
            cuda-major-version: 11
            cuda-minor-version: 7
@@ -37,11 +43,6 @@ jobs:
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12'
            ffmpeg: ''
-          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg'
-            ffmpeg: 'true'
          - build-type: 'cublas'
            cuda-major-version: 11
            cuda-minor-version: 7
@@ -57,46 +58,57 @@ jobs:
            tag-suffix: '-cublas-cuda12-ffmpeg'
            ffmpeg: 'true'

-    runs-on: ubuntu-latest
+    runs-on: arc-runner-set 
    steps:
-      - name: Release space from worker
+      - name: Force Install GIT latest
        run: |
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          df -h
-          echo
-          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
-          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
-          sudo rm -rf /usr/local/lib/android
-          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
-          sudo rm -rf /usr/share/dotnet
-          sudo apt-get remove -y '^mono-.*' || true
-          sudo apt-get remove -y '^ghc-.*' || true
-          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
-          sudo apt-get remove -y 'php.*' || true
-          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
-          sudo apt-get remove -y '^google-.*' || true
-          sudo apt-get remove -y azure-cli || true
-          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
-          sudo apt-get remove -y '^gfortran-.*' || true
-          sudo apt-get autoremove -y
-          sudo apt-get clean
-          echo
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          sudo rm -rfv build || true
-          df -h
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
      - name: Checkout
-        uses: actions/checkout@v3
-
+        uses: actions/checkout@v4
+      # - name: Release space from worker
+      #   run: |
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     df -h
+      #     echo
+      #     sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+      #     sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+      #     sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+      #     sudo rm -rf /usr/local/lib/android
+      #     sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+      #     sudo rm -rf /usr/share/dotnet
+      #     sudo apt-get remove -y '^mono-.*' || true
+      #     sudo apt-get remove -y '^ghc-.*' || true
+      #     sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+      #     sudo apt-get remove -y 'php.*' || true
+      #     sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+      #     sudo apt-get remove -y '^google-.*' || true
+      #     sudo apt-get remove -y azure-cli || true
+      #     sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+      #     sudo apt-get remove -y '^gfortran-.*' || true
+      #     sudo apt-get remove -y microsoft-edge-stable || true
+      #     sudo apt-get remove -y firefox || true
+      #     sudo apt-get remove -y powershell || true
+      #     sudo apt-get remove -y r-base-core || true
+      #     sudo apt-get autoremove -y
+      #     sudo apt-get clean
+      #     echo
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     sudo rm -rfv build || true
+      #     df -h
      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@v4
+        uses: docker/metadata-action@v5
        with:
          images: quay.io/go-skynet/local-ai
          tags: |
@@ -118,14 +130,14 @@ jobs:

      - name: Login to DockerHub
        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}

      - name: Build and push
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -19,7 +19,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
@@ -29,6 +29,12 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
+
      - name: Build
        id: build
        env:
@@ -60,18 +66,26 @@ jobs:
    runs-on: macOS-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
        with:
          go-version: '>=1.21.0'
+      - name: Dependencies
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && rm -rf grpc
      - name: Build
        id: build
        env:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
          make dist
      - uses: actions/upload-artifact@v3
        with:
--- a/.github/workflows/test-gpu.yml
+++ b/.github/workflows/test-gpu.yml
@@ -0,0 +1,63 @@
+---
+name: 'GPU tests'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-gpu-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-latest:
+    runs-on: gpu
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y make wget
+      - name: Build
+        run: |
+          if [ ! -e /run/systemd/system ]; then
+            sudo mkdir /run/systemd/system
+          fi
+          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
+          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            BUILD_TYPE=cublas \
+            prepare-e2e run-e2e-image test-e2e
+      - name: Release space from worker ♻
+        if: always()
+        run: |
+          sudo rm -rf build || true
+          sudo rm -rf bin || true
+          sudo rm -rf dist || true
+          sudo docker logs $(sudo docker ps -q --filter ancestor=localai-tests) > logs.txt
+          sudo cat logs.txt || true
+          sudo rm -rf logs.txt
+          make clean || true
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            teardown-e2e || true
+          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }} || true
+          docker system prune -f -a --volumes || true
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -14,14 +14,46 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  ubuntu-latest:
+  tests-linux:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        go-version: ['1.21.x']
    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -35,38 +67,43 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          sudo pip install -r extra/requirements.txt
+          
+          sudo rm -rfv /usr/bin/conda || true
+          PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface

-          sudo mkdir /build && sudo chmod -R 777 /build && cd /build && \
-          curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v1.11.0.tar.gz" | \
-          tar -xzvf - && \
-          mkdir -p "spdlog-1.11.0/build" && \
-          cd "spdlog-1.11.0/build" && \
-          cmake ..  && \
-          make -j8 && \
-          sudo cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
-          cd /build && \
-          mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
-          curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v1.0.0/libpiper_phonemize-amd64.tar.gz" | \
-          tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
-          sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+          # Pre-build piper before we start tests in order to have shared libraries in place
+          make go-piper && \
+          GO_TAGS="tts" make -C go-piper piper.o && \
+          sudo cp -rfv go-piper/piper/build/pi/lib/. /usr/lib/ && \
+
+          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
+          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
      - name: Test
        run: |
-          ESPEAK_DATA="/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data" GO_TAGS="tts stablediffusion" make test
+          GO_TAGS="stablediffusion tts" make test

-  macOS-latest:
+  tests-apple:
    runs-on: macOS-latest
    strategy:
      matrix:
        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -76,6 +113,14 @@ jobs:
      # You can test your matrix by printing the current Go version
      - name: Display Go version
        run: go version
+      - name: Dependencies
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && rm -rf grpc
      - name: Test
        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,8 @@ go-ggllm
 __pycache__/
 *.a
 get-sources
+/backend/cpp/llama/grpc-server
+/backend/cpp/llama/llama.cpp

 go-ggml-transformers
 go-gpt2
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,72 @@
+# Contributing to localAI
+
+Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.
+
+## Table of Contents
+
+- [Getting Started](#getting-started)
+  - [Prerequisites](#prerequisites)
+  - [Setting up the Development Environment](#setting-up-the-development-environment)
+- [Contributing](#contributing)
+  - [Submitting an Issue](#submitting-an-issue)
+  - [Creating a Pull Request (PR)](#creating-a-pull-request-pr)
+- [Coding Guidelines](#coding-guidelines)
+- [Testing](#testing)
+- [Documentation](#documentation)
+- [Community and Communication](#community-and-communication)
+
+
+
+## Getting Started
+
+### Prerequisites
+
+- Golang [1.21]
+- Git
+- macOS/Linux
+
+### Setting up the Development Environment and running localAI in the local environment
+
+1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
+2. Navigate to the project directory: `cd LocalAI`
+3. Install the required dependencies: `make prepare`
+4. Run LocalAI: `make run`
+
+## Contributing
+
+We welcome contributions from everyone! To get started, follow these steps:
+
+### Submitting an Issue
+
+If you find a bug, have a feature request, or encounter any issues, please check the [issue tracker](https://github.com/go-skynet/LocalAI/issues) to see if a similar issue has already been reported. If not, feel free to [create a new issue](https://github.com/go-skynet/LocalAI/issues/new) and provide as much detail as possible.
+
+### Creating a Pull Request (PR)
+
+1. Fork the repository.
+2. Create a new branch with a descriptive name: `git checkout -b [branch name]`
+3. Make your changes and commit them.
+4. Push the changes to your fork: `git push origin [branch name]`
+5. Create a new pull request from your branch to the main project's `main` or `master` branch.
+6. Provide a clear description of your changes in the pull request.
+7. Make any requested changes during the review process.
+8. Once your PR is approved, it will be merged into the main project.
+
+## Coding Guidelines
+
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+
+## Testing
+
+`make test` cannot handle all the model now. Please be sure to add a test case for the new features or the part was changed.
+
+## Documentation
+
+- We are welcome the contribution of the documents, please open new PR in the official document repo [localai-website](https://github.com/go-skynet/localai-website)
+
+## Community and Communication
+
+- You can reach out via the Github issue tracker.
+- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
+- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
+
+---
--- a/160
+++ b/160
@@ -1,22 +1,27 @@
 ARG GO_VERSION=1.21-bullseye
+ARG IMAGE_TYPE=extras
+# extras or core

-FROM golang:$GO_VERSION as requirements
+
+FROM golang:$GO_VERSION as requirements-core

 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
 ARG CUDA_MINOR_VERSION=7
-ARG SPDLOG_VERSION="1.11.0"
-ARG PIPER_PHONEMIZE_VERSION='1.0.0'
 ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/run.sh,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/run.sh,vall-e-x:/build/extra/grpc/vall-e-x/run.sh,vllm:/build/extra/grpc/vllm/run.sh"
 ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates cmake curl patch pip
+    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
+
+
+COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
+RUN update-ca-certificates

 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
@@ -30,63 +35,62 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm -f cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
    ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}

+# OpenBLAS requirements and stable diffusion
+RUN apt-get install -y \
+    libopenblas-dev \
+    libopencv-dev \ 
+    && apt-get clean
+
+# Set up OpenCV
+RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+
+WORKDIR /build
+
+RUN test -n "$TARGETARCH" \
+    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
+
 # Extras requirements
+FROM requirements-core as requirements-extras
+
+RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
+    apt-get update && \
+    apt-get install -y conda
+
 COPY extra/requirements.txt /build/extra/requirements.txt
 ENV PATH="/root/.cargo/bin:${PATH}"
 RUN pip install --upgrade pip
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN if [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
-    fi
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install torch && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
-    fi
-RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+#RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+#        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+#    fi
+#RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+#        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+ #   fi
+#RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt

-WORKDIR /build
+# Vall-e-X
+RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt

-# OpenBLAS requirements
-RUN apt-get install -y libopenblas-dev
-
-# Stable Diffusion requirements
-RUN apt-get install -y libopencv-dev && \
-    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-
-
-# piper requirements
-# Use pre-compiled Piper phonemization library (includes onnxruntime)
-#RUN if echo "${GO_TAGS}" | grep -q "tts"; then \
-RUN test -n "$TARGETARCH" \
-    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
-
-RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz" | \
-    tar -xzvf - && \
-    mkdir -p "spdlog-${SPDLOG_VERSION}/build" && \
-    cd "spdlog-${SPDLOG_VERSION}/build" && \
-    cmake ..  && \
-    make -j8 && \
-    cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
-    cd /build && \
-    mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
-    curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH:-$(go env GOARCH)}${TARGETVARIANT}.tar.gz" | \
-    tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
-    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
 # \
 #    ; fi

 ###################################
 ###################################

-FROM requirements as builder
+FROM requirements-${IMAGE_TYPE} as builder

 ARG GO_TAGS="stablediffusion tts"
-
+ARG GRPC_BACKENDS
+ARG BUILD_GRPC=true
+ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
@@ -101,21 +105,43 @@ RUN make prepare
 COPY . .
 COPY .git .

-RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build
+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+RUN if [ "${BUILD_GRPC}" = "true" ]; then \
+    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+      -DgRPC_BUILD_TESTS=OFF \
+       ../.. && make -j12 install && rm -rf grpc \
+    ; fi
+
+# Rebuild with defaults backends
+RUN make build
+
+RUN if [ ! -d "/build/go-piper/piper/build/pi/lib/" ]; then \
+    mkdir -p /build/go-piper/piper/build/pi/lib/ \
+    touch /build/go-piper/piper/build/pi/lib/keep \
+    ; fi

 ###################################
 ###################################

-FROM requirements
+FROM requirements-${IMAGE_TYPE}

 ARG FFMPEG
 ARG BUILD_TYPE
 ARG TARGETARCH
+ARG IMAGE_TYPE=extras

 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

+ARG CUDA_MAJOR_VERSION=11
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
+ENV NVIDIA_VISIBLE_DEVICES=all
+
 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
    apt-get install -y ffmpeg \
@@ -129,11 +155,49 @@ WORKDIR /build
 # https://github.com/go-skynet/LocalAI/pull/434
 COPY . .
 RUN make prepare-sources
+
+# Copy the binary
 COPY --from=builder /build/local-ai ./
-# To resolve exllama import error
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
+
+# Copy shared libraries for piper
+COPY --from=builder /build/go-piper/piper/build/pi/lib/* /usr/lib/
+
+# do not let stablediffusion rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+
+## Duplicated from Makefile to avoid having a big layer that's hard to push
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/autogptq \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/bark \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/diffusers \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/vllm \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/vall-e-x \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/exllama \
+    ; fi
+
+# Copy VALLE-X as it's not a real "lib"
+RUN if [ -d /usr/lib/vall-e-x ]; then \
+    cp -rfv /usr/lib/vall-e-x/* ./ ; \ 
+    fi
+
+# we also copy exllama libs over to resolve exllama import error
+RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
    fi
+
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
--- a/195
+++ b/195
@@ -4,10 +4,12 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=bf63302a2be787674e6ca4227a8aaeb95a8eb6b1
+GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

+CPPLLAMA_VERSION?=a75fa576abba9d37f463580c379e4bbf1e1ad03c
+
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
@@ -26,23 +28,23 @@ WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273
 BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

 # go-piper version
-PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7
-
-# go-bloomz version
-BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f
+PIPER_VERSION?=736f6fb639ab8e3397356e48eeb6bdcb9da88a78

 # stablediffusion version
 STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632

-# Go-ggllm
-GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b
-
 export BUILD_TYPE?=
+export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
+export CMAKE_ARGS?=
 CGO_LDFLAGS?=
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=git

+TEST_DIR=/tmp/test
+
+RANDOM := $(shell bash -c 'echo $$RANDOM')
+
 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
 LD_FLAGS?=
@@ -50,7 +52,6 @@ override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION
 override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"

 OPTIONAL_TARGETS?=
-ESPEAK_DATA?=

 OS := $(shell uname -s)
 ARCH := $(shell uname -m)
@@ -60,13 +61,19 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

+# Default Docker bridge IP
+E2E_BRIDGE_IP?=172.17.0.1
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif

-# workaround for rwkv.cpp
 ifeq ($(UNAME_S),Darwin)
-        CGO_LDFLAGS += -lcblas -framework Accelerate 
+	CGO_LDFLAGS += -lcblas -framework Accelerate
+ifneq ($(BUILD_TYPE),metal)
+    # explicit disable metal if on Darwin and metal is disabled
+	CMAKE_ARGS+=-DLLAMA_METAL=OFF
+endif
 endif

 ifeq ($(BUILD_TYPE),openblas)
@@ -78,6 +85,18 @@ ifeq ($(BUILD_TYPE),cublas)
 	export LLAMA_CUBLAS=1
 endif

+ifeq ($(BUILD_TYPE),hipblas)
+	ROCM_HOME ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	# Llama-stable has no hipblas support, so override it here.
+	export STABLE_BUILD_TYPE=
+	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
+	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link
+endif
+
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	export LLAMA_METAL=1
@@ -100,10 +119,18 @@ endif
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
+	PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/go-piper/piper/src/cpp -I$(shell pwd)/go-piper/piper/build/fi/include -I$(shell pwd)/go-piper/piper/build/pi/include -I$(shell pwd)/go-piper/piper/build/si/include
+ 	PIPER_CGO_LDFLAGS+=-L$(shell pwd)/go-piper/piper/build/fi/lib -L$(shell pwd)/go-piper/piper/build/pi/lib -L$(shell pwd)/go-piper/piper/build/si/lib -lfmt -lspdlog
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
+
+# If empty, then we build all
+ifeq ($(GRPC_BACKENDS),)
+	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
+endif

 .PHONY: all test build vendor

@@ -114,14 +141,6 @@ gpt4all:
 	git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
 	cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1

-## go-ggllm
-go-ggllm:
-	git clone --recurse-submodules https://github.com/mudler/go-ggllm.cpp go-ggllm
-	cd go-ggllm && git checkout -b build $(GOGGLLM_VERSION) && git submodule update --init --recursive --depth 1
-
-go-ggllm/libggllm.a: go-ggllm
-	$(MAKE) -C go-ggllm BUILD_TYPE=$(BUILD_TYPE) libggllm.a
-
 ## go-piper
 go-piper:
 	git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
@@ -148,14 +167,6 @@ go-rwkv:
 go-rwkv/librwkv.a: go-rwkv
 	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

-## bloomz
-bloomz:
-	git clone --recurse-submodules https://github.com/go-skynet/bloomz.cpp bloomz
-	cd bloomz && git checkout -b build $(BLOOMZ_VERSION) && git submodule update --init --recursive --depth 1
-
-bloomz/libbloomz.a: bloomz
-	cd bloomz && make libbloomz.a
-
 go-bert/libgobert.a: go-bert
 	$(MAKE) -C go-bert libgobert.a

@@ -165,14 +176,10 @@ backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
 	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

-backend-assets/espeak-ng-data:
+backend-assets/espeak-ng-data: go-piper
 	mkdir -p backend-assets/espeak-ng-data
-ifdef ESPEAK_DATA
-	@cp -rf $(ESPEAK_DATA)/. backend-assets/espeak-ng-data
-else
-	@echo "ESPEAK_DATA not set, skipping tts. Note that this will break the tts functionality."
-	@touch backend-assets/espeak-ng-data/keep
-endif
+	$(MAKE) -C go-piper piper.o
+	@cp -rf go-piper/piper/build/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data

 gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a
@@ -204,12 +211,12 @@ go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

 go-llama-stable/libbinding.a: go-llama-stable
-	$(MAKE) -C go-llama-stable BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

-go-piper/libpiper_binding.a:
+go-piper/libpiper_binding.a: go-piper
 	$(MAKE) -C go-piper libpiper_binding.a example/main

-get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
 	touch $@

 replace:
@@ -218,10 +225,8 @@ replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
-	$(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
-	$(GOCMD) mod edit -replace github.com/mudler/go-ggllm.cpp=$(shell pwd)/go-ggllm

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -237,19 +242,17 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C whisper.cpp clean
 	$(MAKE) -C go-stable-diffusion clean
 	$(MAKE) -C go-bert clean
-	$(MAKE) -C bloomz clean
 	$(MAKE) -C go-piper clean
-	$(MAKE) -C go-ggllm clean
 	$(MAKE) build

-prepare: prepare-sources $(OPTIONAL_TARGETS) 
+prepare: prepare-sources $(OPTIONAL_TARGETS)
 	touch $@

 clean: ## Remove build related file
 	$(GOCMD) clean -cache
 	rm -f prepare
 	rm -rf ./go-llama
-	rm -rf ./gpt4all	
+	rm -rf ./gpt4all
 	rm -rf ./go-llama-stable
 	rm -rf ./go-gpt2
 	rm -rf ./go-stable-diffusion
@@ -257,12 +260,14 @@ clean: ## Remove build related file
 	rm -rf ./backend-assets
 	rm -rf ./go-rwkv
 	rm -rf ./go-bert
-	rm -rf ./bloomz
 	rm -rf ./whisper.cpp
 	rm -rf ./go-piper
-	rm -rf ./go-ggllm
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
+	rm -rf ./backend/cpp/grpc/grpc_repo
+	rm -rf ./backend/cpp/grpc/build
+	rm -rf ./backend/cpp/grpc/installed_packages
+	$(MAKE) -C backend/cpp/llama clean

 ## Build:

@@ -285,12 +290,12 @@ run: prepare ## run local-ai
 test-models/testmodel:
 	mkdir test-models
 	mkdir test-dir
-	wget https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
-	wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
-	wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O test-models/bert
-	wget https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
-	wget https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
-	wget https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
+	wget -q https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
+	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
+	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
+	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
+	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
+	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models

 prepare-test: grpcs
@@ -301,14 +306,34 @@ test: prepare test-models/testmodel grpcs
 	@echo 'Running tests'
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts 5 --fail-fast -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

+prepare-e2e:
+	mkdir -p $(TEST_DIR)
+	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
+	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
+	docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
+
+run-e2e-image:
+	ls -liah $(abspath ./tests/e2e-fixtures)
+	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
+
+test-e2e:
+	@echo 'Running e2e tests'
+	BUILD_TYPE=$(BUILD_TYPE) \
+	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
+
+teardown-e2e:
+	rm -rf $(TEST_DIR) || true
+	docker stop $$(docker ps -q --filter ancestor=localai-tests)
+
 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
@@ -357,16 +382,24 @@ protogen-python:
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto

 ## GRPC
+# Note: it is duplicated in the Dockerfile
+prepare-extra-conda-environments:
+	$(MAKE) -C extra/grpc/autogptq
+	$(MAKE) -C extra/grpc/bark
+	$(MAKE) -C extra/grpc/diffusers
+	$(MAKE) -C extra/grpc/vllm
+	$(MAKE) -C extra/grpc/huggingface
+	$(MAKE) -C extra/grpc/vall-e-x
+	$(MAKE) -C extra/grpc/exllama
+

 backend-assets/grpc:
 	mkdir -p backend-assets/grpc

-backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggllm LIBRARY_PATH=$(shell pwd)/go-ggllm \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/
-
 backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
@@ -376,6 +409,37 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

+## BACKEND CPP LLAMA START
+# Sets the variables in case it has to build the gRPC locally.
+INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
+INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
+ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
+                 -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
+                 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
+                 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
+                 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
+
+backend/cpp/llama/grpc-server:
+ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
+	backend/cpp/grpc/script/build_grpc.sh ${INSTALLED_PACKAGES}
+	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
+	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
+	export PATH=${PATH}:${INSTALLED_PACKAGES}/bin && \
+	CMAKE_ARGS="${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server 
+else
+	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server			
+endif
+## BACKEND CPP LLAMA END
+		
+##
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
+	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+# TODO: every binary should have its own folder instead, so can have different metal implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
+endif
+
 backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
@@ -421,10 +485,6 @@ backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/

-backend-assets/grpc/bloomz: backend-assets/grpc bloomz/libbloomz.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/bloomz LIBRARY_PATH=$(shell pwd)/bloomz \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bloomz ./cmd/grpc/bloomz/
-
 backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/
@@ -432,12 +492,15 @@ backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/

-backend-assets/grpc/stablediffusion: backend-assets/grpc go-stable-diffusion/libstablediffusion.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/
+backend-assets/grpc/stablediffusion: backend-assets/grpc
+	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
+		$(MAKE) go-stable-diffusion/libstablediffusion.a; \
+		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
+		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
+	fi

 backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
+	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/

 backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.
+**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format, pytorch and more. Does not require GPU.

 <p align="center"><b>Follow LocalAI </b></p>

@@ -104,15 +104,30 @@ Note that this started just as a [fun weekend project](https://localai.io/#backs

 Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

-### 💡 Example: Use GPT4ALL-J model
+### 💡 Example: Use Luna-AI Llama model

-See the [documentation](https://localai.io/basics/getting_started/#example-use-gpt4all-j-model-with-docker-compose)
+See the [documentation](https://localai.io/basics/getting_started)

 ### 🔗 Resources

 - [How to build locally](https://localai.io/basics/build/index.html)
 - [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
 - [Projects integrating LocalAI](https://localai.io/integrations/)
+- [How tos section](https://localai.io/howtos/) (curated by our community)
+  
+## Citation
+
+If you utilize this repository, data in a downstream project, please consider citing it with:
+
+```
+@misc{localai,
+  author = {Ettore Di Giacinto},
+  title = {LocalAI: The free, Open source OpenAI alternative},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/go-skynet/LocalAI}},
+```

 ## ❤️ Sponsors

@@ -127,6 +142,11 @@ A huge thank you to our generous sponsors who support this project:
 |  [Spectro Cloud](https://www.spectrocloud.com/)  |  
 |  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs!  |

+And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project. 
+
+- [Sponsor list](https://github.com/sponsors/mudler)
+- JDAM00 (donating HW for the CI)
+
 ## 🌟 Star history

 [![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)
--- a/api/api.go
+++ b/api/api.go
@@ -11,6 +11,7 @@ import (
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/internal"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/assets"

 	"github.com/gofiber/fiber/v2"
@@ -120,6 +121,9 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 	// Default middleware config
 	app.Use(recover.New())
+	if options.Metrics != nil {
+		app.Use(metrics.APIMiddleware(options.Metrics))
+	}

 	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
 	auth := func(c *fiber.Ctx) error {
@@ -168,9 +172,14 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		}{Version: internal.PrintableVersion()})
 	})

-	app.Post("/models/apply", auth, localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cl, galleryService.C, options.Galleries))
-	app.Get("/models/available", auth, localai.ListModelFromGalleryEndpoint(options.Galleries, options.Loader.ModelPath))
-	app.Get("/models/jobs/:uuid", auth, localai.GetOpStatusEndpoint(galleryService))
+	modelGalleryService := localai.CreateModelGalleryService(options.Galleries, options.Loader.ModelPath, galleryService)
+	app.Post("/models/apply", auth, modelGalleryService.ApplyModelGalleryEndpoint())
+	app.Get("/models/available", auth, modelGalleryService.ListModelFromGalleryEndpoint())
+	app.Get("/models/galleries", auth, modelGalleryService.ListModelGalleriesEndpoint())
+	app.Post("/models/galleries", auth, modelGalleryService.AddModelGalleryEndpoint())
+	app.Delete("/models/galleries", auth, modelGalleryService.RemoveModelGalleryEndpoint())
+	app.Get("/models/jobs/:uuid", auth, modelGalleryService.GetOpStatusEndpoint())
+	app.Get("/models/jobs", auth, modelGalleryService.GetAllStatusEndpoint())

 	// openAI compatible API endpoint

@@ -224,5 +233,7 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
 	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))

+	app.Get("/metrics", metrics.MetricsHandler())
+
 	return app, nil
 }
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -15,6 +15,7 @@ import (

 	. "github.com/go-skynet/LocalAI/api"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
@@ -162,8 +163,12 @@ var _ = Describe("API test", func() {
 				},
 			}

+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
+					options.WithMetrics(metricsService),
 					options.WithContext(c),
 					options.WithGalleries(galleries),
 					options.WithModelLoader(modelLoader), options.WithBackendAssets(backendAssets), options.WithBackendAssetsOutput(tmpdir))...)
@@ -363,9 +368,10 @@ var _ = Describe("API test", func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
+				modelName := "codellama"
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					URL:       "github:go-skynet/model-gallery/openllama-3b-gguf.yaml",
-					Name:      "openllama_3b_gguf",
+					URL:       "github:go-skynet/model-gallery/codellama-7b-instruct.yaml",
+					Name:      modelName,
 					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
 				})

@@ -378,17 +384,22 @@ var _ = Describe("API test", func() {
 					return response["processed"].(bool)
 				}, "360s", "10s").Should(Equal(true))

-				By("testing completion")
-				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b_gguf", Prompt: "Count up to five: one, two, three, four, "})
+				By("testing chat")
+				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: modelName, Messages: []openai.ChatCompletionMessage{
+					{
+						Role:    "user",
+						Content: "How much is 2+2?",
+					},
+				}})
 				Expect(err).ToNot(HaveOccurred())
 				Expect(len(resp.Choices)).To(Equal(1))
-				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+				Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")))

 				By("testing functions")
 				resp2, err := client.CreateChatCompletion(
 					context.TODO(),
 					openai.ChatCompletionRequest{
-						Model: "openllama_3b_gguf",
+						Model: modelName,
 						Messages: []openai.ChatCompletionMessage{
 							{
 								Role:    "user",
@@ -424,7 +435,7 @@ var _ = Describe("API test", func() {
 				var res map[string]string
 				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
 				Expect(err).ToNot(HaveOccurred())
-				Expect(res["location"]).To(Equal("San Francisco, California"), fmt.Sprint(res))
+				Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
 				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 			})
@@ -446,7 +457,7 @@ var _ = Describe("API test", func() {
 				Eventually(func() bool {
 					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
 					return response["processed"].(bool)
-				}, "360s", "10s").Should(Equal(true))
+				}, "960s", "10s").Should(Equal(true))

 				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-j", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "How are you?"}}})
 				Expect(err).ToNot(HaveOccurred())
@@ -473,9 +484,13 @@ var _ = Describe("API test", func() {
 				},
 			}

+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithContext(c),
+					options.WithMetrics(metricsService),
 					options.WithAudioDir(tmpdir),
 					options.WithImageDir(tmpdir),
 					options.WithGalleries(galleries),
@@ -577,12 +592,15 @@ var _ = Describe("API test", func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 			c, cancel = context.WithCancel(context.Background())

-			var err error
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
 					options.WithContext(c),
 					options.WithModelLoader(modelLoader),
+					options.WithMetrics(metricsService),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
 			go app.Listen("127.0.0.1:9090")
@@ -669,7 +687,7 @@ var _ = Describe("API test", func() {
 					Input: []string{"sun", "cat"},
 				},
 			)
-			Expect(err).ToNot(HaveOccurred())
+			Expect(err).ToNot(HaveOccurred(), err)
 			Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
 			Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))

@@ -786,10 +804,13 @@ var _ = Describe("API test", func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 			c, cancel = context.WithCancel(context.Background())

-			var err error
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithContext(c),
+					options.WithMetrics(metricsService),
 					options.WithModelLoader(modelLoader),
 					options.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
 			)
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -20,6 +20,9 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 			SchedulerType: c.Diffusers.SchedulerType,
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,
+			LoraAdapter:   c.LoraAdapter,
+			LoraScale:     c.LoraScale,
+			LoraBase:      c.LoraBase,
 			IMG2IMG:       c.Diffusers.IMG2IMG,
 			CLIPModel:     c.Diffusers.ClipModel,
 			CLIPSubfolder: c.Diffusers.ClipSubFolder,
--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -6,6 +6,7 @@ import (
 	"regexp"
 	"strings"
 	"sync"
+	"unicode/utf8"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
@@ -25,7 +26,7 @@ type TokenUsage struct {
 	Completion int
 }

-func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model

 	grpcOpts := gRPCModelOpts(c)
@@ -71,6 +72,7 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c
 	fn := func() (LLMResponse, error) {
 		opts := gRPCPredictOpts(c, loader.ModelPath)
 		opts.Prompt = s
+		opts.Images = images

 		tokenUsage := TokenUsage{}

@@ -97,9 +99,23 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c

 		if tokenCallback != nil {
 			ss := ""
-			err := inferenceModel.PredictStream(ctx, opts, func(s []byte) {
-				tokenCallback(string(s), tokenUsage)
-				ss += string(s)
+
+			var partialRune []byte
+			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
+				partialRune = append(partialRune, chars...)
+
+				for len(partialRune) > 0 {
+					r, size := utf8.DecodeRune(partialRune)
+					if r == utf8.RuneError {
+						// incomplete rune, wait for more bytes
+						break
+					}
+
+					tokenCallback(string(r), tokenUsage)
+					ss += string(r)
+
+					partialRune = partialRune[size:]
+				}
 			})
 			return LLMResponse{
 				Response: ss,
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -38,26 +38,35 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 	}

 	return &pb.ModelOptions{
-		ContextSize:   int32(c.ContextSize),
-		Seed:          int32(c.Seed),
-		NBatch:        int32(b),
-		NoMulMatQ:     c.NoMulMatQ,
-		LoraAdapter:   c.LoraAdapter,
-		LoraBase:      c.LoraBase,
-		NGQA:          c.NGQA,
-		RMSNormEps:    c.RMSNormEps,
-		F16Memory:     c.F16,
-		MLock:         c.MMlock,
-		RopeFreqBase:  c.RopeFreqBase,
-		RopeFreqScale: c.RopeFreqScale,
-		NUMA:          c.NUMA,
-		Embeddings:    c.Embeddings,
-		LowVRAM:       c.LowVRAM,
-		NGPULayers:    int32(c.NGPULayers),
-		MMap:          c.MMap,
-		MainGPU:       c.MainGPU,
-		Threads:       int32(c.Threads),
-		TensorSplit:   c.TensorSplit,
+		ContextSize:    int32(c.ContextSize),
+		Seed:           int32(c.Seed),
+		NBatch:         int32(b),
+		NoMulMatQ:      c.NoMulMatQ,
+		DraftModel:     c.DraftModel,
+		AudioPath:      c.VallE.AudioPath,
+		Quantization:   c.Quantization,
+		MMProj:         c.MMProj,
+		YarnExtFactor:  c.YarnExtFactor,
+		YarnAttnFactor: c.YarnAttnFactor,
+		YarnBetaFast:   c.YarnBetaFast,
+		YarnBetaSlow:   c.YarnBetaSlow,
+		LoraAdapter:    c.LoraAdapter,
+		LoraBase:       c.LoraBase,
+		LoraScale:      c.LoraScale,
+		NGQA:           c.NGQA,
+		RMSNormEps:     c.RMSNormEps,
+		F16Memory:      c.F16,
+		MLock:          c.MMlock,
+		RopeFreqBase:   c.RopeFreqBase,
+		RopeFreqScale:  c.RopeFreqScale,
+		NUMA:           c.NUMA,
+		Embeddings:     c.Embeddings,
+		LowVRAM:        c.LowVRAM,
+		NGPULayers:     int32(c.NGPULayers),
+		MMap:           c.MMap,
+		MainGPU:        c.MainGPU,
+		Threads:        int32(c.Threads),
+		TensorSplit:    c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
 		Device:           c.AutoGPTQ.Device,
@@ -78,6 +87,7 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
 	return &pb.PredictOptions{
 		Temperature:         float32(c.Temperature),
 		TopP:                float32(c.TopP),
+		NDraft:              c.NDraft,
 		TopK:                int32(c.TopK),
 		Tokens:              int32(c.Maxtokens),
 		Threads:             int32(c.Threads),
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -43,6 +43,13 @@ type Config struct {

 	// GRPC Options
 	GRPC GRPC `yaml:"grpc"`
+
+	// Vall-e-x
+	VallE VallE `yaml:"vall-e"`
+}
+
+type VallE struct {
+	AudioPath string `yaml:"audio_path"`
 }

 type FeatureFlag map[string]*bool
@@ -93,7 +100,18 @@ type LLMConfig struct {
 	NUMA            bool     `yaml:"numa"`
 	LoraAdapter     string   `yaml:"lora_adapter"`
 	LoraBase        string   `yaml:"lora_base"`
+	LoraScale       float32  `yaml:"lora_scale"`
 	NoMulMatQ       bool     `yaml:"no_mulmatq"`
+	DraftModel      string   `yaml:"draft_model"`
+	NDraft          int32    `yaml:"n_draft"`
+	Quantization    string   `yaml:"quantization"`
+	MMProj          string   `yaml:"mmproj"`
+
+	RopeScaling    string  `yaml:"rope_scaling"`
+	YarnExtFactor  float32 `yaml:"yarn_ext_factor"`
+	YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
+	YarnBetaFast   float32 `yaml:"yarn_beta_fast"`
+	YarnBetaSlow   float32 `yaml:"yarn_beta_slow"`
 }

 type AutoGPTQ struct {
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"slices"
 	"strings"
 	"sync"

@@ -27,6 +28,7 @@ type galleryOp struct {
 }

 type galleryOpStatus struct {
+	FileName           string  `json:"file_name"`
 	Error              error   `json:"error"`
 	Processed          bool    `json:"processed"`
 	Message            string  `json:"message"`
@@ -50,7 +52,6 @@ func NewGalleryService(modelPath string) *galleryApplier {
 	}
 }

-// prepareModel applies a
 func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {

 	config, err := gallery.GetGalleryConfigFromURL(req.URL)
@@ -76,6 +77,13 @@ func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
 	return g.statuses[s]
 }

+func (g *galleryApplier) getAllStatus() map[string]*galleryOpStatus {
+	g.Lock()
+	defer g.Unlock()
+
+	return g.statuses
+}
+
 func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
 	go func() {
 		for {
@@ -94,7 +102,7 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {

 				// displayDownload displays the download progress
 				progressCallback := func(fileName string, current string, total string, percentage float64) {
-					g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
+					g.updateStatus(op.id, &galleryOpStatus{Message: "processing", FileName: fileName, Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
 					utils.DisplayDownloadFunction(fileName, current, total, percentage)
 				}

@@ -176,18 +184,12 @@ func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galler
 	return processRequests(modelPath, s, cm, galleries, requests)
 }

-/// Endpoints
+/// Endpoint Service

-func GetOpStatusEndpoint(g *galleryApplier) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-
-		status := g.getStatus(c.Params("uuid"))
-		if status == nil {
-			return fmt.Errorf("could not find any status for ID")
-		}
-
-		return c.JSON(status)
-	}
+type ModelGalleryService struct {
+	galleries      []gallery.Gallery
+	modelPath      string
+	galleryApplier *galleryApplier
 }

 type GalleryModel struct {
@@ -195,7 +197,31 @@ type GalleryModel struct {
 	gallery.GalleryModel
 }

-func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan galleryOp, galleries []gallery.Gallery) func(c *fiber.Ctx) error {
+func CreateModelGalleryService(galleries []gallery.Gallery, modelPath string, galleryApplier *galleryApplier) ModelGalleryService {
+	return ModelGalleryService{
+		galleries:      galleries,
+		modelPath:      modelPath,
+		galleryApplier: galleryApplier,
+	}
+}
+
+func (mgs *ModelGalleryService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		status := mgs.galleryApplier.getStatus(c.Params("uuid"))
+		if status == nil {
+			return fmt.Errorf("could not find any status for ID")
+		}
+		return c.JSON(status)
+	}
+}
+
+func (mgs *ModelGalleryService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		return c.JSON(mgs.galleryApplier.getAllStatus())
+	}
+}
+
+func (mgs *ModelGalleryService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(GalleryModel)
 		// Get input data from the request body
@@ -207,11 +233,11 @@ func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan
 		if err != nil {
 			return err
 		}
-		g <- galleryOp{
+		mgs.galleryApplier.C <- galleryOp{
 			req:         input.GalleryModel,
 			id:          uuid.String(),
 			galleryName: input.ID,
-			galleries:   galleries,
+			galleries:   mgs.galleries,
 		}
 		return c.JSON(struct {
 			ID        string `json:"uuid"`
@@ -220,11 +246,11 @@ func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan
 	}
 }

-func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string) func(c *fiber.Ctx) error {
+func (mgs *ModelGalleryService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		log.Debug().Msgf("Listing models from galleries: %+v", galleries)
+		log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)

-		models, err := gallery.AvailableGalleryModels(galleries, basePath)
+		models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
 		if err != nil {
 			return err
 		}
@@ -239,3 +265,56 @@ func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string)
 		return c.Send(dat)
 	}
 }
+
+// NOTE: This is different (and much simpler!) than above! This JUST lists the model galleries that have been loaded, not their contents!
+func (mgs *ModelGalleryService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
+		dat, err := json.Marshal(mgs.galleries)
+		if err != nil {
+			return err
+		}
+		return c.Send(dat)
+	}
+}
+
+func (mgs *ModelGalleryService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(gallery.Gallery)
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+		if slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
+			return gallery.Name == input.Name
+		}) {
+			return fmt.Errorf("%s already exists", input.Name)
+		}
+		dat, err := json.Marshal(mgs.galleries)
+		if err != nil {
+			return err
+		}
+		log.Debug().Msgf("Adding %+v to gallery list", *input)
+		mgs.galleries = append(mgs.galleries, *input)
+		return c.Send(dat)
+	}
+}
+
+func (mgs *ModelGalleryService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(gallery.Gallery)
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+		if !slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
+			return gallery.Name == input.Name
+		}) {
+			return fmt.Errorf("%s is not currently registered", input.Name)
+		}
+		mgs.galleries = slices.DeleteFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
+			return gallery.Name == input.Name
+		})
+		return c.Send(nil)
+	}
+}
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"strings"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -15,15 +16,20 @@ import (
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )

 func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	emptyMessage := ""
+	id := uuid.New().String()
+	created := int(time.Now().Unix())

 	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		initialMessage := schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
 			Object:  "chat.completion.chunk",
@@ -32,6 +38,8 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			resp := schema.OpenAIResponse{
+				ID:      id,
+				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
 				Object:  "chat.completion.chunk",
@@ -73,6 +81,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
 		}

+		if input.ResponseFormat == "json_object" {
+			input.Grammar = grammar.JSONBNF
+		}
+
 		// process functions if we have any defined or if we have a function call string
 		if len(input.Functions) > 0 && config.ShouldUseFunctions() {
 			log.Debug().Msgf("Response needs to process functions")
@@ -132,14 +144,14 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 				}
 			}
 			r := config.Roles[role]
-			contentExists := i.Content != nil && *i.Content != ""
+			contentExists := i.Content != nil && i.StringContent != ""
 			// First attempt to populate content via a chat message specific template
 			if config.TemplateConfig.ChatMessage != "" {
 				chatMessageData := model.ChatMessageTemplateData{
 					SystemPrompt: config.SystemPrompt,
 					Role:         r,
 					RoleName:     role,
-					Content:      *i.Content,
+					Content:      i.StringContent,
 					MessageIndex: messageIndex,
 				}
 				templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
@@ -158,7 +170,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			if content == "" {
 				if r != "" {
 					if contentExists {
-						content = fmt.Sprint(r, " ", *i.Content)
+						content = fmt.Sprint(r, i.StringContent)
 					}
 					if i.FunctionCall != nil {
 						j, err := json.Marshal(i.FunctionCall)
@@ -172,7 +184,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					}
 				} else {
 					if contentExists {
-						content = fmt.Sprint(*i.Content)
+						content = fmt.Sprint(i.StringContent)
 					}
 					if i.FunctionCall != nil {
 						j, err := json.Marshal(i.FunctionCall)
@@ -261,7 +273,9 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 				}

 				resp := &schema.OpenAIResponse{
-					Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					ID:      id,
+					Created: created,
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 					Choices: []schema.Choice{
 						{
 							FinishReason: "stop",
@@ -324,7 +338,11 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					// Otherwise ask the LLM to understand the JSON output and the context, and return a message
 					// Note: This costs (in term of CPU) another computation
 					config.Grammar = ""
-					predFunc, err := backend.ModelInference(input.Context, predInput, o.Loader, *config, o, nil)
+					images := []string{}
+					for _, m := range input.Messages {
+						images = append(images, m.StringImages...)
+					}
+					predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil)
 					if err != nil {
 						log.Error().Msgf("inference error: %s", err.Error())
 						return
@@ -355,6 +373,8 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		}

 		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "chat.completion",
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -6,23 +6,31 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/go-skynet/LocalAI/pkg/grammar"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )

 // https://platform.openai.com/docs/api-reference/completions
 func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	id := uuid.New().String()
+	created := int(time.Now().Unix())
+
 	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			resp := schema.OpenAIResponse{
-				Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
+				ID:      id,
+				Created: created,
+				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 				Choices: []schema.Choice{
 					{
 						Index: 0,
@@ -57,6 +65,10 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}

+		if input.ResponseFormat == "json_object" {
+			input.Grammar = grammar.JSONBNF
+		}
+
 		log.Debug().Msgf("Parameter Config: %+v", config)

 		if input.Stream {
@@ -108,7 +120,9 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 				}

 				resp := &schema.OpenAIResponse{
-					Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					ID:      id,
+					Created: created,
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 					Choices: []schema.Choice{
 						{
 							Index:        0,
@@ -156,6 +170,8 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		}

 		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "text_completion",
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -3,6 +3,7 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -10,6 +11,7 @@ import (
 	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"

 	"github.com/rs/zerolog/log"
 )
@@ -62,7 +64,11 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			result = append(result, r...)
 		}

+		id := uuid.New().String()
+		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "edit",
--- a/api/openai/embeddings.go
+++ b/api/openai/embeddings.go
@@ -3,10 +3,12 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/google/uuid"

 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/gofiber/fiber/v2"
@@ -57,10 +59,14 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
 		}

+		id := uuid.New().String()
+		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
-			Model:  input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Data:   items,
-			Object: "list",
+			ID:      id,
+			Created: created,
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Data:    items,
+			Object:  "list",
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -5,11 +5,14 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
-	"github.com/go-skynet/LocalAI/api/schema"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"
+
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/google/uuid"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -174,8 +177,12 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			}
 		}

+		id := uuid.New().String()
+		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
-			Data: result,
+			ID:      id,
+			Created: created,
+			Data:    result,
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/inference.go
+++ b/api/openai/inference.go
@@ -23,8 +23,13 @@ func ComputeChoices(
 		n = 1
 	}

+	images := []string{}
+	for _, m := range req.Messages {
+		images = append(images, m.StringImages...)
+	}
+
 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(req.Context, predInput, loader, *config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, images, loader, *config, o, tokenCallback)
 	if err != nil {
 		return result, backend.TokenUsage{}, err
 	}
--- a/api/openai/request.go
+++ b/api/openai/request.go
@@ -2,8 +2,11 @@ package openai

 import (
 	"context"
+	"encoding/base64"
 	"encoding/json"
 	"fmt"
+	"io/ioutil"
+	"net/http"
 	"os"
 	"path/filepath"
 	"strings"
@@ -24,7 +27,7 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *sche
 	input.Cancel = cancel
 	// Get input data from the request body
 	if err := c.BodyParser(input); err != nil {
-		return "", nil, err
+		return "", nil, fmt.Errorf("failed parsing request body: %w", err)
 	}

 	modelFile := input.Model
@@ -61,6 +64,37 @@ func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *sche
 	return modelFile, input, nil
 }

+// this function check if the string is an URL, if it's an URL downloads the image in memory
+// encodes it in base64 and returns the base64 string
+func getBase64Image(s string) (string, error) {
+	if strings.HasPrefix(s, "http") {
+		// download the image
+		resp, err := http.Get(s)
+		if err != nil {
+			return "", err
+		}
+		defer resp.Body.Close()
+
+		// read the image data into memory
+		data, err := ioutil.ReadAll(resp.Body)
+		if err != nil {
+			return "", err
+		}
+
+		// encode the image data in base64
+		encoded := base64.StdEncoding.EncodeToString(data)
+
+		// return the base64 string
+		return encoded, nil
+	}
+
+	// if the string instead is prefixed with "data:image/jpeg;base64,", drop it
+	if strings.HasPrefix(s, "data:image/jpeg;base64,") {
+		return strings.ReplaceAll(s, "data:image/jpeg;base64,", ""), nil
+	}
+	return "", fmt.Errorf("not valid string")
+}
+
 func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
 	if input.Echo {
 		config.Echo = input.Echo
@@ -129,6 +163,35 @@ func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
 		}
 	}

+	// Decode each request's message content
+	index := 0
+	for i, m := range input.Messages {
+		switch content := m.Content.(type) {
+		case string:
+			input.Messages[i].StringContent = content
+		case []interface{}:
+			dat, _ := json.Marshal(content)
+			c := []schema.Content{}
+			json.Unmarshal(dat, &c)
+			for _, pp := range c {
+				if pp.Type == "text" {
+					input.Messages[i].StringContent = pp.Text
+				} else if pp.Type == "image_url" {
+					// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
+					base64, err := getBase64Image(pp.ImageURL.URL)
+					if err == nil {
+						input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
+						// set a placeholder for each image
+						input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
+						index++
+					} else {
+						fmt.Print("Failed encoding image", err)
+					}
+				}
+			}
+		}
+	}
+
 	if input.RepeatPenalty != 0 {
 		config.RepeatPenalty = input.RepeatPenalty
 	}
--- a/api/options/options.go
+++ b/api/options/options.go
@@ -7,6 +7,7 @@ import (

 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/rs/zerolog/log"
 )

@@ -24,6 +25,7 @@ type Option struct {
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
 	ApiKeys                             []string
+	Metrics                             *metrics.Metrics

 	Galleries []gallery.Gallery

@@ -99,6 +101,7 @@ func WithStringGalleries(galls string) AppOption {
 	return func(o *Option) {
 		if galls == "" {
 			log.Debug().Msgf("no galleries to load")
+			o.Galleries = []gallery.Gallery{}
 			return
 		}
 		var galleries []gallery.Gallery
@@ -197,3 +200,9 @@ func WithApiKeys(apiKeys []string) AppOption {
 		o.ApiKeys = apiKeys
 	}
 }
+
+func WithMetrics(meter *metrics.Metrics) AppOption {
+	return func(o *Option) {
+		o.Metrics = meter
+	}
+}
--- a/api/schema/openai.go
+++ b/api/schema/openai.go
@@ -55,11 +55,25 @@ type Choice struct {
 	Text         string   `json:"text,omitempty"`
 }

+type Content struct {
+	Type     string     `json:"type" yaml:"type"`
+	Text     string     `json:"text" yaml:"text"`
+	ImageURL ContentURL `json:"image_url" yaml:"image_url"`
+}
+
+type ContentURL struct {
+	URL string `json:"url" yaml:"url"`
+}
+
 type Message struct {
 	// The message role
 	Role string `json:"role,omitempty" yaml:"role"`
 	// The message content
-	Content *string `json:"content" yaml:"content"`
+	Content interface{} `json:"content" yaml:"content"`
+
+	StringContent string   `json:"string_content,omitempty" yaml:"string_content,omitempty"`
+	StringImages  []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
+
 	// A result of a function call
 	FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
 }
--- a/backend/cpp/grpc/.gitignore
+++ b/backend/cpp/grpc/.gitignore
@@ -0,0 +1,3 @@
+installed_packages/
+grpc_build/
+grpc_repo/
--- a/backend/cpp/grpc/script/build_grpc.sh
+++ b/backend/cpp/grpc/script/build_grpc.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# Builds locally from sources the packages needed by the llama cpp backend.
+
+# Makes sure a few base packages exist.
+# sudo apt-get --no-upgrade -y install g++ gcc binutils cmake git build-essential autoconf libtool pkg-config 
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+echo "Script directory: $SCRIPT_DIR"
+
+CPP_INSTALLED_PACKAGES_DIR=$1
+if [ -z ${CPP_INSTALLED_PACKAGES_DIR} ]; then 
+    echo "CPP_INSTALLED_PACKAGES_DIR env variable not set. Don't know where to install: failed."; 
+    echo
+    exit -1
+fi
+
+if [ -d "${CPP_INSTALLED_PACKAGES_DIR}" ]; then
+  echo "gRPC installation directory already exists. Nothing to do."
+  exit 0
+fi
+
+# The depth when cloning a git repo. 1 speeds up the clone when the repo history is not needed.
+GIT_CLONE_DEPTH=1
+
+NUM_BUILD_THREADS=$(nproc --ignore=1)
+
+# Google gRPC --------------------------------------------------------------------------------------
+TAG_LIB_GRPC="v1.59.0"
+GIT_REPO_LIB_GRPC="https://github.com/grpc/grpc.git"
+GRPC_REPO_DIR="${SCRIPT_DIR}/../grpc_repo"
+GRPC_BUILD_DIR="${SCRIPT_DIR}/../grpc_build"
+SRC_DIR_LIB_GRPC="${GRPC_REPO_DIR}/grpc"
+
+echo "SRC_DIR_LIB_GRPC: ${SRC_DIR_LIB_GRPC}"
+echo "GRPC_REPO_DIR: ${GRPC_REPO_DIR}"
+echo "GRPC_BUILD_DIR: ${GRPC_BUILD_DIR}"
+
+mkdir -pv ${GRPC_REPO_DIR}
+
+rm   -rf ${GRPC_BUILD_DIR}
+mkdir -pv ${GRPC_BUILD_DIR}
+
+mkdir -pv ${CPP_INSTALLED_PACKAGES_DIR}
+	
+if [ -d "${SRC_DIR_LIB_GRPC}" ]; then
+  echo "gRPC source already exists locally. Not cloned again."
+else  
+  ( cd ${GRPC_REPO_DIR} && \
+    git clone --depth ${GIT_CLONE_DEPTH} -b ${TAG_LIB_GRPC} ${GIT_REPO_LIB_GRPC} && \
+    cd ${SRC_DIR_LIB_GRPC} && \
+    git submodule update --init --recursive --depth ${GIT_CLONE_DEPTH} 
+  )    
+fi
+
+( cd ${GRPC_BUILD_DIR} && \
+  cmake -G "Unix Makefiles" \
+     -DCMAKE_BUILD_TYPE=Release \
+     -DgRPC_INSTALL=ON \
+     -DEXECUTABLE_OUTPUT_PATH=${CPP_INSTALLED_PACKAGES_DIR}/grpc/bin \
+     -DLIBRARY_OUTPUT_PATH=${CPP_INSTALLED_PACKAGES_DIR}/grpc/lib \
+     -DgRPC_BUILD_TESTS=OFF \
+     -DgRPC_BUILD_CSHARP_EXT=OFF \
+     -DgRPC_BUILD_GRPC_CPP_PLUGIN=ON \
+     -DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF \
+     -DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF \
+     -DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF \
+     -DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF \
+     -DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON \
+     -DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF \
+     -Dprotobuf_WITH_ZLIB=ON \
+     -DRE2_BUILD_TESTING=OFF \
+     -DCMAKE_INSTALL_PREFIX=${CPP_INSTALLED_PACKAGES_DIR}/ \
+     ${SRC_DIR_LIB_GRPC}  && \
+  cmake --build .  -- -j ${NUM_BUILD_THREADS} && \
+  cmake --build .  --target install -- -j ${NUM_BUILD_THREADS} 
+)
+
+rm -rf ${GRPC_BUILD_DIR}
+rm -rf ${GRPC_REPO_DIR}
+
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -0,0 +1,74 @@
+
+## XXX: In some versions of CMake clip wasn't being built before llama.
+## This is an hack for now, but it should be fixed in the future.
+set(TARGET myclip)
+add_library(${TARGET} clip.cpp clip.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+
+set(TARGET grpc-server)
+# END CLIP hack
+set(CMAKE_CXX_STANDARD 17)
+cmake_minimum_required(VERSION 3.15)
+set(TARGET grpc-server)
+set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+set(_REFLECTION grpc++_reflection)
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    link_directories("/opt/homebrew/lib")
+    include_directories("/opt/homebrew/include")
+endif()
+
+find_package(absl CONFIG REQUIRED)
+find_package(Protobuf CONFIG REQUIRED)
+find_package(gRPC CONFIG REQUIRED)
+
+find_program(_PROTOBUF_PROTOC protoc)
+set(_GRPC_GRPCPP grpc++)
+find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${Protobuf_INCLUDE_DIRS})
+
+message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
+
+# Proto file
+get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
+get_filename_component(hw_proto_path "${hw_proto}" PATH)
+
+# Generated sources
+set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
+set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
+set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
+set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
+
+add_custom_command(
+      OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
+      COMMAND ${_PROTOBUF_PROTOC}
+      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+        -I "${hw_proto_path}"
+        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+        "${hw_proto}"
+      DEPENDS "${hw_proto}")
+
+# hw_grpc_proto
+add_library(hw_grpc_proto
+  ${hw_grpc_srcs}
+  ${hw_grpc_hdrs}
+  ${hw_proto_srcs}
+  ${hw_proto_hdrs} )
+
+add_executable(${TARGET} grpc-server.cpp json.hpp )
+target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+  absl::flags_parse
+  gRPC::${_REFLECTION}
+  gRPC::${_GRPC_GRPCPP}
+  protobuf::${_PROTOBUF_LIBPROTOBUF})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -0,0 +1,50 @@
+
+LLAMA_VERSION?=d9b33fe95bd257b36c84ee5769cc048230067d6f
+
+CMAKE_ARGS?=
+BUILD_TYPE?=
+
+# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
+# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblast)
+	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
+endif
+
+llama.cpp:
+	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
+	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
+
+llama.cpp/examples/grpc-server:
+	mkdir -p llama.cpp/examples/grpc-server
+	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+## XXX: In some versions of CMake clip wasn't being built before llama.
+## This is an hack for now, but it should be fixed in the future.
+	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+
+rebuild:
+	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	rm -rf grpc-server
+	$(MAKE) grpc-server
+
+clean:
+	rm -rf llama.cpp
+	rm -rf grpc-server
+
+grpc-server: llama.cpp llama.cpp/examples/grpc-server
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
--- a/backend/cpp/llama/json.hpp
+++ b/backend/cpp/llama/json.hpp
--- a/cmd/grpc/bloomz/main.go
+++ b/cmd/grpc/bloomz/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &bloomz.LLM{}); err != nil {
-		panic(err)
-	}
-}
--- a/cmd/grpc/falcon/main.go
+++ b/cmd/grpc/falcon/main.go
@@ -1,25 +0,0 @@
-package main
-
-// GRPC Falcon server
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	falcon "github.com/go-skynet/LocalAI/pkg/backend/llm/falcon"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &falcon.LLM{}); err != nil {
-		panic(err)
-	}
-}
--- a/custom-ca-certs/.keep
+++ b/custom-ca-certs/.keep
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -12,4 +12,5 @@ services:
      - .env
    volumes:
      - ./models:/models:cached
+      - ./images/:/tmp/generated/images/
    command: ["/usr/bin/local-ai" ]
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -5,7 +5,7 @@ cd /build

 if [ "$REBUILD" != "false" ]; then
 	rm -rf ./local-ai
-	ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build -j${BUILD_PARALLELISM:-1}
+	make build -j${BUILD_PARALLELISM:-1}
 else
 	echo "@@@@@"
 	echo "Skipping rebuild"
--- a/examples/README.md
+++ b/examples/README.md
@@ -159,14 +159,24 @@ Allows to run any LocalAI-compatible model as a backend on the servers of https:

 ### Continue

-<img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">
-
 _by [@gruberdev](https://github.com/gruberdev)_

+<img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">
+
 Demonstrates how to integrate an open-source copilot alternative that enhances code analysis, completion, and improvements. This approach seamlessly integrates with any LocalAI model, offering a more user-friendly experience.

 [Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/continue/)

+### Streamlit bot
+
+_by [@majoshi1](https://github.com/majoshi1)_
+
+![Screenshot](streamlit-bot/streamlit-bot.png)
+
+A chat bot made using `Streamlit` & LocalAI.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/streamlit-bot/)
+
 ## Want to contribute?

 Create an issue, and put `Example: <description>` in the title! We will post your examples here.
--- a/examples/autoGPT/.env.example
+++ b/examples/autoGPT/.env.example
@@ -1,5 +1,9 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=sk---anystringhere
 OPENAI_API_BASE=http://api:8080/v1
 # Models to preload at start
-# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
-PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
+# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings,
+# see other options in the model gallery at https://github.com/go-skynet/model-gallery
+PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
--- a/examples/autoGPT/README.md
+++ b/examples/autoGPT/README.md
@@ -10,12 +10,16 @@ git clone https://github.com/go-skynet/LocalAI

 cd LocalAI/examples/autoGPT

+cp -rfv .env.example .env
+
+# Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
+vim .env
+
 docker-compose run --rm auto-gpt
 ```

 Note: The example automatically downloads the `gpt4all` model as it is under a permissive license. The GPT4All model does not seem to be enough to run AutoGPT. WizardLM-7b-uncensored seems to perform better (with `f16: true`).

-See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.

 ## Without docker

--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: backend monitor
+  type: http
+  seq: 4
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
+  body: none
+  auth: none
+}
--- a/monitor/backend-shutdown.bru
+++ b/monitor/backend-shutdown.bru
@@ -0,0 +1,21 @@
+meta {
+  name: backend-shutdown
+  type: http
+  seq: 3
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,5 @@
+{
+  "version": "1",
+  "name": "LocalAI Test Requests",
+  "type": "collection"
+}
--- a/Requests/environments/localhost.bru
+++ b/Requests/environments/localhost.bru
@@ -0,0 +1,6 @@
+vars {
+  HOST: localhost
+  PORT: 8080
+  DEFAULT_MODEL: gpt-3.5-turbo
+  PROTOCOL: http://
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: get models list
+  type: http
+  seq: 2
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,25 @@
+meta {
+  name: Generate image
+  type: http
+  seq: 1
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+    "prompt": "<positive prompt>|<negative prompt>",
+    "model": "model-name",
+    "step": 51,
+    "size": "1024x1024",
+    "image": ""
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,24 @@
+meta {
+  name: -completions
+  type: http
+  seq: 4
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "prompt": "function downloadFile(string url, string outputPath) {",
+      "max_tokens": 256,
+      "temperature": 0.5
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,23 @@
+meta {
+  name: -edits
+  type: http
+  seq: 5
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "What day of the wek is it?",
+      "instruction": "Fix the spelling mistakes"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: -embeddings
+  type: http
+  seq: 6
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,30 @@
+meta {
+  name: chat completion -simple- 1 message-
+  type: http
+  seq: 4
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+    "model": "{{DEFAULT_MODEL}}",
+    "messages": [
+      {
+        "role": "user",
+        "content": "How could one use friction to cook an egg?"
+      }
+    ],
+    "max_tokens": 256,
+    "temperature": 0.2,
+    "grammar": ""
+  }
+}
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,29 @@
+meta {
+  name: chat-completions -long-
+  type: http
+  seq: 5
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+       "model": "{{DEFAULT_MODEL}}",
+       "messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
+           {"role": "user", "content": "How could one use electricity to cook an egg?"},
+           {"role": "assistant",
+                  "content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
+              },
+              {"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
+       "max_tokens": 1024,
+       "temperature": 0.5
+  }
+}
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,25 @@
+meta {
+  name: chat-completions -stream-
+  type: http
+  seq: 6
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+       "model": "{{DEFAULT_MODEL}}",
+       "messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
+       "max_tokens": 256,
+       "temperature": 0.9,
+       "stream": true
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: add model gallery
+  type: http
+  seq: 10
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
+      "name": "test"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,21 @@
+meta {
+  name: delete model gallery
+  type: http
+  seq: 11
+}
+
+delete {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "name": "test"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: list MODELS in galleries
+  type: http
+  seq: 7
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: list model GALLERIES
+  type: http
+  seq: 8
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,21 @@
+meta {
+  name: model gallery apply -gist-
+  type: http
+  seq: 12
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: model gallery apply
+  type: http
+  seq: 9
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
+      "name": "codellama7b"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: -tts
+  type: http
+  seq: 2
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
+  }
+}
--- a/examples/chainlit/Dockerfile
+++ b/examples/chainlit/Dockerfile
@@ -0,0 +1,16 @@
+# Use an official Python runtime as a parent image
+FROM harbor.home.sfxworks.net/docker/library/python:3.9-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the current directory contents into the container at /app
+COPY requirements.txt /app
+
+# Install any needed packages specified in requirements.txt
+RUN pip install -r requirements.txt
+
+COPY . /app
+
+# Run app.py when the container launches
+CMD ["chainlit", "run", "-h", "--host", "0.0.0.0", "main.py" ]
--- a/examples/chainlit/README.md
+++ b/examples/chainlit/README.md
@@ -0,0 +1,25 @@
+# LocalAI Demonstration with Embeddings and Chainlit
+
+This demonstration shows you how to use embeddings with existing data in `LocalAI`, and how to integrate it with Chainlit for an interactive querying experience. We are using the `llama_index` library to facilitate the embedding and querying processes, and `chainlit` to provide an interactive interface. The `Weaviate` client is used as the embedding source.
+
+## Prerequisites
+
+Before proceeding, make sure you have the following installed:
+- Weaviate client
+- LocalAI and its dependencies
+- Chainlit and its dependencies
+
+## Getting Started
+
+1. Clone this repository:
+2. Navigate to the project directory:
+3. Run the example: `chainlit run main.py`
+
+# Highlight on `llama_index` and `chainlit`
+
+`llama_index` is the key library that facilitates the process of embedding and querying data in LocalAI. It provides a seamless interface to integrate various components, such as `WeaviateVectorStore`, `LocalAI`, `ServiceContext`, and more, for a smooth querying experience.
+
+`chainlit` is used to provide an interactive interface for users to query the data and see the results in real-time. It integrates with llama_index to handle the querying process and display the results to the user.
+
+In this example, `llama_index` is used to set up the `VectorStoreIndex` and `QueryEngine`, and `chainlit` is used to handle the user interactions with `LocalAI` and display the results.
+
--- a/examples/chainlit/config.yaml
+++ b/examples/chainlit/config.yaml
@@ -0,0 +1,16 @@
+localAI:
+  temperature: 0
+  modelName: gpt-3.5-turbo
+  apiBase: http://local-ai.default
+  apiKey: stub
+  streaming: True
+weviate:
+  url: http://weviate.local
+  index: AIChroma
+query:
+  mode: hybrid
+  topK: 1
+  alpha: 0.0
+  chunkSize: 1024
+embedding:
+  model: BAAI/bge-small-en-v1.5
--- a/examples/chainlit/main.py
+++ b/examples/chainlit/main.py
@@ -0,0 +1,82 @@
+import os
+
+import weaviate
+from llama_index.storage.storage_context import StorageContext
+from llama_index.vector_stores import WeaviateVectorStore
+
+from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine
+from llama_index.callbacks.base import CallbackManager
+from llama_index import (
+    LLMPredictor,
+    ServiceContext,
+    StorageContext,
+    VectorStoreIndex,
+)
+import chainlit as cl
+
+from llama_index.llms import LocalAI
+from llama_index.embeddings import HuggingFaceEmbedding
+import yaml
+
+# Load the configuration file
+with open("config.yaml", "r") as ymlfile:
+    cfg = yaml.safe_load(ymlfile)
+
+# Get the values from the configuration file or set the default values
+temperature = cfg['localAI'].get('temperature', 0)
+model_name = cfg['localAI'].get('modelName', "gpt-3.5-turbo")
+api_base = cfg['localAI'].get('apiBase', "http://local-ai.default")
+api_key = cfg['localAI'].get('apiKey', "stub")
+streaming = cfg['localAI'].get('streaming', True)
+weaviate_url = cfg['weviate'].get('url', "http://weviate.default")
+index_name = cfg['weviate'].get('index', "AIChroma")
+query_mode = cfg['query'].get('mode', "hybrid")
+topK = cfg['query'].get('topK', 1)
+alpha = cfg['query'].get('alpha', 0.0)
+embed_model_name = cfg['embedding'].get('model', "BAAI/bge-small-en-v1.5")
+chunk_size = cfg['query'].get('chunkSize', 1024)
+
+
+embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
+
+
+llm = LocalAI(temperature=temperature, model_name=model_name, api_base=api_base, api_key=api_key, streaming=streaming)
+llm.globally_use_chat_completions = True;
+client = weaviate.Client(weaviate_url)
+vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name)
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+@cl.on_chat_start
+async def factory():
+
+    llm_predictor = LLMPredictor(
+        llm=llm
+    )
+    
+    service_context = ServiceContext.from_defaults(embed_model=embed_model, callback_manager=CallbackManager([cl.LlamaIndexCallbackHandler()]), llm_predictor=llm_predictor, chunk_size=chunk_size)
+
+    index = VectorStoreIndex.from_vector_store(
+        vector_store,
+        storage_context=storage_context,
+        service_context=service_context
+    )
+
+    query_engine = index.as_query_engine(vector_store_query_mode=query_mode, similarity_top_k=topK, alpha=alpha, streaming=True)
+
+    cl.user_session.set("query_engine", query_engine)
+
+
+@cl.on_message
+async def main(message: cl.Message):
+    query_engine = cl.user_session.get("query_engine")
+    response = await cl.make_async(query_engine.query)(message.content)
+
+    response_message = cl.Message(content="")
+
+    for token in response.response_gen:
+        await response_message.stream_token(token=token)
+
+    if response.response_txt:
+        response_message.content = response.response_txt
+
+    await response_message.send()
--- a/examples/chainlit/requirements.txt
+++ b/examples/chainlit/requirements.txt
@@ -0,0 +1,7 @@
+llama_hub==0.0.41
+llama_index==0.8.55
+Requests==2.31.0
+weaviate_client==3.25.1
+transformers
+torch
+chainlit
--- a/examples/chatbot-ui-manual/README.md
+++ b/examples/chatbot-ui-manual/README.md
@@ -24,10 +24,13 @@ docker-compose up -d --pull always
 # docker-compose up -d --build
 ```

+Then browse to `http://localhost:3000` to view the Web UI.
+
 ## Pointing chatbot-ui to a separately managed LocalAI service

-If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
-```
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose.yaml` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+
+```yaml
 version: '3.6'

 services:
@@ -40,9 +43,8 @@ services:
      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
 ```

-Once you've edited the Dockerfile, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+Once you've edited the `docker-compose.yaml`, you can start it with `docker compose up`, then browse to `http://localhost:3000` to view the Web UI.

 ## Accessing chatbot-ui

 Open http://localhost:3000 for the Web UI.
-
--- a/examples/chatbot-ui-manual/models
+++ b/examples/chatbot-ui-manual/models
@@ -0,0 +1 @@
+../models
--- a/examples/chatbot-ui/README.md
+++ b/examples/chatbot-ui/README.md
@@ -20,10 +20,13 @@ docker-compose up --pull always
 # docker-compose up -d --build
 ```

+Then browse to `http://localhost:3000` to view the Web UI.
+
 ## Pointing chatbot-ui to a separately managed LocalAI service

-If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
-```
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose.yaml` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+
+```yaml
 version: '3.6'

 services:
@@ -36,9 +39,8 @@ services:
      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
 ```

-Once you've edited the Dockerfile, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+Once you've edited the `docker-compose.yaml`, you can start it with `docker compose up`, then browse to `http://localhost:3000` to view the Web UI.

 ## Accessing chatbot-ui

 Open http://localhost:3000 for the Web UI.
-
--- a/examples/configurations/README.md
+++ b/examples/configurations/README.md
@@ -0,0 +1,42 @@
+## Advanced configuration
+
+This section contains examples on how to install models manually with config files.
+
+### Prerequisites
+
+First clone LocalAI:
+
+```bash
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI
+```
+
+Setup the model you prefer from the examples below and then start LocalAI:
+
+```bash
+docker compose up -d --pull always
+```
+
+If LocalAI is already started, you can restart it with 
+
+```bash
+docker compose restart
+```
+
+See also the getting started: https://localai.io/basics/getting_started/
+
+### Mistral
+
+To setup mistral copy the files inside `mistral` in the `models` folder:
+
+```bash
+cp -r examples/configurations/mistral/* models/
+```
+
+Now download the model:
+
+```bash
+wget https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q6_K.gguf -O models/mistral-7b-openorca.Q6_K.gguf
+```
+
--- a/examples/configurations/llava/README.md
+++ b/examples/configurations/llava/README.md
@@ -0,0 +1,18 @@
+![llava](https://github.com/mudler/LocalAI/assets/2420543/cb0a0897-3b58-4350-af66-e6f4387b58d3)
+
+## Setup
+
+```
+mkdir models
+wget https://huggingface.co/mys/ggml_bakllava-1/resolve/main/ggml-model-q4_k.gguf -O models/ggml-model-q4_k.gguf
+wget https://huggingface.co/mys/ggml_bakllava-1/resolve/main/mmproj-model-f16.gguf -O models/mmproj-model-f16.gguf
+docker run -p 8080:8080 -v $PWD/models:/models -ti --rm quay.io/go-skynet/local-ai:master --models-path /models --threads 4
+```
+
+## Try it out
+
+```
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "llava",
+     "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
+```
--- a/examples/configurations/llava/chat-simple.tmpl
+++ b/examples/configurations/llava/chat-simple.tmpl
@@ -0,0 +1,3 @@
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+{{.Input}}
+ASSISTANT:
--- a/examples/configurations/llava/llava.yaml
+++ b/examples/configurations/llava/llava.yaml
@@ -0,0 +1,20 @@
+
+context_size: 4096
+f16: true
+threads: 11
+gpu_layers: 90
+name: llava
+mmap: true
+backend: llama-cpp
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+parameters:
+  model: ggml-model-q4_k.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+template:
+  chat: chat-simple
+mmproj: mmproj-model-f16.gguf
--- a/examples/configurations/mistral/chatml-block.tmpl
+++ b/examples/configurations/mistral/chatml-block.tmpl
@@ -0,0 +1,3 @@
+{{.Input}}
+<|im_start|>assistant
+
--- a/examples/configurations/mistral/chatml.tmpl
+++ b/examples/configurations/mistral/chatml.tmpl
@@ -0,0 +1,3 @@
+<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
+{{if .Content}}{{.Content}}{{end}}
+<|im_end|>
--- a/examples/chatbot-ui-manual/models/completion.tmpl
+++ b/examples/chatbot-ui-manual/models/completion.tmpl
--- a/examples/configurations/mistral/mistral.yaml
+++ b/examples/configurations/mistral/mistral.yaml
@@ -0,0 +1,16 @@
+name: mistral
+mmap: true
+parameters:
+  model: mistral-7b-openorca.Q6_K.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+template:
+  chat_message: chatml
+  chat: chatml-block
+  completion: completion
+context_size: 4096
+f16: true
+stopwords:
+- <|im_end|>
+threads: 4
--- a/examples/continue/README.md
+++ b/examples/continue/README.md
@@ -28,7 +28,7 @@ For a live demonstration, please click on the link below:
 3. Type `/config` within Continue's VSCode extension, or edit the file located at `~/.continue/config.py` on your system with the following configuration:

    ```py
-    from continuedev.src.continuedev.libs.llm.openai import OpenAI, OpenAIServerInfo
+    from continuedev.src.continuedev.libs.llm.openai import OpenAI

    config = ContinueConfig(
       ...
@@ -36,10 +36,7 @@ For a live demonstration, please click on the link below:
            default=OpenAI(
               api_key="my-api-key",
               model="gpt-3.5-turbo",
-               openai_server_info=OpenAIServerInfo(
-                  api_base="http://localhost:8080",
-                  model="gpt-3.5-turbo"
-               )
+               api_base="http://localhost:8080",
            )
       ),
    )
--- a/examples/discord-bot/.env.example
+++ b/examples/discord-bot/.env.example
@@ -1,3 +1,6 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=x
 DISCORD_BOT_TOKEN=x
 DISCORD_CLIENT_ID=x
--- a/examples/discord-bot/models
+++ b/examples/discord-bot/models
@@ -1 +1 @@
-../chatbot-ui/models/
+../models
--- a/examples/functions/.env.example
+++ b/examples/functions/.env.example
@@ -1,7 +1,11 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=sk---anystringhere
 OPENAI_API_BASE=http://api:8080/v1
 # Models to preload at start
-# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
+# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings,
+# see other options in the model gallery at https://github.com/go-skynet/model-gallery
 PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/openllama-7b-open-instruct.yaml", "name": "gpt-3.5-turbo"}]

 ## Change the default number of threads
--- a/examples/functions/README.md
+++ b/examples/functions/README.md
@@ -10,9 +10,12 @@ git clone https://github.com/go-skynet/LocalAI

 cd LocalAI/examples/functions

+cp -rfv .env.example .env
+
+# Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
+vim .env
+
 docker-compose run --rm functions
 ```

 Note: The example automatically downloads the `openllama` model as it is under a permissive license.
-
-See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.
--- a/examples/insomnia/Insomnia_LocalAI.json
+++ b/examples/insomnia/Insomnia_LocalAI.json
--- a/examples/langchain-chroma/.env.example
+++ b/examples/langchain-chroma/.env.example
@@ -1,3 +1,6 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 THREADS=4
 CONTEXT_SIZE=512
 MODELS_PATH=/models
--- a/examples/langchain-chroma/models
+++ b/examples/langchain-chroma/models
@@ -0,0 +1 @@
+../models
--- a/examples/langchain-chroma/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain-chroma/models/gpt-3.5-turbo.yaml
@@ -1,16 +0,0 @@
-name: gpt-3.5-turbo
-parameters:
-  model: ggml-gpt4all-j
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/langchain-chroma/models/gpt4all.tmpl
+++ b/examples/langchain-chroma/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
-The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
-### Prompt:
-{{.Input}}
-### Response:
--- a/examples/langchain-huggingface/models
+++ b/examples/langchain-huggingface/models
@@ -0,0 +1 @@
+../models
--- a/examples/langchain-huggingface/models/completion.tmpl
+++ b/examples/langchain-huggingface/models/completion.tmpl
@@ -1 +0,0 @@
-{{.Input}}
--- a/examples/langchain-huggingface/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain-huggingface/models/gpt-3.5-turbo.yaml
@@ -1,17 +0,0 @@
-name: gpt-3.5-turbo
-parameters:
-  model: gpt2
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
-backend: "langchain-huggingface"
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/langchain-huggingface/models/gpt4all.tmpl
+++ b/examples/langchain-huggingface/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
-The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
-### Prompt:
-{{.Input}}
-### Response:
--- a/examples/langchain/models
+++ b/examples/langchain/models
@@ -0,0 +1 @@
+../models
--- a/examples/langchain/models/completion.tmpl
+++ b/examples/langchain/models/completion.tmpl
@@ -1 +0,0 @@
-{{.Input}}
--- a/examples/langchain/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain/models/gpt-3.5-turbo.yaml
@@ -1,17 +0,0 @@
-name: gpt-3.5-turbo
-parameters:
-  model: ggml-gpt4all-j # ggml-koala-13B-4bit-128g
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
-backend: "gptj"
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/langchain/models/gpt4all.tmpl
+++ b/examples/langchain/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
-The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
-### Prompt:
-{{.Input}}
-### Response:
--- a/examples/llamaindex/README.md
+++ b/examples/llamaindex/README.md
@@ -0,0 +1,30 @@
+# LocalAI Demonstration with Embeddings
+
+This demonstration shows you how to use embeddings with existing data in LocalAI. We are using the `llama_index` library to facilitate the embedding and querying processes. The `Weaviate` client is used as the embedding source.
+
+## Prerequisites
+
+Before proceeding, make sure you have the following installed:
+- Weaviate client
+- LocalAI and its dependencies
+- llama_index and its dependencies 
+
+## Getting Started
+
+1. Clone this repository:
+
+2. Navigate to the project directory:
+
+3. Run the example:
+
+`python main.py`
+
+```
+Downloading (…)lve/main/config.json: 100%|███████████████████████████| 684/684 [00:00<00:00, 6.01MB/s]
+Downloading model.safetensors: 100%|███████████████████████████████| 133M/133M [00:03<00:00, 39.5MB/s]
+Downloading (…)okenizer_config.json: 100%|███████████████████████████| 366/366 [00:00<00:00, 2.79MB/s]
+Downloading (…)solve/main/vocab.txt: 100%|█████████████████████████| 232k/232k [00:00<00:00, 6.00MB/s]
+Downloading (…)/main/tokenizer.json: 100%|█████████████████████████| 711k/711k [00:00<00:00, 18.8MB/s]
+Downloading (…)cial_tokens_map.json: 100%|███████████████████████████| 125/125 [00:00<00:00, 1.18MB/s]
+LocalAI is a community-driven project that aims to make AI accessible to everyone. It was created by Ettore Di Giacinto and is focused on providing various AI-related features such as text generation with GPTs, text to audio, audio to text, image generation, and more. The project is constantly growing and evolving, with a roadmap for future improvements. Anyone is welcome to contribute, provide feedback, and submit pull requests to help make LocalAI better.
+```
--- a/examples/llamaindex/main.py
+++ b/examples/llamaindex/main.py
@@ -0,0 +1,38 @@
+import os
+
+import weaviate
+
+from llama_index import ServiceContext, VectorStoreIndex, StorageContext
+from llama_index.llms import LocalAI
+from llama_index.vector_stores import WeaviateVectorStore
+from llama_index.storage.storage_context import StorageContext
+
+# Weaviate client setup
+client = weaviate.Client("http://weviate.default")
+
+# Weaviate vector store setup
+vector_store = WeaviateVectorStore(weaviate_client=client, index_name="AIChroma")
+
+# Storage context setup
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+# LocalAI setup
+llm = LocalAI(temperature=0, model_name="gpt-3.5-turbo", api_base="http://local-ai.default", api_key="stub")
+llm.globally_use_chat_completions = True;
+
+# Service context setup
+service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")
+
+# Load index from stored vectors
+index = VectorStoreIndex.from_vector_store(
+    vector_store,
+    storage_context=storage_context,
+    service_context=service_context
+)
+
+# Query engine setup
+query_engine = index.as_query_engine(similarity_top_k=1, vector_store_query_mode="hybrid")
+
+# Query example
+response = query_engine.query("What is LocalAI?")
+print(response)
--- a/examples/localai-webui/docker-compose.yml
+++ b/examples/localai-webui/docker-compose.yml
@@ -8,8 +8,6 @@ services:
      dockerfile: Dockerfile
    ports:
      - 8080:8080
-    env_file:
-      - .env
    volumes:
      - ./models:/models:cached
    command: ["/usr/bin/local-ai"]
--- a/examples/models/.gitignore
+++ b/examples/models/.gitignore
@@ -0,0 +1,7 @@
+# Ignore everything but predefined models
+*
+!.gitignore
+!completion.tmpl
+!embeddings.yaml
+!gpt4all.tmpl
+!gpt-3.5-turbo.yaml
--- a/examples/langchain-chroma/models/completion.tmpl
+++ b/examples/langchain-chroma/models/completion.tmpl
--- a/Show More
+++ b/Show More