fix(deps): update module github.com/google/uuid to v1.4.0

cleanup: drop bloomz and ggllm as now supported by llama.cpp (#1217 )
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-02-02 18:53:32 -05:00 · 2023-10-26 15:59:34 +00:00 · 2023-10-26 07:43:31 +02:00 · 2023-10-25 20:56:25 +02:00 · 2023-10-25 20:56:12 +02:00 · 2023-10-25 20:53:26 +02:00
137 changed files with 4748 additions and 998 deletions
--- a/.env
+++ b/.env
@@ -23,6 +23,12 @@ MODELS_PATH=/models
 ## Enable debug mode
 # DEBUG=true

+## Disables COMPEL (Diffusers)
+# COMPEL=0
+
+## Enable/Disable single backend (useful if only one GPU is available)
+# SINGLE_ACTIVE_BACKEND=true
+
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -44,3 +50,20 @@ MODELS_PATH=/models

 ## Specify a default upload limit in MB (whisper)
 # UPLOAD_LIMIT
+
+## List of external GRPC backends (note: on the container image this variable is already set to use the extra backends available in extra/)
+# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+
+### Advanced settings ###
+### These are not used by LocalAI itself, but by components in the stack ###
+##
+### Preload libraries
+# LD_PRELOAD=
+
+### Huggingface cache for models
+# HUGGINGFACE_HUB_CACHE=/usr/local/huggingface
+
+### Python backends GRPC max workers
+### Default number of workers for GRPC Python backends.
+### This actually controls whether a backend can process multiple requests or not.
+# PYTHON_GRPC_MAX_WORKERS=1
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -8,16 +8,24 @@ This PR fixes #
 **[Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)**
 - [ ] Yes, I signed my commits.
 
-
 <!--
 Thank you for contributing to LocalAI! 

-Contributing Conventions:
+Contributing Conventions
+-------------------------

-1. Include descriptive PR titles with [<component-name>] prepended.
-2. Build and test your changes before submitting a PR. 
+The draft above helps to give a quick overview of your PR.
+
+Remember to remove this comment and to at least:
+
+1. Include descriptive PR titles with [<component-name>] prepended. We use [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/).
+2. Build and test your changes before submitting a PR (`make build`). 
 3. Sign your commits
+4. **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below).
+5. **X/Twitter handle:** we announce bigger features on X/Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out!

 By following the community's contribution conventions upfront, the review process will 
 be accelerated and your PR merged more quickly.
+
+If no one reviews your PR within a few days, please @-mention @mudler.
 -->
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,6 +12,9 @@ jobs:
          - repository: "go-skynet/go-llama.cpp"
            variable: "GOLLAMA_VERSION"
            branch: "master"
+          - repository: "ggerganov/llama.cpp"
+            variable: "CPPLLAMA_VERSION"
+            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
@@ -41,7 +44,7 @@ jobs:
            branch: "master"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -19,7 +19,8 @@ jobs:
      matrix:
        include:
          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
@@ -38,7 +39,7 @@ jobs:
            tag-suffix: '-cublas-cuda12'
            ffmpeg: ''
          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
@@ -82,6 +83,10 @@ jobs:
          sudo apt-get remove -y azure-cli || true
          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get remove -y microsoft-edge-stable || true
+          sudo apt-get remove -y firefox || true
+          sudo apt-get remove -y powershell || true
+          sudo apt-get remove -y r-base-core || true
          sudo apt-get autoremove -y
          sudo apt-get clean
          echo
@@ -92,11 +97,11 @@ jobs:
          sudo rm -rfv build || true
          df -h
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@v4
+        uses: docker/metadata-action@v5
        with:
          images: quay.io/go-skynet/local-ai
          tags: |
@@ -118,14 +123,14 @@ jobs:

      - name: Login to DockerHub
        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}

      - name: Build and push
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -19,7 +19,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
@@ -29,6 +29,12 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
+
      - name: Build
        id: build
        env:
@@ -60,18 +66,26 @@ jobs:
    runs-on: macOS-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v4
        with:
          go-version: '>=1.21.0'
+      # Build gRPC v1.58.0 from source and install it, then delete the checkout.
+      - name: Dependencies
+        run: |
+          # The build runs inside grpc/cmake/build, so step back out to the
+          # starting directory before removing the clone; a bare `rm -rf grpc`
+          # from the build directory would remove nothing.
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && \
+              cd ../../.. && rm -rf grpc
      - name: Build
        id: build
        env:
          CMAKE_ARGS: "${{ matrix.defines }}"
          BUILD_ID: "${{ matrix.build }}"
        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
          make dist
      - uses: actions/upload-artifact@v3
        with:
--- a/.github/workflows/test-gpu.yml
+++ b/.github/workflows/test-gpu.yml
@@ -0,0 +1,63 @@
+---
+name: 'GPU tests'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-gpu-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-latest:
+    runs-on: self-hosted
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y make wget
+      - name: Build
+        run: |
+          if [ ! -e /run/systemd/system ]; then
+            sudo mkdir /run/systemd/system
+          fi
+          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
+          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            BUILD_TYPE=cublas \
+            prepare-e2e run-e2e-image test-e2e
+      # Cleanup step: runs even when earlier steps failed (`if: always()`), dumps the
+      # test container logs and removes build output, the test dir and docker data.
+      - name: Release space from worker ♻
+        if: always()
+        run: |
+          sudo rm -rf build || true
+          sudo rm -rf bin || true
+          sudo rm -rf dist || true
+          # Best-effort: `docker logs` fails when no (or more than one) container
+          # matches the filter; that must not abort the remaining cleanup.
+          sudo docker logs $(sudo docker ps -q --filter ancestor=localai-tests) > logs.txt || true
+          sudo cat logs.txt || true
+          sudo rm -rf logs.txt
+          make clean || true
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            teardown-e2e || true
+          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }}
+          docker system prune -f -a --volumes || true
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,8 +20,40 @@ jobs:
      matrix:
        go-version: ['1.21.x']
    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -35,11 +67,15 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          
+
          sudo apt-get install -y ca-certificates cmake curl patch
          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
          sudo pip install -r extra/requirements.txt

+
+          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
+          GO_TAGS="tts stablediffusion" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
          sudo mkdir /build && sudo chmod -R 777 /build && cd /build && \
          curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v1.11.0.tar.gz" | \
          tar -xzvf - && \
@@ -55,6 +91,12 @@ jobs:
          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
          sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
+
      - name: Test
        run: |
          ESPEAK_DATA="/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data" GO_TAGS="tts stablediffusion" make test
@@ -66,7 +108,7 @@ jobs:
        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
@@ -76,6 +118,14 @@ jobs:
      # You can test your matrix by printing the current Go version
      - name: Display Go version
        run: go version
+      # Build gRPC v1.58.0 from source and install it, then delete the checkout.
+      - name: Dependencies
+        run: |
+          # The build runs inside grpc/cmake/build, so step back out to the
+          # starting directory before removing the clone; a bare `rm -rf grpc`
+          # from the build directory would remove nothing.
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && \
+              cd ../../.. && rm -rf grpc
      - name: Test
        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,72 @@
+# Contributing to localAI
+
+Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.
+
+## Table of Contents
+
+- [Getting Started](#getting-started)
+  - [Prerequisites](#prerequisites)
+  - [Setting up the Development Environment](#setting-up-the-development-environment-and-running-localai-in-the-local-environment)
+- [Contributing](#contributing)
+  - [Submitting an Issue](#submitting-an-issue)
+  - [Creating a Pull Request (PR)](#creating-a-pull-request-pr)
+- [Coding Guidelines](#coding-guidelines)
+- [Testing](#testing)
+- [Documentation](#documentation)
+- [Community and Communication](#community-and-communication)
+
+
+
+## Getting Started
+
+### Prerequisites
+
+- Golang [1.21]
+- Git
+- macOS/Linux
+
+### Setting up the Development Environment and running localAI in the local environment
+
+1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
+2. Navigate to the project directory: `cd LocalAI`
+3. Install the required dependencies: `make prepare`
+4. Run LocalAI: `make run`
+
+## Contributing
+
+We welcome contributions from everyone! To get started, follow these steps:
+
+### Submitting an Issue
+
+If you find a bug, have a feature request, or encounter any issues, please check the [issue tracker](https://github.com/go-skynet/LocalAI/issues) to see if a similar issue has already been reported. If not, feel free to [create a new issue](https://github.com/go-skynet/LocalAI/issues/new) and provide as much detail as possible.
+
+### Creating a Pull Request (PR)
+
+1. Fork the repository.
+2. Create a new branch with a descriptive name: `git checkout -b [branch name]`
+3. Make your changes and commit them.
+4. Push the changes to your fork: `git push origin [branch name]`
+5. Create a new pull request from your branch to the main project's `main` or `master` branch.
+6. Provide a clear description of your changes in the pull request.
+7. Make any requested changes during the review process.
+8. Once your PR is approved, it will be merged into the main project.
+
+## Coding Guidelines
+
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.
+
+## Testing
+
+`make test` cannot cover all the models yet. Please be sure to add a test case for any new feature or any part that was changed.
+
+## Documentation
+
+- We welcome contributions to the documentation; please open a new PR in the official documentation repo [localai-website](https://github.com/go-skynet/localai-website)
+
+## Community and Communication
+
+- You can reach out via the Github issue tracker.
+- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
+- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
+
+---
--- a/102
+++ b/102
@@ -1,6 +1,9 @@
 ARG GO_VERSION=1.21-bullseye
+ARG IMAGE_TYPE=extras
+# extras or core

-FROM golang:$GO_VERSION as requirements
+
+FROM golang:$GO_VERSION as requirements-core

 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
@@ -11,12 +14,13 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py,vall-e-x:/build/extra/grpc/vall-e-x/ttsvalle.py,vllm:/build/extra/grpc/vllm/backend_vllm.py"
 ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates cmake curl patch pip
+    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
+

 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
@@ -30,33 +34,21 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm -f cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
    ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}

-# Extras requirements
-COPY extra/requirements.txt /build/extra/requirements.txt
-ENV PATH="/root/.cargo/bin:${PATH}"
-RUN pip install --upgrade pip
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN if [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
-    fi
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
-        pip install torch && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
-    fi
-RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+# OpenBLAS requirements and stable diffusion
+RUN apt-get install -y \
+    libopenblas-dev \
+    libopencv-dev \ 
+    && apt-get clean
+
+# Set up OpenCV
+RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

 WORKDIR /build

-# OpenBLAS requirements
-RUN apt-get install -y libopenblas-dev
-
-# Stable Diffusion requirements
-RUN apt-get install -y libopencv-dev && \
-    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-
-
 # piper requirements
 # Use pre-compiled Piper phonemization library (includes onnxruntime)
 #RUN if echo "${GO_TAGS}" | grep -q "tts"; then \
@@ -76,17 +68,39 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO
    tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
+    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/ && \
+    rm spdlog-${SPDLOG_VERSION} -rf
+
+# Extras requirements
+FROM requirements-core as requirements-extras
+
+COPY extra/requirements.txt /build/extra/requirements.txt
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN pip install --upgrade pip
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+    fi
+RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+    fi
+RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+
+# Vall-e-X
+RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
+
 # \
 #    ; fi

 ###################################
 ###################################

-FROM requirements as builder
+FROM requirements-${IMAGE_TYPE} as builder

 ARG GO_TAGS="stablediffusion tts"
-
+ARG GRPC_BACKENDS
+ARG BUILD_GRPC=true
+ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
@@ -101,12 +115,23 @@ RUN make prepare
 COPY . .
 COPY .git .

+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+# Optionally (BUILD_GRPC=true) build gRPC v1.58.0 from source and install it.
+# The build runs inside grpc/cmake/build, so step back out before deleting the
+# clone; a bare `rm -rf grpc` from the build directory would remove nothing and
+# leave the checkout in this layer.
+RUN if [ "${BUILD_GRPC}" = "true" ]; then \
+    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+      -DgRPC_BUILD_TESTS=OFF \
+       ../.. && make -j12 install && cd ../../.. && rm -rf grpc \
+    ; fi
+
+# Rebuild with the default backends
 RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build

 ###################################
 ###################################

-FROM requirements
+FROM requirements-${IMAGE_TYPE}

 ARG FFMPEG
 ARG BUILD_TYPE
@@ -116,6 +141,11 @@ ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

+ARG CUDA_MAJOR_VERSION=11
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
+ENV NVIDIA_VISIBLE_DEVICES=all
+
 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
    apt-get install -y ffmpeg \
@@ -129,11 +159,23 @@ WORKDIR /build
 # https://github.com/go-skynet/LocalAI/pull/434
 COPY . .
 RUN make prepare-sources
+
+# Copy the binary
 COPY --from=builder /build/local-ai ./
-# To resolve exllama import error
-RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
+
+# do not let stablediffusion rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+
+# Copy VALLE-X as it's not a real "lib"
+RUN if [ -d /usr/lib/vall-e-x ]; then \
+    cp -rfv /usr/lib/vall-e-x/* ./ ; \ 
+    fi
+
+# we also copy exllama libs over to resolve exllama import error
+RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
    fi
+
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
--- a/128
+++ b/128
@@ -4,10 +4,12 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-GOLLAMA_VERSION?=bf63302a2be787674e6ca4227a8aaeb95a8eb6b1
+GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0

 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

+CPPLLAMA_VERSION?=9d02956443e5c1ded29b7b5ed8a21bc01ba6f563
+
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
@@ -28,21 +30,21 @@ BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
 # go-piper version
 PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7

-# go-bloomz version
-BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f
-
 # stablediffusion version
 STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632

-# Go-ggllm
-GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b
-
 export BUILD_TYPE?=
+export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
+export CMAKE_ARGS?=
 CGO_LDFLAGS?=
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=git

+TEST_DIR=/tmp/test
+
+RANDOM := $(shell bash -c 'echo $$RANDOM')
+
 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
 LD_FLAGS?=
@@ -60,13 +62,19 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

+# Default Docker bridge IP
+E2E_BRIDGE_IP?=172.17.0.1
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif

-# workaround for rwkv.cpp
 ifeq ($(UNAME_S),Darwin)
-        CGO_LDFLAGS += -lcblas -framework Accelerate 
+	CGO_LDFLAGS += -lcblas -framework Accelerate
+ifneq ($(BUILD_TYPE),metal)
+    # explicitly disable Metal on Darwin when BUILD_TYPE is not metal
+	CMAKE_ARGS+=-DLLAMA_METAL=OFF
+endif
 endif

 ifeq ($(BUILD_TYPE),openblas)
@@ -78,6 +86,18 @@ ifeq ($(BUILD_TYPE),cublas)
 	export LLAMA_CUBLAS=1
 endif

+ifeq ($(BUILD_TYPE),hipblas)
+	ROCM_HOME ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	# Llama-stable has no hipblas support, so override it here.
+	export STABLE_BUILD_TYPE=
+	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
+	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link
+endif
+
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	export LLAMA_METAL=1
@@ -103,7 +123,13 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
+
+# If empty, then we build all
+ifeq ($(GRPC_BACKENDS),)
+	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
+endif

 .PHONY: all test build vendor

@@ -114,14 +140,6 @@ gpt4all:
 	git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
 	cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1

-## go-ggllm
-go-ggllm:
-	git clone --recurse-submodules https://github.com/mudler/go-ggllm.cpp go-ggllm
-	cd go-ggllm && git checkout -b build $(GOGGLLM_VERSION) && git submodule update --init --recursive --depth 1
-
-go-ggllm/libggllm.a: go-ggllm
-	$(MAKE) -C go-ggllm BUILD_TYPE=$(BUILD_TYPE) libggllm.a
-
 ## go-piper
 go-piper:
 	git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
@@ -148,14 +166,6 @@ go-rwkv:
 go-rwkv/librwkv.a: go-rwkv
 	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

-## bloomz
-bloomz:
-	git clone --recurse-submodules https://github.com/go-skynet/bloomz.cpp bloomz
-	cd bloomz && git checkout -b build $(BLOOMZ_VERSION) && git submodule update --init --recursive --depth 1
-
-bloomz/libbloomz.a: bloomz
-	cd bloomz && make libbloomz.a
-
 go-bert/libgobert.a: go-bert
 	$(MAKE) -C go-bert libgobert.a

@@ -204,12 +214,12 @@ go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

 go-llama-stable/libbinding.a: go-llama-stable
-	$(MAKE) -C go-llama-stable BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

-go-piper/libpiper_binding.a:
+go-piper/libpiper_binding.a: go-piper
 	$(MAKE) -C go-piper libpiper_binding.a example/main

-get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
 	touch $@

 replace:
@@ -218,10 +228,8 @@ replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
-	$(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
-	$(GOCMD) mod edit -replace github.com/mudler/go-ggllm.cpp=$(shell pwd)/go-ggllm

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -237,19 +245,17 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C whisper.cpp clean
 	$(MAKE) -C go-stable-diffusion clean
 	$(MAKE) -C go-bert clean
-	$(MAKE) -C bloomz clean
 	$(MAKE) -C go-piper clean
-	$(MAKE) -C go-ggllm clean
 	$(MAKE) build

-prepare: prepare-sources $(OPTIONAL_TARGETS) 
+prepare: prepare-sources $(OPTIONAL_TARGETS)
 	touch $@

 clean: ## Remove build related file
 	$(GOCMD) clean -cache
 	rm -f prepare
 	rm -rf ./go-llama
-	rm -rf ./gpt4all	
+	rm -rf ./gpt4all
 	rm -rf ./go-llama-stable
 	rm -rf ./go-gpt2
 	rm -rf ./go-stable-diffusion
@@ -257,12 +263,11 @@ clean: ## Remove build related file
 	rm -rf ./backend-assets
 	rm -rf ./go-rwkv
 	rm -rf ./go-bert
-	rm -rf ./bloomz
 	rm -rf ./whisper.cpp
 	rm -rf ./go-piper
-	rm -rf ./go-ggllm
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
+	$(MAKE) -C backend/cpp/llama clean

 ## Build:

@@ -309,6 +314,26 @@ test: prepare test-models/testmodel grpcs
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

+# prepare-e2e: create $(TEST_DIR), copy the gpu.yaml fixture into it, download
+# the test model unless it is already present, and build the `localai-tests`
+# container image.
+prepare-e2e:
+	mkdir -p $(TEST_DIR)
+	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
+	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
+	docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
+
+# run-e2e-image: list the e2e fixtures directory, then start the
+# `localai-tests` image detached, with $(TEST_DIR) mounted at /models and
+# container port 8080 published on host port 5390.
+# NOTE(review): $(RANDOM) is expanded by make, not by the shell; confirm the
+# Makefile defines RANDOM, otherwise the container name suffix is empty.
+run-e2e-image:
+	ls -liah $(abspath ./tests/e2e-fixtures)
+	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
+
+# test-e2e: run the ginkgo suite under ./tests/e2e with LOCALAI_API pointing
+# at port 5390 on $(E2E_BRIDGE_IP); BUILD_TYPE is forwarded to the test process.
+test-e2e:
+	@echo 'Running e2e tests'
+	BUILD_TYPE=$(BUILD_TYPE) \
+	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
+
+# teardown-e2e: remove $(TEST_DIR) and stop every running container started
+# from the `localai-tests` image. `docker stop` rejects an empty argument
+# list, so the stop is skipped when no matching container is running and
+# teardown does not fail on an already-clean host.
+teardown-e2e:
+	rm -rf $(TEST_DIR) || true
+	ids=$$(docker ps -q --filter ancestor=localai-tests); [ -z "$$ids" ] || docker stop $$ids
+
 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
@@ -357,16 +382,14 @@ protogen-python:
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
 	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto

 ## GRPC

 backend-assets/grpc:
 	mkdir -p backend-assets/grpc

-backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggllm LIBRARY_PATH=$(shell pwd)/go-ggllm \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/
-
 backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
@@ -376,6 +399,16 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

+# Build the grpc-server binary by delegating to the Makefile in
+# backend/cpp/llama, passing $(CPPLLAMA_VERSION) through as LLAMA_VERSION.
+backend/cpp/llama/grpc-server:
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+
+# Copy the built grpc-server binary into backend-assets/grpc as `llama-cpp`;
+# on metal builds the ggml-metal.metal file is copied alongside it.
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
+	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+# TODO: every binary should have its own folder instead, so that each can have a different metal implementation
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
+endif
+
 backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
@@ -421,10 +454,6 @@ backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/

-backend-assets/grpc/bloomz: backend-assets/grpc bloomz/libbloomz.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/bloomz LIBRARY_PATH=$(shell pwd)/bloomz \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bloomz ./cmd/grpc/bloomz/
-
 backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/
@@ -432,9 +461,12 @@ backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
 backend-assets/grpc/langchain-huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/

-backend-assets/grpc/stablediffusion: backend-assets/grpc go-stable-diffusion/libstablediffusion.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/
+# Build the stablediffusion gRPC backend only when the binary is not already
+# present at backend-assets/grpc/stablediffusion. The static library is built
+# on demand through a recursive make instead of being listed as a prerequisite.
+backend-assets/grpc/stablediffusion: backend-assets/grpc
+	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
+		$(MAKE) go-stable-diffusion/libstablediffusion.a; \
+		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
+		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
+	fi

 backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.
+**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format, pytorch and more. Does not require GPU.

 <p align="center"><b>Follow LocalAI </b></p>

@@ -104,15 +104,30 @@ Note that this started just as a [fun weekend project](https://localai.io/#backs

 Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

-### 💡 Example: Use GPT4ALL-J model
+### 💡 Example: Use Luna-AI Llama model

-See the [documentation](https://localai.io/basics/getting_started/#example-use-gpt4all-j-model-with-docker-compose)
+See the [documentation](https://localai.io/basics/getting_started)

 ### 🔗 Resources

 - [How to build locally](https://localai.io/basics/build/index.html)
 - [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
 - [Projects integrating LocalAI](https://localai.io/integrations/)
+- [How-tos section](https://localai.io/howtos/) (curated by our community)
+  
+## Citation
+
+If you utilize this repository or its data in a downstream project, please consider citing it with:
+
+```
+@misc{localai,
+  author = {Ettore Di Giacinto},
+  title = {LocalAI: The free, Open source OpenAI alternative},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/go-skynet/LocalAI}},
+}
+```

 ## ❤️ Sponsors

@@ -127,6 +142,11 @@ A huge thank you to our generous sponsors who support this project:
 |  [Spectro Cloud](https://www.spectrocloud.com/)  |  
 |  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs!  |

+And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project. 
+
+- [Sponsor list](https://github.com/sponsors/mudler)
+- JDAM00 (donating HW for the CI)
+
 ## 🌟 Star history

 [![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)
--- a/api/api.go
+++ b/api/api.go
@@ -11,6 +11,7 @@ import (
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/internal"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/assets"

 	"github.com/gofiber/fiber/v2"
@@ -120,6 +121,9 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 	// Default middleware config
 	app.Use(recover.New())
+	if options.Metrics != nil {
+		app.Use(metrics.APIMiddleware(options.Metrics))
+	}

 	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
 	auth := func(c *fiber.Ctx) error {
@@ -168,9 +172,14 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		}{Version: internal.PrintableVersion()})
 	})

-	app.Post("/models/apply", auth, localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cl, galleryService.C, options.Galleries))
-	app.Get("/models/available", auth, localai.ListModelFromGalleryEndpoint(options.Galleries, options.Loader.ModelPath))
-	app.Get("/models/jobs/:uuid", auth, localai.GetOpStatusEndpoint(galleryService))
+	modelGalleryService := localai.CreateModelGalleryService(options.Galleries, options.Loader.ModelPath, galleryService)
+	app.Post("/models/apply", auth, modelGalleryService.ApplyModelGalleryEndpoint())
+	app.Get("/models/available", auth, modelGalleryService.ListModelFromGalleryEndpoint())
+	app.Get("/models/galleries", auth, modelGalleryService.ListModelGalleriesEndpoint())
+	app.Post("/models/galleries", auth, modelGalleryService.AddModelGalleryEndpoint())
+	app.Delete("/models/galleries", auth, modelGalleryService.RemoveModelGalleryEndpoint())
+	app.Get("/models/jobs/:uuid", auth, modelGalleryService.GetOpStatusEndpoint())
+	app.Get("/models/jobs", auth, modelGalleryService.GetAllStatusEndpoint())

 	// openAI compatible API endpoint

@@ -224,5 +233,7 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
 	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))

+	app.Get("/metrics", metrics.MetricsHandler())
+
 	return app, nil
 }
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -15,6 +15,7 @@ import (

 	. "github.com/go-skynet/LocalAI/api"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
@@ -162,8 +163,12 @@ var _ = Describe("API test", func() {
 				},
 			}

+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
+					options.WithMetrics(metricsService),
 					options.WithContext(c),
 					options.WithGalleries(galleries),
 					options.WithModelLoader(modelLoader), options.WithBackendAssets(backendAssets), options.WithBackendAssetsOutput(tmpdir))...)
@@ -363,9 +368,10 @@ var _ = Describe("API test", func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
+				modelName := "codellama"
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					URL:       "github:go-skynet/model-gallery/openllama-3b-gguf.yaml",
-					Name:      "openllama_3b_gguf",
+					URL:       "github:go-skynet/model-gallery/codellama-7b-instruct.yaml",
+					Name:      modelName,
 					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
 				})

@@ -378,17 +384,22 @@ var _ = Describe("API test", func() {
 					return response["processed"].(bool)
 				}, "360s", "10s").Should(Equal(true))

-				By("testing completion")
-				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b_gguf", Prompt: "Count up to five: one, two, three, four, "})
+				By("testing chat")
+				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: modelName, Messages: []openai.ChatCompletionMessage{
+					{
+						Role:    "user",
+						Content: "How much is 2+2?",
+					},
+				}})
 				Expect(err).ToNot(HaveOccurred())
 				Expect(len(resp.Choices)).To(Equal(1))
-				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+				Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")))

 				By("testing functions")
 				resp2, err := client.CreateChatCompletion(
 					context.TODO(),
 					openai.ChatCompletionRequest{
-						Model: "openllama_3b_gguf",
+						Model: modelName,
 						Messages: []openai.ChatCompletionMessage{
 							{
 								Role:    "user",
@@ -424,7 +435,7 @@ var _ = Describe("API test", func() {
 				var res map[string]string
 				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
 				Expect(err).ToNot(HaveOccurred())
-				Expect(res["location"]).To(Equal("San Francisco, California"), fmt.Sprint(res))
+				Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
 				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 			})
@@ -473,9 +484,13 @@ var _ = Describe("API test", func() {
 				},
 			}

+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithContext(c),
+					options.WithMetrics(metricsService),
 					options.WithAudioDir(tmpdir),
 					options.WithImageDir(tmpdir),
 					options.WithGalleries(galleries),
@@ -577,12 +592,15 @@ var _ = Describe("API test", func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 			c, cancel = context.WithCancel(context.Background())

-			var err error
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
 					options.WithContext(c),
 					options.WithModelLoader(modelLoader),
+					options.WithMetrics(metricsService),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
 			go app.Listen("127.0.0.1:9090")
@@ -786,10 +804,13 @@ var _ = Describe("API test", func() {
 			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
 			c, cancel = context.WithCancel(context.Background())

-			var err error
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
 			app, err = App(
 				append(commonOpts,
 					options.WithContext(c),
+					options.WithMetrics(metricsService),
 					options.WithModelLoader(modelLoader),
 					options.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
 			)
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -20,6 +20,8 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 			SchedulerType: c.Diffusers.SchedulerType,
 			PipelineType:  c.Diffusers.PipelineType,
 			CFGScale:      c.Diffusers.CFGScale,
+			LoraAdapter:   c.LoraAdapter,
+			LoraBase:      c.LoraBase,
 			IMG2IMG:       c.Diffusers.IMG2IMG,
 			CLIPModel:     c.Diffusers.ClipModel,
 			CLIPSubfolder: c.Diffusers.ClipSubFolder,
--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -6,6 +6,7 @@ import (
 	"regexp"
 	"strings"
 	"sync"
+	"unicode/utf8"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
@@ -97,9 +98,23 @@ func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c

 		if tokenCallback != nil {
 			ss := ""
-			err := inferenceModel.PredictStream(ctx, opts, func(s []byte) {
-				tokenCallback(string(s), tokenUsage)
-				ss += string(s)
+
+			// A multi-byte UTF-8 sequence can arrive split across callbacks, so the
+			// bytes are buffered here and only complete runes are forwarded.
+			var partialRune []byte
+			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
+				partialRune = append(partialRune, chars...)
+
+				for len(partialRune) > 0 {
+					// FullRune is false only for a truncated encoding. Invalid bytes
+					// count as "full" and decode to a width-1 RuneError, and a genuine
+					// U+FFFD decodes normally, so neither can stall the stream.
+					if !utf8.FullRune(partialRune) {
+						// incomplete rune, wait for more bytes
+						break
+					}
+					r, size := utf8.DecodeRune(partialRune)
+
+					tokenCallback(string(r), tokenUsage)
+					ss += string(r)
+
+					partialRune = partialRune[size:]
+				}
 			})
 			return LLMResponse{
 				Response: ss,
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -42,6 +42,9 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		Seed:          int32(c.Seed),
 		NBatch:        int32(b),
 		NoMulMatQ:     c.NoMulMatQ,
+		DraftModel:    c.DraftModel,
+		AudioPath:     c.VallE.AudioPath,
+		Quantization:  c.Quantization,
 		LoraAdapter:   c.LoraAdapter,
 		LoraBase:      c.LoraBase,
 		NGQA:          c.NGQA,
@@ -78,6 +81,7 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
 	return &pb.PredictOptions{
 		Temperature:         float32(c.Temperature),
 		TopP:                float32(c.TopP),
+		NDraft:              c.NDraft,
 		TopK:                int32(c.TopK),
 		Tokens:              int32(c.Maxtokens),
 		Threads:             int32(c.Threads),
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -43,6 +43,13 @@ type Config struct {

 	// GRPC Options
 	GRPC GRPC `yaml:"grpc"`
+
+	// Vall-e-x
+	VallE VallE `yaml:"vall-e"`
+}
+
+// VallE holds the Vall-e-x settings of a model configuration.
+type VallE struct {
+	// AudioPath is read from the `audio_path` YAML key.
+	AudioPath string `yaml:"audio_path"`
 }

 type FeatureFlag map[string]*bool
@@ -94,6 +101,9 @@ type LLMConfig struct {
 	LoraAdapter     string   `yaml:"lora_adapter"`
 	LoraBase        string   `yaml:"lora_base"`
 	NoMulMatQ       bool     `yaml:"no_mulmatq"`
+	DraftModel      string   `yaml:"draft_model"`
+	NDraft          int32    `yaml:"n_draft"`
+	Quantization    string   `yaml:"quantization"`
 }

 type AutoGPTQ struct {
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"slices"
 	"strings"
 	"sync"

@@ -27,6 +28,7 @@ type galleryOp struct {
 }

 type galleryOpStatus struct {
+	FileName           string  `json:"file_name"`
 	Error              error   `json:"error"`
 	Processed          bool    `json:"processed"`
 	Message            string  `json:"message"`
@@ -50,7 +52,6 @@ func NewGalleryService(modelPath string) *galleryApplier {
 	}
 }

-// prepareModel applies a
 func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {

 	config, err := gallery.GetGalleryConfigFromURL(req.URL)
@@ -76,6 +77,13 @@ func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
 	return g.statuses[s]
 }

+// getAllStatus returns a snapshot of every tracked operation status, keyed by
+// operation ID. The map is copied while the lock is held so callers can
+// iterate or encode it after the lock is released without racing against
+// concurrent writes to g.statuses.
+func (g *galleryApplier) getAllStatus() map[string]*galleryOpStatus {
+	g.Lock()
+	defer g.Unlock()
+
+	snapshot := make(map[string]*galleryOpStatus, len(g.statuses))
+	for id, status := range g.statuses {
+		snapshot[id] = status
+	}
+	return snapshot
+}
+
 func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
 	go func() {
 		for {
@@ -94,7 +102,7 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {

 				// displayDownload displays the download progress
 				progressCallback := func(fileName string, current string, total string, percentage float64) {
-					g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
+					g.updateStatus(op.id, &galleryOpStatus{Message: "processing", FileName: fileName, Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
 					utils.DisplayDownloadFunction(fileName, current, total, percentage)
 				}

@@ -176,18 +184,12 @@ func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galler
 	return processRequests(modelPath, s, cm, galleries, requests)
 }

-/// Endpoints
+/// Endpoint Service

-func GetOpStatusEndpoint(g *galleryApplier) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-
-		status := g.getStatus(c.Params("uuid"))
-		if status == nil {
-			return fmt.Errorf("could not find any status for ID")
-		}
-
-		return c.JSON(status)
-	}
+// ModelGalleryService bundles the state shared by the model-gallery HTTP
+// endpoints: the registered galleries, the model path, and the galleryApplier
+// used to queue operations and query their status.
+type ModelGalleryService struct {
+	galleries      []gallery.Gallery
+	modelPath      string
+	galleryApplier *galleryApplier
 }

 type GalleryModel struct {
@@ -195,7 +197,31 @@ type GalleryModel struct {
 	gallery.GalleryModel
 }

-func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan galleryOp, galleries []gallery.Gallery) func(c *fiber.Ctx) error {
+func CreateModelGalleryService(galleries []gallery.Gallery, modelPath string, galleryApplier *galleryApplier) ModelGalleryService {
+	return ModelGalleryService{
+		galleries:      galleries,
+		modelPath:      modelPath,
+		galleryApplier: galleryApplier,
+	}
+}
+
+// GetOpStatusEndpoint returns a handler that looks up the status registered
+// under the `uuid` route parameter and responds with it as JSON, or returns
+// an error when no status exists for that ID.
+func (mgs *ModelGalleryService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		if status := mgs.galleryApplier.getStatus(c.Params("uuid")); status != nil {
+			return c.JSON(status)
+		}
+		return fmt.Errorf("could not find any status for ID")
+	}
+}
+
+// GetAllStatusEndpoint returns a handler that responds with the JSON encoding
+// of every status tracked by the gallery applier.
+func (mgs *ModelGalleryService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		statuses := mgs.galleryApplier.getAllStatus()
+		return c.JSON(statuses)
+	}
+}
+
+func (mgs *ModelGalleryService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(GalleryModel)
 		// Get input data from the request body
@@ -207,11 +233,11 @@ func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan
 		if err != nil {
 			return err
 		}
-		g <- galleryOp{
+		mgs.galleryApplier.C <- galleryOp{
 			req:         input.GalleryModel,
 			id:          uuid.String(),
 			galleryName: input.ID,
-			galleries:   galleries,
+			galleries:   mgs.galleries,
 		}
 		return c.JSON(struct {
 			ID        string `json:"uuid"`
@@ -220,11 +246,11 @@ func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan
 	}
 }

-func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string) func(c *fiber.Ctx) error {
+func (mgs *ModelGalleryService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		log.Debug().Msgf("Listing models from galleries: %+v", galleries)
+		log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)

-		models, err := gallery.AvailableGalleryModels(galleries, basePath)
+		models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
 		if err != nil {
 			return err
 		}
@@ -239,3 +265,56 @@ func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string)
 		return c.Send(dat)
 	}
 }
+
+// ListModelGalleriesEndpoint returns a handler that responds with the
+// JSON-encoded list of registered model galleries. It lists the galleries
+// themselves, not the models they contain.
+func (mgs *ModelGalleryService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
+		payload, marshalErr := json.Marshal(mgs.galleries)
+		if marshalErr == nil {
+			return c.Send(payload)
+		}
+		return marshalErr
+	}
+}
+
+// AddModelGalleryEndpoint returns a handler that parses a gallery from the
+// request body and appends it to the service's gallery list. A gallery whose
+// name is already registered is rejected with an error. The response body is
+// the JSON-encoded gallery list including the newly added entry.
+func (mgs *ModelGalleryService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(gallery.Gallery)
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+		if slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
+			return gallery.Name == input.Name
+		}) {
+			return fmt.Errorf("%s already exists", input.Name)
+		}
+		log.Debug().Msgf("Adding %+v to gallery list", *input)
+		// Marshal the list with the new gallery included so the response reflects
+		// the state after the add. The stored list is replaced only once
+		// marshalling has succeeded, so a failure leaves it unchanged.
+		updated := append(mgs.galleries, *input)
+		dat, err := json.Marshal(updated)
+		if err != nil {
+			return err
+		}
+		mgs.galleries = updated
+		return c.Send(dat)
+	}
+}
+
+// RemoveModelGalleryEndpoint returns a handler that parses a gallery from the
+// request body and removes every registered gallery with the same name. It
+// returns an error when no gallery with that name is registered.
+func (mgs *ModelGalleryService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(gallery.Gallery)
+		// Decode the gallery to remove from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+		sameName := func(candidate gallery.Gallery) bool {
+			return candidate.Name == input.Name
+		}
+		if registered := slices.ContainsFunc(mgs.galleries, sameName); !registered {
+			return fmt.Errorf("%s is not currently registered", input.Name)
+		}
+		mgs.galleries = slices.DeleteFunc(mgs.galleries, sameName)
+		return c.Send(nil)
+	}
+}
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"strings"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -15,15 +16,20 @@ import (
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )

 func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	emptyMessage := ""
+	id := uuid.New().String()
+	created := int(time.Now().Unix())

 	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		initialMessage := schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
 			Object:  "chat.completion.chunk",
@@ -32,6 +38,8 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			resp := schema.OpenAIResponse{
+				ID:      id,
+				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
 				Object:  "chat.completion.chunk",
@@ -261,7 +269,9 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 				}

 				resp := &schema.OpenAIResponse{
-					Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					ID:      id,
+					Created: created,
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 					Choices: []schema.Choice{
 						{
 							FinishReason: "stop",
@@ -355,6 +365,8 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		}

 		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "chat.completion",
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -13,16 +14,22 @@ import (
 	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )

 // https://platform.openai.com/docs/api-reference/completions
 func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	id := uuid.New().String()
+	created := int(time.Now().Unix())
+
 	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
 		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
 			resp := schema.OpenAIResponse{
-				Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
+				ID:      id,
+				Created: created,
+				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 				Choices: []schema.Choice{
 					{
 						Index: 0,
@@ -108,7 +115,9 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 				}

 				resp := &schema.OpenAIResponse{
-					Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					ID:      id,
+					Created: created,
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 					Choices: []schema.Choice{
 						{
 							Index:        0,
@@ -156,6 +165,8 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		}

 		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "text_completion",
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -3,6 +3,7 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -10,6 +11,7 @@ import (
 	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"

 	"github.com/rs/zerolog/log"
 )
@@ -62,7 +64,11 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			result = append(result, r...)
 		}

+		id := uuid.New().String()
+		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "edit",
--- a/api/openai/embeddings.go
+++ b/api/openai/embeddings.go
@@ -3,10 +3,12 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/google/uuid"

 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/gofiber/fiber/v2"
@@ -57,10 +59,14 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
 		}

+		id := uuid.New().String()
+		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
-			Model:  input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Data:   items,
-			Object: "list",
+			ID:      id,
+			Created: created,
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Data:    items,
+			Object:  "list",
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -5,11 +5,14 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
-	"github.com/go-skynet/LocalAI/api/schema"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"
+
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/google/uuid"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -174,8 +177,12 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			}
 		}

+		id := uuid.New().String()
+		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
-			Data: result,
+			ID:      id,
+			Created: created,
+			Data:    result,
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/options/options.go
+++ b/api/options/options.go
@@ -7,6 +7,7 @@ import (

 	"github.com/go-skynet/LocalAI/pkg/gallery"
 	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/metrics"
 	"github.com/rs/zerolog/log"
 )

@@ -24,6 +25,7 @@ type Option struct {
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
 	ApiKeys                             []string
+	Metrics                             *metrics.Metrics

 	Galleries []gallery.Gallery

@@ -99,6 +101,7 @@ func WithStringGalleries(galls string) AppOption {
 	return func(o *Option) {
 		if galls == "" {
 			log.Debug().Msgf("no galleries to load")
+			o.Galleries = []gallery.Gallery{}
 			return
 		}
 		var galleries []gallery.Gallery
@@ -197,3 +200,9 @@ func WithApiKeys(apiKeys []string) AppOption {
 		o.ApiKeys = apiKeys
 	}
 }
+
+// WithMetrics returns an AppOption that stores meter in Option.Metrics.
+func WithMetrics(meter *metrics.Metrics) AppOption {
+	setMetrics := func(opts *Option) {
+		opts.Metrics = meter
+	}
+	return setMetrics
+}
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -0,0 +1,61 @@
+# Build description for the grpc-server target.
+# It generates the C++ protobuf/gRPC sources from backend.proto, wraps them in
+# the hw_grpc_proto library and links them into the grpc-server executable
+# together with the `common` and `llama` targets.
+# NOTE(review): `common`, `llama` and BUILD_INFO are not defined here; they are
+# presumably provided by the enclosing llama.cpp build - confirm.
+set(CMAKE_CXX_STANDARD 17)
+cmake_minimum_required(VERSION 3.15)
+set(TARGET grpc-server)
+set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+set(_REFLECTION grpc++_reflection)
+
+# On macOS also search the Homebrew prefix for libraries and headers
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    link_directories("/opt/homebrew/lib")
+    include_directories("/opt/homebrew/include")
+endif()
+
+# All three packages are looked up in CONFIG mode and are mandatory
+find_package(absl CONFIG REQUIRED)
+find_package(Protobuf CONFIG REQUIRED)
+find_package(gRPC CONFIG REQUIRED)
+
+# Code generator binaries; neither lookup is marked REQUIRED
+find_program(_PROTOBUF_PROTOC protoc)
+set(_GRPC_GRPCPP grpc++)
+find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+
+# The generated headers are written to the binary dir, so it has to be on the include path
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${Protobuf_INCLUDE_DIRS})
+
+message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
+
+# Proto file
+# The relative path climbs six directories before entering pkg/grpc/proto
+get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
+get_filename_component(hw_proto_path "${hw_proto}" PATH)
+
+# Generated sources
+set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
+set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
+set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
+set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
+
+# Run protoc with the gRPC plugin; re-run whenever backend.proto changes
+add_custom_command(
+      OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
+      COMMAND ${_PROTOBUF_PROTOC}
+      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+        -I "${hw_proto_path}"
+        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+        "${hw_proto}"
+      DEPENDS "${hw_proto}")
+
+# hw_grpc_proto
+# Library made of the four generated files
+add_library(hw_grpc_proto
+  ${hw_grpc_srcs}
+  ${hw_grpc_hdrs}
+  ${hw_proto_srcs}
+  ${hw_proto_hdrs})
+
+# The server executable itself
+add_executable(${TARGET} grpc-server.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+  absl::flags_parse
+  gRPC::${_REFLECTION}
+  gRPC::${_GRPC_GRPCPP}
+  protobuf::${_PROTOBUF_LIBPROTOBUF})
+# NOTE(review): this requests cxx_std_11 while CMAKE_CXX_STANDARD above is 17 - confirm which is intended
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+# Only depend on BUILD_INFO when such a target exists
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -0,0 +1,44 @@
+
+# Builds grpc-server, the llama.cpp based gRPC backend.
+#
+# LLAMA_VERSION: llama.cpp revision to check out after cloning
+# CMAKE_ARGS:    extra arguments handed to cmake
+# BUILD_TYPE:    cublas, openblas, clblast or hipblas; any other value adds no flags
+LLAMA_VERSION?=
+
+CMAKE_ARGS?=
+BUILD_TYPE?=
+
+# rebuild and clean are commands, not files: declare them phony so that a file
+# or directory with the same name cannot make them appear up to date.
+.PHONY: rebuild clean
+
+# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
+# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblast)
+	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If it's hipblas the caller also has to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
+# (this Makefile does not set them)
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
+endif
+
+# Clone llama.cpp and create the local branch "build" at LLAMA_VERSION
+llama.cpp:
+	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
+	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
+
+# Copy the server sources into the llama.cpp examples tree and register the
+# new subdirectory with its CMake build
+llama.cpp/examples/grpc-server:
+	mkdir -p llama.cpp/examples/grpc-server
+	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+
+# Refresh the copied sources and build the binary again
+rebuild:
+	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	rm -rf grpc-server
+	$(MAKE) grpc-server
+
+# Remove the llama.cpp checkout and the built binary
+clean:
+	rm -rf llama.cpp
+	rm -rf grpc-server
+
+# Configure and build llama.cpp (including the grpc-server example), then copy
+# the resulting binary next to this Makefile
+grpc-server: llama.cpp llama.cpp/examples/grpc-server
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -0,0 +1,926 @@
+// llama.cpp gRPC C++ backend server
+//
+// Ettore Di Giacinto <mudler@localai.io>
+//
+// This is a gRPC server for llama.cpp compatible with the LocalAI proto
+// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP, 
+// but modified to work with gRPC
+//
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <getopt.h>
+
+#include "common.h"
+#include "llama.h"
+#include "grammar-parser.h"
+#include "backend.pb.h"
+#include "backend.grpc.pb.h"
+
+// include std::regex
+#include <regex>
+#include <grpcpp/ext/proto_server_reflection_plugin.h>
+#include <grpcpp/grpcpp.h>
+#include <grpcpp/health_check_service_interface.h>
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::Status;
+
+
+using backend::HealthMessage;
+
+
+// A completion token together with candidate tokens and their probabilities.
+// NOTE: beam_search_callback builds this type with aggregate initialization
+// ({probs, tok}), so the member order must stay as it is.
+struct completion_token_output
+{
+    // one candidate token id and its probability
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    // candidates recorded for this step (may be empty)
+    std::vector<token_prob> probs;
+    // the token this entry stands for; nextToken uses -1 for "no token"
+    llama_token tok;
+};
+
+// Returns the length of the longest common prefix of a and b.
+static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
+{
+    const size_t limit = a.size() < b.size() ? a.size() : b.size();
+    size_t matched = 0;
+    while (matched < limit && a[matched] == b[matched])
+    {
+        ++matched;
+    }
+    return matched;
+}
+
+// Selects how findStoppingStrings matches a stop word against the text.
+enum stop_type
+{
+    // the whole stop word has to occur in the text
+    STOP_FULL,
+    // a prefix of the stop word at the very end of the text is enough
+    STOP_PARTIAL,
+};
+
+// True when str ends with suffix (an empty suffix always matches).
+static bool ends_with(const std::string &str, const std::string &suffix)
+{
+    if (suffix.size() > str.size())
+    {
+        return false;
+    }
+    const size_t offset = str.size() - suffix.size();
+    return str.compare(offset, std::string::npos, suffix) == 0;
+}
+
+// Looks for the longest prefix of stop that text ends with.
+// Returns the index in text where that prefix starts, or std::string::npos
+// when either string is empty or no prefix of stop terminates text.
+static size_t find_partial_stop_string(const std::string &stop,
+                                       const std::string &text)
+{
+    if (text.empty() || stop.empty())
+    {
+        return std::string::npos;
+    }
+
+    const char tail = text.back();
+    // longest candidate prefix first
+    for (size_t prefix_len = stop.size(); prefix_len > 0; --prefix_len)
+    {
+        if (stop[prefix_len - 1] != tail)
+        {
+            // the prefix cannot end the text if its last char differs
+            continue;
+        }
+        if (ends_with(text, stop.substr(0, prefix_len)))
+        {
+            return text.size() - prefix_len;
+        }
+    }
+    return std::string::npos;
+}
+
+
+// Concatenates the text pieces of all tokens in [begin, end).
+template <class Iter>
+static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
+{
+    std::string joined;
+    while (begin != end)
+    {
+        joined += llama_token_to_piece(ctx, *begin);
+        ++begin;
+    }
+    return joined;
+}
+
+
+// Returns the text of token for output. A token of -1 yields an empty string.
+// A piece that is a single byte with the high bit set is treated as part of an
+// incomplete UTF-8 character and rendered as "byte: \x<hex>" instead.
+static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
+{
+    std::string out;
+    if (token != -1)
+    {
+        out = llama_token_to_piece(ctx, token);
+    }
+
+    // size > 1 means it is already a known token; only lone high-bit bytes are rewritten
+    const bool is_partial_char = out.size() == 1 && (out[0] & 0x80) != 0;
+    if (!is_partial_char)
+    {
+        return out;
+    }
+
+    std::stringstream hex;
+    hex << std::hex << (out[0] & 0xff);
+    return "byte: \\x" + hex.str();
+}
+
+struct llama_server_context
+{
+    bool stream = false;
+    bool has_next_token = false;
+    std::string generated_text;
+    std::vector<completion_token_output> generated_token_probs;
+
+    size_t num_prompt_tokens = 0;
+    size_t num_tokens_predicted = 0;
+    size_t n_past = 0;
+    size_t n_remain = 0;
+
+    std::vector<llama_token> embd;
+
+    gpt_params params;
+
+    llama_model *model = nullptr;
+    llama_context *ctx = nullptr;
+    llama_sampling_context *ctx_sampling = nullptr;
+
+    int n_ctx;
+
+    bool truncated = false;
+    bool stopped_eos = false;
+    bool stopped_word = false;
+    bool stopped_limit = false;
+    std::string stopping_word;
+    int32_t multibyte_pending = 0;
+
+    std::mutex mutex;
+
+    // Locks the context mutex; the lock is released when the returned object is destroyed.
+    std::unique_lock<std::mutex> lock()
+    {
+        std::unique_lock<std::mutex> guard(mutex);
+        return guard;
+    }
+
+    // Releases everything the context owns: the sampling context created by
+    // initSampling, the llama context and the model.
+    ~llama_server_context()
+    {
+        // previously never freed here, so the last sampling context leaked
+        if (ctx_sampling)
+        {
+            llama_sampling_free(ctx_sampling);
+            ctx_sampling = nullptr;
+        }
+        if (ctx)
+        {
+            llama_free(ctx);
+            ctx = nullptr;
+        }
+        if (model)
+        {
+            llama_free_model(model);
+            model = nullptr;
+        }
+    }
+
+    // Resets the per-request state before a new prediction.
+    // The model, the llama context and the evaluated tokens in embd are not touched.
+    void rewind()
+    {
+        // request options that are re-parsed for every call
+        params.antiprompt.clear();
+        params.sparams.grammar.clear();
+        params.sparams.n_prev = n_ctx;
+
+        // generated output
+        generated_text.clear();
+        generated_text.reserve(n_ctx);
+        generated_token_probs.clear();
+
+        // counters
+        num_prompt_tokens = 0;
+        num_tokens_predicted = 0;
+        n_remain = 0;
+        n_past = 0;
+        multibyte_pending = 0;
+
+        // stop reasons
+        truncated = stopped_eos = stopped_word = stopped_limit = false;
+        stopping_word.clear();
+    }
+
+    // Creates a sampling context from the current sampling parameters,
+    // freeing the one from an earlier call first.
+    void initSampling() {
+        llama_sampling_context *previous = ctx_sampling;
+        if (previous) {
+            llama_sampling_free(previous);
+        }
+        ctx_sampling = llama_sampling_init(params.sparams);
+    }
+
+    // Stores params_ and initializes model and ctx from them.
+    // Returns false when no model was produced; on success n_ctx is set to the
+    // value reported by llama_n_ctx.
+    bool loadModel(const gpt_params &params_)
+    {
+        params = params_;
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+
+        const bool loaded = model != nullptr;
+        if (loaded)
+        {
+            n_ctx = llama_n_ctx(ctx);
+        }
+        return loaded;
+    }
+    // Tokenizes one NUL-terminated prompt; add_bos is forwarded to llama_tokenize.
+    std::vector<llama_token> tokenize_string(const char *prompt, bool add_bos) const {
+        const std::string text(prompt);
+        return ::llama_tokenize(ctx, text, add_bos);
+    }
+     // Tokenizes a nullptr-terminated array of C strings into one token sequence.
+     // add_bos is forwarded only for the first string; all following strings are
+     // tokenized with add_bos = false.
+     std::vector<llama_token> tokenize_array(const char **prompts, bool add_bos) const {
+         std::vector<llama_token> prompt_tokens;
+
+         bool first = true;
+         for (const char **prompt = prompts; *prompt != nullptr; ++prompt)
+         {
+             // distinct name: the old code declared a vector `p` that shadowed the loop pointer `p`
+             const std::vector<llama_token> piece_tokens =
+                 ::llama_tokenize(ctx, std::string(*prompt), first && add_bos);
+             first = false;
+             prompt_tokens.insert(prompt_tokens.end(), piece_tokens.begin(), piece_tokens.end());
+         }
+         return prompt_tokens;
+     }
+
+    // Shrinks prompt_tokens in place by removing whole blocks that follow the
+    // first params.n_keep tokens, keeping those n_keep tokens and the tail of
+    // the prompt. Sets `truncated`.
+    // NOTE(review): n_block_size becomes 0 when n_ctx - n_keep < 2, which would
+    // divide by zero; the callers clamp n_keep to n_ctx - 4 first - confirm n_ctx > 4.
+    void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
+        // room left in the context after the kept tokens
+        const int n_left = n_ctx - params.n_keep;
+        // tokens are erased in units of half that room
+        const int n_block_size = n_left / 2;
+        const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;
+
+        // Keep n_keep tokens at start of prompt (at most n_ctx - 4)
+        std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
+
+        // then append everything after the erased blocks
+        new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
+
+        truncated = true;
+        prompt_tokens = new_tokens;
+    }
+
+    // Builds the infill prompt from params.input_prefix and params.input_suffix
+    // as  BOS, <prefix token>, prefix..., <suffix token>, suffix..., <middle token>
+    // and then prepares generation the same way loadPrompt does: clamp n_keep,
+    // truncate if needed, feed the sampler, reuse the already evaluated prefix
+    // of embd and trim the KV cache.
+    void loadInfill()
+    {
+        // a suffix with an explicit leading space loses that space here and keeps
+        // whatever the tokenizer produces; otherwise a leading space token is dropped below
+        bool suff_rm_leading_spc = true;
+        if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
+            params.input_suffix.erase(0, 1);
+            suff_rm_leading_spc = false;
+        }
+
+        auto prefix_tokens = tokenize_string(params.input_prefix.c_str(), false);
+        auto suffix_tokens = tokenize_string(params.input_suffix.c_str(), false);
+        // presumably the id of the space token in the LLaMA vocabulary - TODO confirm
+        const int space_token = 29871;
+        // suffix_tokens can be empty (e.g. empty suffix): check before reading element 0
+        if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
+            suffix_tokens.erase(suffix_tokens.begin());
+        }
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
+        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
+        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
+        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+        prefix_tokens.push_back(llama_token_middle(model));
+
+        auto prompt_tokens = prefix_tokens;
+
+        num_prompt_tokens = prompt_tokens.size();
+
+        if (params.n_keep < 0)
+        {
+            params.n_keep = (int)num_prompt_tokens;
+        }
+        // clamp against the real context size (n_ctx), as loadPrompt and the
+        // truncation check below do; params.n_ctx is only the requested value
+        params.n_keep = std::min(n_ctx - 4, params.n_keep);
+
+        // if input prompt is too big, truncate like normal
+        if (num_prompt_tokens >= (size_t) n_ctx)
+        {
+            truncatePrompt(prompt_tokens);
+            num_prompt_tokens = prompt_tokens.size();
+
+            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
+        }
+
+        // push the prompt into the sampling context (do not apply grammar)
+        for (auto & token : prompt_tokens)
+        {
+            llama_sampling_accept(ctx_sampling, ctx, token, false);
+        }
+
+        // compare the evaluated prompt with the new prompt
+        n_past = common_part(embd, prompt_tokens);
+        embd = prompt_tokens;
+
+        if (n_past == num_prompt_tokens)
+        {
+            // we have to evaluate at least 1 token to generate logits.
+            printf("we have to evaluate at least 1 token to generate logits\n");
+            n_past--;
+        }
+
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
+        has_next_token = true;
+    }
+    // Tokenizes prompt (with add_bos = true), truncates it when it does not fit
+    // the context, feeds it to the sampling context and works out how much of
+    // the previously evaluated sequence in embd can be reused.
+    void loadPrompt(std::string prompt)
+    {
+        std::vector<llama_token> tokens = tokenize_string(prompt.c_str(), true);  // always add BOS
+        num_prompt_tokens = tokens.size();
+
+        // a negative n_keep stands for "the whole prompt"; clamp either way
+        const int requested_keep = params.n_keep < 0 ? (int)num_prompt_tokens : params.n_keep;
+        params.n_keep = std::min(n_ctx - 4, requested_keep);
+
+        // if input prompt is too big, truncate like normal
+        const bool too_long = num_prompt_tokens >= (size_t) n_ctx;
+        if (too_long)
+        {
+            truncatePrompt(tokens);
+            num_prompt_tokens = tokens.size();
+
+            GGML_ASSERT(num_prompt_tokens < (size_t)n_ctx);
+        }
+
+        // the sampler sees the prompt too, without grammar checks
+        for (const llama_token id : tokens)
+        {
+            llama_sampling_accept(ctx_sampling, ctx, id, false);
+        }
+
+        // the part shared with the already evaluated tokens does not need re-evaluation
+        const size_t reusable = common_part(embd, tokens);
+        embd = tokens;
+
+        // at least one token has to be evaluated to generate logits
+        n_past = reusable == num_prompt_tokens ? reusable - 1 : reusable;
+
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
+        has_next_token = true;
+    }
+
+    // Seeds the RNG and arms the sampling budget for a new completion.
+    void beginCompletion()
+    {
+        llama_set_rng_seed(ctx, params.seed);
+        // number of tokens that may still be sampled (decremented by nextToken)
+        n_remain = params.n_predict;
+    }
+
+    // Decodes every token of embd that has not been evaluated yet, samples one
+    // new token and appends it to embd.
+    // Returns the sampled token with up to n_probs candidate probabilities;
+    // result.tok stays -1 when llama_decode reports an error.
+    // Updates n_past, n_remain, truncated, stopped_eos and has_next_token.
+    completion_token_output nextToken()
+    {
+        completion_token_output result;
+        result.tok = -1;
+
+        if (embd.size() >= (size_t)n_ctx)
+        {
+            // Shift context
+            // Half of the tokens after the first n_keep + 1 are discarded and
+            // the remaining ones are moved down, in the KV cache and in embd.
+
+            const int n_left    = n_past - params.n_keep - 1;
+            const int n_discard = n_left/2;
+
+            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+
+            for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
+            {
+                embd[i - n_discard] = embd[i];
+            }
+            embd.resize(embd.size() - n_discard);
+
+            n_past -= n_discard;
+
+            truncated = true;
+        }
+
+        // tg ends up true when the last decode call had exactly one pending
+        // token (or nothing had to be decoded); only then is the sample
+        // counted in num_tokens_predicted below
+        bool tg = true;
+        while (n_past < embd.size())
+        {
+            int n_eval = (int)embd.size() - n_past;
+            tg = n_eval == 1;
+            // evaluate at most n_batch tokens per call
+            if (n_eval > params.n_batch)
+            {
+                n_eval = params.n_batch;
+            }
+
+            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
+            {
+                // decode failed: stop generating and hand back tok == -1
+                has_next_token = false;
+                return result;
+            }
+            n_past += n_eval;
+        }
+
+        // nothing was requested: report EOS without sampling
+        if (params.n_predict == 0)
+        {
+            has_next_token = false;
+            result.tok = llama_token_eos(model);
+            return result;
+        }
+
+        {
+            // out of user input, sample next token
+            result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
+
+            // view over the candidates the sampling context holds
+            llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
+
+            const int32_t n_probs = params.sparams.n_probs;
+            if (params.sparams.temp <= 0 && n_probs > 0)
+            {
+                // For llama_sample_token_greedy we need to sort candidates
+                llama_sample_softmax(ctx, &cur_p);
+            }
+
+            // record the first n_probs candidates (fewer if there are not that many)
+            for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
+            {
+                result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+            }
+
+            // let the sampler see the chosen token, this time applying the grammar
+            llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
+
+            if (tg) {
+                num_tokens_predicted++;
+            }
+        }
+
+        // add it to the context
+        embd.push_back(result.tok);
+        // decrement remaining sampling budget
+        --n_remain;
+
+        if (!embd.empty() && embd.back() == llama_token_eos(model))
+        {
+            // stopping_word = llama_token_to_piece(ctx, embd.back());
+            has_next_token = false;
+            stopped_eos = true;
+            return result;
+        }
+
+        // n_predict == -1 means no limit; otherwise go on until the budget is used up
+        has_next_token = params.n_predict == -1 || n_remain != 0;
+        return result;
+    }
+
+    // Searches text for the stop words in params.antiprompt and returns the
+    // smallest match position, or std::string::npos when nothing matches.
+    // With STOP_FULL the whole word has to occur and a hit also records
+    // stopping_word, sets stopped_word and clears has_next_token.
+    // With STOP_PARTIAL a prefix of the word at the end of text counts and no
+    // state is changed.
+    size_t findStoppingStrings(const std::string &text, const size_t last_token_size,
+                               const stop_type type)
+    {
+        const bool full_match = type == STOP_FULL;
+        size_t earliest = std::string::npos;
+
+        for (const std::string &word : params.antiprompt)
+        {
+            size_t found;
+            if (full_match)
+            {
+                // search only the last word.size() + last_token_size characters
+                const size_t window = word.size() + last_token_size;
+                const size_t start = text.size() > window ? text.size() - window : 0;
+                found = text.find(word, start);
+            }
+            else
+            {
+                found = find_partial_stop_string(word, text);
+            }
+
+            if (found == std::string::npos)
+            {
+                continue;
+            }
+            if (earliest != std::string::npos && found >= earliest)
+            {
+                // an earlier match is already known
+                continue;
+            }
+
+            if (full_match)
+            {
+                stopping_word = word;
+                stopped_word = true;
+                has_next_token = false;
+            }
+            earliest = found;
+        }
+        return earliest;
+    }
+
+    // Produces one token via nextToken, appends its text to generated_text and
+    // tracks whether the output currently ends in the middle of a multi-byte
+    // UTF-8 character. Generation is extended by one token when it would
+    // otherwise stop inside such a character. Sets stopped_limit when the
+    // budget is exhausted. Returns the token from nextToken unchanged.
+    completion_token_output doCompletion()
+    {
+        auto token_with_probs = nextToken();
+
+        // tok == -1 means nextToken produced no token: contributes no text
+        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
+        generated_text += token_text;
+
+        if (params.sparams.n_probs > 0)
+        {
+            generated_token_probs.push_back(token_with_probs);
+        }
+
+        if (multibyte_pending > 0)
+        {
+            // continuation bytes of a character started earlier
+            multibyte_pending -= token_text.size();
+        }
+        else if (token_text.size() == 1)
+        {
+            // a single byte may be the lead byte of a multi-byte character;
+            // remember how many continuation bytes are still expected
+            const char c = token_text[0];
+            // 2-byte characters: 110xxxxx 10xxxxxx
+            if ((c & 0xE0) == 0xC0)
+            {
+                multibyte_pending = 1;
+            }
+            // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+            else if ((c & 0xF0) == 0xE0)
+            {
+                multibyte_pending = 2;
+            }
+            // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            else if ((c & 0xF8) == 0xF0)
+            {
+                multibyte_pending = 3;
+            }
+            else
+            {
+                multibyte_pending = 0;
+            }
+        }
+
+        // do not stop in the middle of a character: allow one more token
+        if (multibyte_pending > 0 && !has_next_token)
+        {
+            has_next_token = true;
+            n_remain++;
+        }
+
+        // stopped with the budget used up
+        if (!has_next_token && n_remain == 0)
+        {
+            stopped_limit = true;
+        }
+
+        return token_with_probs;
+    }
+
+    // Returns a copy of the embedding vector held by the llama context, or
+    // n_embd zeros when embeddings are not enabled in params.
+    std::vector<float> getEmbedding()
+    {
+        // queried on every call: a function-local static would keep the size of
+        // the first model even after a different model has been loaded
+        const int n_embd = llama_n_embd(model);
+        if (!params.embedding)
+        {
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        const float *data = llama_get_embeddings(ctx);
+        return std::vector<float>(data, data + n_embd);
+    }
+};
+
+
+// Copies the per-request options of a PredictOptions message into llama.params.
+//   streaming - stored in llama.stream
+//   predict   - request to read from
+//   llama     - server context whose params are overwritten
+// A tokens() value of 0 is translated to n_predict = -1 (no limit).
+// logit_bias and antiprompt are rebuilt from scratch on every call.
+static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
+{
+    llama.stream = streaming;
+    llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
+    llama.params.sparams.top_k = predict->topk();
+    llama.params.sparams.top_p = predict->topp();
+    llama.params.sparams.tfs_z = predict->tailfreesamplingz();
+    llama.params.sparams.typical_p = predict->typicalp();
+    llama.params.sparams.penalty_last_n = predict->repeat();
+    llama.params.sparams.temp = predict->temperature();
+    llama.params.sparams.penalty_repeat = predict->penalty();
+    llama.params.sparams.penalty_present = predict->presencepenalty();
+    llama.params.sparams.penalty_freq = predict->frequencypenalty();
+    llama.params.sparams.mirostat = predict->mirostat();
+    llama.params.sparams.mirostat_tau = predict->mirostattau();
+    llama.params.sparams.mirostat_eta = predict->mirostateta();
+    llama.params.sparams.penalize_nl = predict->penalizenl();
+    llama.params.n_keep = predict->nkeep();
+    llama.params.seed = predict->seed();
+    llama.params.sparams.grammar = predict->grammar();
+    // TODO: n_probs is not taken from the request yet
+    llama.params.prompt = predict->prompt();
+
+    llama.params.sparams.logit_bias.clear();
+
+    if (predict->ignoreeos())
+    {
+        // an EOS bias of -INFINITY keeps EOS from being sampled
+        llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
+    }
+
+    // TODO: per-token logit_bias entries from the request are not wired up yet
+
+    // empty stop prompts are skipped
+    llama.params.antiprompt.clear();
+    for (const std::string& stopPrompt : predict->stopprompts()) {
+        if (!stopPrompt.empty())
+        {
+            llama.params.antiprompt.push_back(stopPrompt);
+        }
+    }
+}
+
+
+
+// Fills params from the ModelOptions of a LoadModel request.
+// tensorsplit is a list of numbers separated by ',' or '/'; devices without an
+// entry get 0. maingpu is parsed as an integer when it is not empty.
+static void params_parse(const backend::ModelOptions* request,
+                                gpt_params & params) {
+    // the model file doubles as the alias
+    const std::string model_file = request->modelfile();
+    params.model = model_file;
+    params.model_alias = model_file;
+
+    params.n_ctx = request->contextsize();
+    params.memory_f16 = request->f16memory();
+    params.n_threads = request->threads();
+    params.n_gpu_layers = request->ngpulayers();
+    params.n_batch = request->nbatch();
+
+    const std::string tensor_split = request->tensorsplit();
+    if (!tensor_split.empty()) {
+        // split string by , and /
+        const std::regex separators{ R"([,/]+)" };
+        std::sregex_token_iterator first_part{ tensor_split.begin(), tensor_split.end(), separators, -1 };
+        std::vector<std::string> split_arg{ first_part, {} };
+
+        GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+        for (size_t device = 0; device < LLAMA_MAX_DEVICES; ++device) {
+            const bool given = device < split_arg.size();
+            params.tensor_split[device] = given ? std::stof(split_arg[device]) : 0.0f;
+        }
+    }
+
+    const std::string main_gpu = request->maingpu();
+    if (!main_gpu.empty()) {
+        params.main_gpu = std::stoi(main_gpu);
+    }
+
+    // TODO: lora needs also a scale factor
+    //params.lora_adapter = request->loraadapter();
+    //params.lora_base = request->lorabase();
+    params.use_mlock = request->mlock();
+    params.use_mmap = request->mmap();
+    params.embedding = request->embeddings();
+}
+
+// True when the token sequence is not empty and its last token is the model's EOS token.
+static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
+    if (n_tokens == 0) {
+        return false;
+    }
+    const llama_token last = tokens[n_tokens - 1];
+    return last == llama_token_eos(server_context.model);
+}
+
+// Function matching type llama_beam_search_callback_fn_t.
+// Custom callback example that is called each time the beam lengths increase:
+//  * Show progress by printing ',' followed by the number of convergent beam tokens, if any.
+//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
+//    This is also called when the stop condition is met.
+//    The common-prefix tokens are appended to generated_token_probs of the
+//    llama_server_context that callback_data points to.
+static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
+    // callback_data is expected to be the llama_server_context
+    auto & llama = *static_cast<llama_server_context*>(callback_data);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(",");  // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        // make room for n new entries, then fill them from the first beam
+        llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        // each entry gets the token and an empty probability list
+        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
+        std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
+        printf("%zu", n);
+    }
+    fflush(stdout);
+#if 0 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams:\n";
+    for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
+// Function object that turns a token, or the token of a
+// completion_token_output, into its text piece using ctx.
+struct token_translator {
+    llama_context * ctx;
+
+    std::string operator()(llama_token tok) const {
+        return llama_token_to_piece(ctx, tok);
+    }
+
+    std::string operator()(const completion_token_output & cto) const {
+        return llama_token_to_piece(ctx, cto.tok);
+    }
+};
+
+
+// Appends the text of every entry in llama.generated_token_probs to
+// llama.generated_text, growing the string once up front.
+static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
+{
+    const token_translator to_text{llama.ctx};
+    const auto & outputs = llama.generated_token_probs;
+
+    // first pass: number of bytes that are going to be appended
+    size_t extra = 0;
+    for (const completion_token_output & out : outputs) {
+        extra += to_text(out).size();
+    }
+
+    const size_t needed = llama.generated_text.size() + extra;
+    if (llama.generated_text.capacity() < needed) {
+        llama.generated_text.reserve(needed);
+    }
+
+    // second pass: append the pieces
+    for (const completion_token_output & out : outputs) {
+        llama.generated_text += to_text(out);
+    }
+}
+
+// GRPC Server start
+// gRPC service implementation that serves the backend::Backend RPCs
+// (Health, LoadModel, PredictStream, Predict) on top of one
+// llama_server_context instance.
+class BackendServiceImpl final : public backend::Backend::Service {
+  // The class has a llama instance that is shared across all RPCs
+  llama_server_context llama;
+public:
+  // Liveness probe: always answers with the message "OK".
+  grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) override {
+    reply->set_message("OK");
+    return Status::OK;
+  }
+
+  // Converts the request into gpt_params, initialises the llama backend and
+  // loads the model. On failure `result` carries success=false and the RPC
+  // returns Status::CANCELLED; on success it carries success=true.
+  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) override {
+    gpt_params params;
+    params_parse(request, params);
+
+    llama_backend_init(params.numa);
+
+    // load the model
+    if (!llama.loadModel(params))
+    {
+        result->set_message("Failed loading model");
+        result->set_success(false);
+        return Status::CANCELLED;
+    }
+    result->set_message("Loading succeeded");
+    result->set_success(true);
+    return Status::OK;
+  }
+
+  // Streaming completion: generates tokens for request->prompt() and writes
+  // each newly available piece of text to `writer` as a backend::Reply.
+  // Text that matches (or may still grow into) a stop string is held back.
+  // Returns Status::OK when generation finishes, or Status::CANCELLED when a
+  // write fails because the stream was closed by the peer.
+  grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
+        auto lock = llama.lock();
+
+        llama.rewind();
+
+        llama_reset_timings(llama.ctx);
+
+        parse_options_completion(false, request, llama);
+
+        llama.initSampling();
+        llama.loadPrompt(request->prompt());
+        llama.beginCompletion();
+        size_t sent_count = 0;              // bytes of generated_text already sent
+        size_t sent_token_probs_index = 0;  // entries of generated_token_probs already consumed
+        bool stream_closed = false;         // set when a Write() to the client fails
+
+        while (llama.has_next_token) {
+            const completion_token_output token_with_probs = llama.doCompletion();
+            // Nothing to emit yet: no token produced, or a multibyte
+            // character is still incomplete.
+            if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
+                continue;
+            }
+            const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
+
+            size_t pos = std::min(sent_count, llama.generated_text.size());
+
+            // Only the not-yet-sent tail is searched for stop strings.
+            const std::string str_test = llama.generated_text.substr(pos);
+            bool is_stop_full = false;
+            size_t stop_pos =
+                llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
+            if (stop_pos != std::string::npos) {
+                // A complete stop string was found: cut it (and anything
+                // after it) out of the generated text.
+                is_stop_full = true;
+                llama.generated_text.erase(
+                    llama.generated_text.begin() + pos + stop_pos,
+                    llama.generated_text.end());
+                pos = std::min(sent_count, llama.generated_text.size());
+            } else {
+                is_stop_full = false;
+                stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
+                    STOP_PARTIAL);
+            }
+
+            if (
+                stop_pos == std::string::npos ||
+                // Send rest of the text if we are at the end of the generation
+                (!llama.has_next_token && !is_stop_full && stop_pos > 0)
+            ) {
+                const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
+
+                sent_count += to_send.size();
+
+                // NOTE: probs_output is collected here but is not attached to
+                // the reply below.
+                std::vector<completion_token_output> probs_output = {};
+
+                if (llama.params.sparams.n_probs > 0) {
+                    const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
+                    size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
+                    size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
+                    if (probs_pos < probs_stop_pos) {
+                        probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
+                    }
+                    sent_token_probs_index = probs_stop_pos;
+                }
+                backend::Reply reply;
+                reply.set_message(to_send);
+
+                // Send the reply. Write() returns false once the stream has
+                // been closed (e.g. the client went away); stop generating in
+                // that case instead of holding the shared lock for output
+                // nobody will read.
+                if (!writer->Write(reply)) {
+                    stream_closed = true;
+                    break;
+                }
+            }
+        }
+
+        llama_print_timings(llama.ctx);
+
+        // Unlock the mutex by hand and detach it from `lock` so it is not
+        // unlocked a second time when `lock` goes out of scope.
+        // NOTE(review): presumably llama.lock() returns a std::unique_lock
+        // over llama.mutex - confirm.
+        llama.mutex.unlock();
+        lock.release();
+        return stream_closed ? grpc::Status::CANCELLED : grpc::Status::OK;
+    }
+
+
+    // Blocking completion: generates the full answer for request->prompt()
+    // (via beam search when params.n_beams is non-zero, otherwise token by
+    // token), strips a trailing full or partial stop string, and returns the
+    // text in reply->message.
+    grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) override {
+        auto lock = llama.lock();
+        llama.rewind();
+        llama_reset_timings(llama.ctx);
+        parse_options_completion(false, request, llama);
+
+        llama.initSampling();
+        llama.loadPrompt(request->prompt());
+        llama.beginCompletion();
+
+        if (llama.params.n_beams) {
+            // Fill llama.generated_token_probs vector with final beam.
+            llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
+                                llama.n_past, llama.n_remain);
+            // Translate llama.generated_token_probs to llama.generated_text.
+            append_to_generated_text_from_generated_token_probs(llama);
+        } else {
+            size_t stop_pos = std::string::npos;
+
+            while (llama.has_next_token) {
+                const completion_token_output token_with_probs = llama.doCompletion();
+                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
+
+                stop_pos = llama.findStoppingStrings(llama.generated_text,
+                    token_text.size(), STOP_FULL);
+            }
+
+            // No full stop string at the end: look for a partial one instead.
+            if (stop_pos == std::string::npos) {
+                stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
+            }
+            if (stop_pos != std::string::npos) {
+                llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
+                    llama.generated_text.end());
+            }
+        }
+
+        // NOTE: probs is computed here but is not attached to the reply below.
+        auto probs = llama.generated_token_probs;
+        if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
+            const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
+            // The re-tokenized stop word can have more tokens than were
+            // generated; clamp so the end iterator never moves before begin().
+            const size_t n_drop = std::min(stop_word_toks.size(), llama.generated_token_probs.size());
+            probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - n_drop);
+        }
+        reply->set_message(llama.generated_text);
+        return grpc::Status::OK;
+    }
+};
+
+// Builds a gRPC server that hosts BackendServiceImpl on `server_address`
+// (insecure credentials) and blocks until the server shuts down.
+// If the server cannot be started, an error is written to stderr and the
+// function returns without waiting.
+void RunServer(const std::string& server_address) {
+  BackendServiceImpl service;
+
+  ServerBuilder builder;
+  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+  builder.RegisterService(&service);
+
+  std::unique_ptr<Server> server(builder.BuildAndStart());
+  // BuildAndStart() yields a null pointer on failure (for example when the
+  // address cannot be bound); calling Wait() on it would crash.
+  if (!server) {
+    std::cerr << "Failed to start server on " << server_address << std::endl;
+    return;
+  }
+  std::cout << "Server listening on " << server_address << std::endl;
+  server->Wait();
+}
+
+// Entry point: reads an optional listen address from the command line
+// (--addr=<address> or -a <address>, default "localhost:50051") and runs the
+// gRPC server on it. Any other option prints the usage text and exits with 1.
+int main(int argc, char** argv) {
+  // Long-option table for getopt_long; terminated by an all-zero entry.
+  struct option cli_options[] = {
+      {"addr", required_argument, nullptr, 'a'},
+      {nullptr, 0, nullptr, 0}
+  };
+
+  std::string listen_address = "localhost:50051";
+  int long_index = 0;
+
+  for (;;) {
+    const int opt = getopt_long(argc, argv, "a:", cli_options, &long_index);
+    if (opt == -1) {
+      break;  // all options consumed
+    }
+    if (opt != 'a') {
+      // Anything other than the address option is a usage error.
+      std::cerr << "Usage: " << argv[0] << " [--addr=<address>] or [-a <address>]" << std::endl;
+      return 1;
+    }
+    listen_address = optarg;
+  }
+
+  RunServer(listen_address);
+  return 0;
+}
--- a/cmd/grpc/bloomz/main.go
+++ b/cmd/grpc/bloomz/main.go
@@ -1,23 +0,0 @@
-package main
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &bloomz.LLM{}); err != nil {
-		panic(err)
-	}
-}
--- a/cmd/grpc/falcon/main.go
+++ b/cmd/grpc/falcon/main.go
@@ -1,25 +0,0 @@
-package main
-
-// GRPC Falcon server
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	falcon "github.com/go-skynet/LocalAI/pkg/backend/llm/falcon"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &falcon.LLM{}); err != nil {
-		panic(err)
-	}
-}
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -12,4 +12,5 @@ services:
      - .env
    volumes:
      - ./models:/models:cached
+      - ./images/:/tmp/generated/images/
    command: ["/usr/bin/local-ai" ]
--- a/examples/README.md
+++ b/examples/README.md
@@ -159,14 +159,24 @@ Allows to run any LocalAI-compatible model as a backend on the servers of https:

 ### Continue

-<img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">
-
 _by [@gruberdev](https://github.com/gruberdev)_

+<img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">
+
 Demonstrates how to integrate an open-source copilot alternative that enhances code analysis, completion, and improvements. This approach seamlessly integrates with any LocalAI model, offering a more user-friendly experience.

 [Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/continue/)

+### Streamlit bot
+
+_by [@majoshi1](https://github.com/majoshi1)_
+
+![Screenshot](streamlit-bot/streamlit-bot.png)
+
+A chat bot made using `Streamlit` & LocalAI.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/streamlit-bot/)
+
 ## Want to contribute?

 Create an issue, and put `Example: <description>` in the title! We will post your examples here.
--- a/examples/autoGPT/.env.example
+++ b/examples/autoGPT/.env.example
@@ -1,5 +1,9 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=sk---anystringhere
 OPENAI_API_BASE=http://api:8080/v1
 # Models to preload at start
-# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
-PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
+# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings,
+# see other options in the model gallery at https://github.com/go-skynet/model-gallery
+PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
--- a/examples/autoGPT/README.md
+++ b/examples/autoGPT/README.md
@@ -10,12 +10,16 @@ git clone https://github.com/go-skynet/LocalAI

 cd LocalAI/examples/autoGPT

+cp -rfv .env.example .env
+
+# Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
+vim .env
+
 docker-compose run --rm auto-gpt
 ```

 Note: The example automatically downloads the `gpt4all` model as it is under a permissive license. The GPT4All model does not seem to be enough to run AutoGPT. WizardLM-7b-uncensored seems to perform better (with `f16: true`).

-See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.

 ## Without docker

--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: backend monitor
+  type: http
+  seq: 4
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
+  body: none
+  auth: none
+}
--- a/monitor/backend-shutdown.bru
+++ b/monitor/backend-shutdown.bru
@@ -0,0 +1,21 @@
+meta {
+  name: backend-shutdown
+  type: http
+  seq: 3
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,5 @@
+{
+  "version": "1",
+  "name": "LocalAI Test Requests",
+  "type": "collection"
+}
--- a/Requests/environments/localhost.bru
+++ b/Requests/environments/localhost.bru
@@ -0,0 +1,6 @@
+vars {
+  HOST: localhost
+  PORT: 8080
+  DEFAULT_MODEL: gpt-3.5-turbo
+  PROTOCOL: http://
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: get models list
+  type: http
+  seq: 2
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,25 @@
+meta {
+  name: Generate image
+  type: http
+  seq: 1
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+    "prompt": "<positive prompt>|<negative prompt>",
+    "model": "model-name",
+    "step": 51,
+    "size": "1024x1024",
+    "image": ""
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,24 @@
+meta {
+  name: -completions
+  type: http
+  seq: 4
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "prompt": "function downloadFile(string url, string outputPath) {",
+      "max_tokens": 256,
+      "temperature": 0.5
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,23 @@
+meta {
+  name: -edits
+  type: http
+  seq: 5
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "What day of the wek is it?",
+      "instruction": "Fix the spelling mistakes"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: -embeddings
+  type: http
+  seq: 6
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,30 @@
+meta {
+  name: chat completion -simple- 1 message-
+  type: http
+  seq: 4
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+    "model": "{{DEFAULT_MODEL}}",
+    "messages": [
+      {
+        "role": "user",
+        "content": "How could one use friction to cook an egg?"
+      }
+    ],
+    "max_tokens": 256,
+    "temperature": 0.2,
+    "grammar": ""
+  }
+}
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,29 @@
+meta {
+  name: chat-completions -long-
+  type: http
+  seq: 5
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+       "model": "{{DEFAULT_MODEL}}",
+       "messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
+           {"role": "user", "content": "How could one use electricity to cook an egg?"},
+           {"role": "assistant",
+                  "content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
+              },
+              {"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
+       "max_tokens": 1024,
+       "temperature": 0.5
+  }
+}
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,25 @@
+meta {
+  name: chat-completions -stream-
+  type: http
+  seq: 6
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+       "model": "{{DEFAULT_MODEL}}",
+       "messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
+       "max_tokens": 256,
+       "temperature": 0.9,
+       "stream": true
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: add model gallery
+  type: http
+  seq: 10
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
+      "name": "test"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,21 @@
+meta {
+  name: delete model gallery
+  type: http
+  seq: 11
+}
+
+delete {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "name": "test"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: list MODELS in galleries
+  type: http
+  seq: 7
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: list model GALLERIES
+  type: http
+  seq: 8
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,21 @@
+meta {
+  name: model gallery apply -gist-
+  type: http
+  seq: 12
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: model gallery apply
+  type: http
+  seq: 9
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
+      "name": "codellama7b"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: -tts
+  type: http
+  seq: 2
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
+  }
+}
--- a/examples/chatbot-ui-manual/README.md
+++ b/examples/chatbot-ui-manual/README.md
@@ -24,10 +24,13 @@ docker-compose up -d --pull always
 # docker-compose up -d --build
 ```

+Then browse to `http://localhost:3000` to view the Web UI.
+
 ## Pointing chatbot-ui to a separately managed LocalAI service

-If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
-```
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose.yaml` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+
+```yaml
 version: '3.6'

 services:
@@ -40,9 +43,8 @@ services:
      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
 ```

-Once you've edited the Dockerfile, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+Once you've edited the `docker-compose.yaml`, you can start it with `docker compose up`, then browse to `http://localhost:3000` to view the Web UI.

 ## Accessing chatbot-ui

 Open http://localhost:3000 for the Web UI.
-
--- a/examples/chatbot-ui-manual/models
+++ b/examples/chatbot-ui-manual/models
@@ -0,0 +1 @@
+../models
--- a/examples/chatbot-ui/README.md
+++ b/examples/chatbot-ui/README.md
@@ -20,10 +20,13 @@ docker-compose up --pull always
 # docker-compose up -d --build
 ```

+Then browse to `http://localhost:3000` to view the Web UI.
+
 ## Pointing chatbot-ui to a separately managed LocalAI service

-If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
-```
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose.yaml` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+
+```yaml
 version: '3.6'

 services:
@@ -36,9 +39,8 @@ services:
      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
 ```

-Once you've edited the Dockerfile, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+Once you've edited the `docker-compose.yaml`, you can start it with `docker compose up`, then browse to `http://localhost:3000` to view the Web UI.

 ## Accessing chatbot-ui

 Open http://localhost:3000 for the Web UI.
-
--- a/examples/configurations/README.md
+++ b/examples/configurations/README.md
@@ -0,0 +1,42 @@
+## Advanced configuration
+
+This section contains examples on how to install models manually with config files.
+
+### Prerequisites
+
+First clone LocalAI:
+
+```bash
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI
+```
+
+Set up the model you prefer from the examples below and then start LocalAI:
+
+```bash
+docker compose up -d --pull always
+```
+
+If LocalAI is already running, you can restart it with:
+
+```bash
+docker compose restart
+```
+
+See also the getting started guide: https://localai.io/basics/getting_started/
+
+### Mistral
+
+To set up Mistral, copy the files inside the `mistral` directory into the `models` folder:
+
+```bash
+cp -r examples/configurations/mistral/* models/
+```
+
+Now download the model:
+
+```bash
+wget https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q6_K.gguf -O models/mistral-7b-openorca.Q6_K.gguf
+```
+
--- a/examples/configurations/mistral/chatml-block.tmpl
+++ b/examples/configurations/mistral/chatml-block.tmpl
@@ -0,0 +1,3 @@
+{{.Input}}
+<|im_start|>assistant
+
--- a/examples/configurations/mistral/chatml.tmpl
+++ b/examples/configurations/mistral/chatml.tmpl
@@ -0,0 +1,3 @@
+<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
+{{if .Content}}{{.Content}}{{end}}
+<|im_end|>
--- a/examples/chatbot-ui-manual/models/completion.tmpl
+++ b/examples/chatbot-ui-manual/models/completion.tmpl
--- a/examples/configurations/mistral/mistral.yaml
+++ b/examples/configurations/mistral/mistral.yaml
@@ -0,0 +1,16 @@
+name: mistral
+mmap: true
+parameters:
+  model: mistral-7b-openorca.Q6_K.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+template:
+  chat_message: chatml
+  chat: chatml-block
+  completion: completion
+context_size: 4096
+f16: true
+stopwords:
+- <|im_end|>
+threads: 4
--- a/examples/continue/README.md
+++ b/examples/continue/README.md
@@ -28,7 +28,7 @@ For a live demonstration, please click on the link below:
 3. Type `/config` within Continue's VSCode extension, or edit the file located at `~/.continue/config.py` on your system with the following configuration:

    ```py
-    from continuedev.src.continuedev.libs.llm.openai import OpenAI, OpenAIServerInfo
+    from continuedev.src.continuedev.libs.llm.openai import OpenAI

    config = ContinueConfig(
       ...
@@ -36,10 +36,7 @@ For a live demonstration, please click on the link below:
            default=OpenAI(
               api_key="my-api-key",
               model="gpt-3.5-turbo",
-               openai_server_info=OpenAIServerInfo(
-                  api_base="http://localhost:8080",
-                  model="gpt-3.5-turbo"
-               )
+               api_base="http://localhost:8080",
            )
       ),
    )
--- a/examples/discord-bot/.env.example
+++ b/examples/discord-bot/.env.example
@@ -1,3 +1,6 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=x
 DISCORD_BOT_TOKEN=x
 DISCORD_CLIENT_ID=x
--- a/examples/discord-bot/models
+++ b/examples/discord-bot/models
@@ -1 +1 @@
-../chatbot-ui/models/
+../models
--- a/examples/functions/.env.example
+++ b/examples/functions/.env.example
@@ -1,7 +1,11 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=sk---anystringhere
 OPENAI_API_BASE=http://api:8080/v1
 # Models to preload at start
-# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
+# Here we configure openllama-7b-open-instruct as gpt-3.5-turbo,
+# see other options in the model gallery at https://github.com/go-skynet/model-gallery
 PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/openllama-7b-open-instruct.yaml", "name": "gpt-3.5-turbo"}]

 ## Change the default number of threads
--- a/examples/functions/README.md
+++ b/examples/functions/README.md
@@ -10,9 +10,12 @@ git clone https://github.com/go-skynet/LocalAI

 cd LocalAI/examples/functions

+cp -rfv .env.example .env
+
+# Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
+vim .env
+
 docker-compose run --rm functions
 ```

 Note: The example automatically downloads the `openllama` model as it is under a permissive license.
-
-See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.
--- a/examples/insomnia/Insomnia_LocalAI.json
+++ b/examples/insomnia/Insomnia_LocalAI.json
--- a/examples/langchain-chroma/.env.example
+++ b/examples/langchain-chroma/.env.example
@@ -1,3 +1,6 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 THREADS=4
 CONTEXT_SIZE=512
 MODELS_PATH=/models
--- a/examples/langchain-chroma/models
+++ b/examples/langchain-chroma/models
@@ -0,0 +1 @@
+../models
--- a/examples/langchain-chroma/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain-chroma/models/gpt-3.5-turbo.yaml
@@ -1,16 +0,0 @@
-name: gpt-3.5-turbo
-parameters:
-  model: ggml-gpt4all-j
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/langchain-chroma/models/gpt4all.tmpl
+++ b/examples/langchain-chroma/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
-The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
-### Prompt:
-{{.Input}}
-### Response:
--- a/examples/langchain-huggingface/models
+++ b/examples/langchain-huggingface/models
@@ -0,0 +1 @@
+../models
--- a/examples/langchain-huggingface/models/completion.tmpl
+++ b/examples/langchain-huggingface/models/completion.tmpl
@@ -1 +0,0 @@
-{{.Input}}
--- a/examples/langchain-huggingface/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain-huggingface/models/gpt-3.5-turbo.yaml
@@ -1,17 +0,0 @@
-name: gpt-3.5-turbo
-parameters:
-  model: gpt2
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
-backend: "langchain-huggingface"
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/langchain-huggingface/models/gpt4all.tmpl
+++ b/examples/langchain-huggingface/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
-The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
-### Prompt:
-{{.Input}}
-### Response:
--- a/examples/langchain/models
+++ b/examples/langchain/models
@@ -0,0 +1 @@
+../models
--- a/examples/langchain/models/completion.tmpl
+++ b/examples/langchain/models/completion.tmpl
@@ -1 +0,0 @@
-{{.Input}}
--- a/examples/langchain/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain/models/gpt-3.5-turbo.yaml
@@ -1,17 +0,0 @@
-name: gpt-3.5-turbo
-parameters:
-  model: ggml-gpt4all-j # ggml-koala-13B-4bit-128g
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
-backend: "gptj"
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/langchain/models/gpt4all.tmpl
+++ b/examples/langchain/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
-The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
-### Prompt:
-{{.Input}}
-### Response:
--- a/examples/localai-webui/docker-compose.yml
+++ b/examples/localai-webui/docker-compose.yml
@@ -8,8 +8,6 @@ services:
      dockerfile: Dockerfile
    ports:
      - 8080:8080
-    env_file:
-      - .env
    volumes:
      - ./models:/models:cached
    command: ["/usr/bin/local-ai"]
--- a/examples/models/.gitignore
+++ b/examples/models/.gitignore
@@ -0,0 +1,7 @@
+# Ignore everything but predefined models
+*
+!.gitignore
+!completion.tmpl
+!embeddings.yaml
+!gpt4all.tmpl
+!gpt-3.5-turbo.yaml
--- a/examples/langchain-chroma/models/completion.tmpl
+++ b/examples/langchain-chroma/models/completion.tmpl
--- a/examples/langchain-chroma/models/embeddings.yaml
+++ b/examples/langchain-chroma/models/embeddings.yaml
--- a/examples/chatbot-ui-manual/models/gpt-3.5-turbo.yaml
+++ b/examples/chatbot-ui-manual/models/gpt-3.5-turbo.yaml
--- a/examples/chatbot-ui-manual/models/gpt4all.tmpl
+++ b/examples/chatbot-ui-manual/models/gpt4all.tmpl
--- a/examples/query_data/models
+++ b/examples/query_data/models
@@ -0,0 +1 @@
+../models
--- a/examples/query_data/models/completion.tmpl
+++ b/examples/query_data/models/completion.tmpl
@@ -1 +0,0 @@
-{{.Input}}
--- a/examples/query_data/models/embeddings.yaml
+++ b/examples/query_data/models/embeddings.yaml
@@ -1,6 +0,0 @@
-name: text-embedding-ada-002
-parameters:
-  model: bert
-threads: 14
-backend: bert-embeddings
-embeddings: true
--- a/examples/query_data/models/gpt-3.5-turbo.yaml
+++ b/examples/query_data/models/gpt-3.5-turbo.yaml
@@ -1,16 +0,0 @@
-name: gpt-3.5-turbo
-parameters:
-  model: ggml-gpt4all-j
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/query_data/query.py
+++ b/examples/query_data/query.py
@@ -15,7 +15,7 @@ llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo
 # Configure prompt parameters and initialise helper
 max_input_size = 500
 num_output = 256
-max_chunk_overlap = 20
+max_chunk_overlap = 0.2

 prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

--- a/examples/query_data/store.py
+++ b/examples/query_data/store.py
@@ -15,7 +15,7 @@ llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="gpt-3.5-turbo
 # Configure prompt parameters and initialise helper
 max_input_size = 400
 num_output = 400
-max_chunk_overlap = 30
+max_chunk_overlap = 0.3

 prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

--- a/examples/slack-bot/.env.example
+++ b/examples/slack-bot/.env.example
@@ -1,3 +1,6 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 SLACK_APP_TOKEN=xapp-1-...
 SLACK_BOT_TOKEN=xoxb-...
 OPENAI_API_KEY=sk-...
--- a/examples/slack-bot/README.md
+++ b/examples/slack-bot/README.md
@@ -18,7 +18,7 @@ git clone https://github.com/seratch/ChatGPT-in-Slack
 # Download gpt4all-j to models/
 wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j

-# Set the discord bot options (see: https://github.com/seratch/ChatGPT-in-Slack)
+# Set the Slack bot options (see: https://github.com/seratch/ChatGPT-in-Slack)
 cp -rfv .env.example .env
 vim .env

--- a/examples/slack-bot/models
+++ b/examples/slack-bot/models
@@ -1 +1 @@
-../chatbot-ui/models
+../models
--- a/examples/slack-qa-bot/.env.example
+++ b/examples/slack-qa-bot/.env.example
@@ -1,3 +1,6 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 # Create an app-level token with connections:write scope
 SLACK_APP_TOKEN=xapp-1-...
 # Install the app into your workspace to grab this token
--- a/examples/streamlit-bot/.gitignore
+++ b/examples/streamlit-bot/.gitignore
@@ -0,0 +1 @@
+installer_files
--- a/examples/streamlit-bot/LICENSE
+++ b/examples/streamlit-bot/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Manohar Joshi
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/examples/streamlit-bot/Main.py
+++ b/examples/streamlit-bot/Main.py
@@ -0,0 +1,70 @@
+import streamlit as st
+import time
+import requests
+import json
+
+def ask(prompt):
+    url = 'http://localhost:8080/v1/chat/completions'
+    myobj = {
+        "model": "ggml-gpt4all-j.bin",
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.9
+    }
+    myheaders = { "Content-Type" : "application/json" }  
+
+    x = requests.post(url, json = myobj, headers=myheaders)
+    
+    print(x.text)
+    
+    json_data = json.loads(x.text)
+
+    return json_data["choices"][0]["message"]["content"]
+
+
+def main():
+    """Render the chat page, replay the stored history, and answer a new prompt via ask()."""
+    st.set_page_config(page_title="Ask your LLM")
+    st.header("Ask your Question 💬")
+
+    # Create the chat history in the session state on first use
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+
+    # Replay every stored message inside a chat message container for its role
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+    # Scroll to bottom. NOTE(review): nothing in this function creates an "end-of-chat" element; confirm this snippet has any effect
+    st.markdown(
+        """
+        <script>
+        var element = document.getElementById("end-of-chat");
+        element.scrollIntoView({behavior: "smooth"});
+        </script>
+        """,
+        unsafe_allow_html=True,
+    )   
+
+    # React to user input; the body only runs when chat_input returns a non-empty value
+    if prompt := st.chat_input("What is up?"):
+        # Display user message in chat message container
+        st.chat_message("user").markdown(prompt)
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        print(f"User has asked the following question: {prompt}")
+        
+        # Query the model through ask() while a spinner is shown
+        response = ""
+        with st.spinner('Processing...'):
+            response = ask(prompt)
+            
+        # Debug alternative that echoes the prompt back: response = f"Echo: {prompt}"
+        # Display assistant response in chat message container
+        with st.chat_message("assistant"):
+            st.markdown(response)
+        # Add assistant response to chat history
+        st.session_state.messages.append({"role": "assistant", "content": response})     
+
+if __name__ == "__main__":
+    main()        
--- a/examples/streamlit-bot/README.md
+++ b/examples/streamlit-bot/README.md
@@ -0,0 +1,54 @@
+## Streamlit bot
+
+![Screenshot](streamlit-bot.png)
+
+This is an example to deploy a Streamlit bot with LocalAI instead of OpenAI. Instructions are for Windows.
+
+```bash
+# Install & run Git Bash
+
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI.git
+cd LocalAI
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Use a template from the examples
+cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
+
+# (optional) Edit the .env file to set things like context size and threads
+# vim .env
+# Download model
+curl --progress-bar -C - -o models/ggml-gpt4all-j.bin https://gpt4all.io/models/ggml-gpt4all-j.bin
+
+# Install & Run Docker Desktop for Windows:
+# https://www.docker.com/products/docker-desktop/
+
+# start with docker-compose
+docker-compose up -d --pull always
+# or you can build the images with:
+# docker-compose up -d --build
+# Now API is accessible at localhost:8080
+curl http://localhost:8080/v1/models
+# {"object":"list","data":[{"id":"ggml-gpt4all-j","object":"model"}]}
+
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-gpt4all-j",
+     "messages": [{"role": "user", "content": "How are you?"}],
+     "temperature": 0.9
+   }'
+
+# {"model":"ggml-gpt4all-j","choices":[{"message":{"role":"assistant","content":"I'm doing well, thanks. How about you?"}}]}
+
+cd examples/streamlit-bot
+
+install_requirements.bat
+
+# run the bot
+start_windows.bat
+
+# UI will be launched automatically (http://localhost:8501/) in browser.
+
+```
+
--- a/examples/streamlit-bot/cmd_windows.bat
+++ b/examples/streamlit-bot/cmd_windows.bat
@@ -0,0 +1,31 @@
+@echo off
+@rem Activates the conda environment stored under installer_files and runs the given arguments in it.
+cd /D "%~dp0"
+
+set PATH=%PATH%;%SystemRoot%\system32
+@rem refuse to run when the script directory contains a space
+echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end
+
+@rem point TMP and TEMP at installer_files (fix for failed installs on a separate drive)
+set TMP=%cd%\installer_files
+set TEMP=%cd%\installer_files
+
+@rem config: locations of the conda installation and of the environment
+set CONDA_ROOT_PREFIX=%cd%\installer_files\conda
+set INSTALL_ENV_DIR=%cd%\installer_files\env
+
+@rem environment isolation: clear Python path variables and point the CUDA variables at the environment
+set PYTHONNOUSERSITE=1
+set PYTHONPATH=
+set PYTHONHOME=
+set "CUDA_PATH=%INSTALL_ENV_DIR%"
+set "CUDA_HOME=%CUDA_PATH%"
+
+@rem activate installer env; jump to the end when the conda hook is missing
+call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end )
+
+@rem run the arguments passed to this script and keep the prompt open afterwards (cmd /k)
+cmd /k "%*"
+
+:end
+pause
--- a/examples/streamlit-bot/install_requirements.bat
+++ b/examples/streamlit-bot/install_requirements.bat
@@ -0,0 +1,81 @@
+@echo off
+@rem Installs Miniconda into installer_files, creates a Python 3.10 environment there and pip-installs requirements.txt.
+cd /D "%~dp0"
+
+set PATH=%PATH%;%SystemRoot%\system32
+@rem refuse to run when the script directory contains a space
+echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end
+
+@rem Warn (without aborting) when the installation path contains special characters
+set "SPCHARMESSAGE="WARNING: Special characters were detected in the installation path!" "         This can cause the installation to fail!""
+echo "%CD%"| findstr /R /C:"[!#\$%&()\*+,;<=>?@\[\]\^`{|}~]" >nul && (
+	call :PrintBigMessage %SPCHARMESSAGE%
+)
+set SPCHARMESSAGE=
+
+@rem point TMP and TEMP at installer_files (fix for failed installs on a separate drive)
+set TMP=%cd%\installer_files
+set TEMP=%cd%\installer_files
+
+@rem config: install locations, Miniconda installer URL, and the conda-detected flag (F until proven otherwise)
+set INSTALL_DIR=%cd%\installer_files
+set CONDA_ROOT_PREFIX=%cd%\installer_files\conda
+set INSTALL_ENV_DIR=%cd%\installer_files\env
+set MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Windows-x86_64.exe
+set conda_exists=F
+
+@rem figure out whether conda needs to be installed
+call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1
+if "%ERRORLEVEL%" EQU "0" set conda_exists=T
+
+@rem when conda is missing, install it into a contained directory
+@rem download conda
+if "%conda_exists%" == "F" (
+	echo Downloading Miniconda from %MINICONDA_DOWNLOAD_URL% to %INSTALL_DIR%\miniconda_installer.exe
+	@rem curl -f fails on HTTP errors instead of saving an error page as the installer and TLS verification stays enabled
+	if not exist "%INSTALL_DIR%" mkdir "%INSTALL_DIR%"
+	call curl -fL "%MINICONDA_DOWNLOAD_URL%" -o "%INSTALL_DIR%\miniconda_installer.exe" || ( echo. && echo Miniconda failed to download. && goto end )
+
+	echo Installing Miniconda to %CONDA_ROOT_PREFIX%
+	start /wait "" "%INSTALL_DIR%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%CONDA_ROOT_PREFIX%
+
+	@rem test the conda binary
+	echo Miniconda version:
+	call "%CONDA_ROOT_PREFIX%\_conda.exe" --version || ( echo. && echo Miniconda not found. && goto end )
+)
+
+@rem create the installer env on first run (TLS verification stays enabled: no -k)
+if not exist "%INSTALL_ENV_DIR%" (
+  echo Packages to install: python=3.10
+  call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y --prefix "%INSTALL_ENV_DIR%" python=3.10 || ( echo. && echo Conda environment creation failed. && goto end )
+)
+
+@rem check if conda environment was actually created
+if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
+
+@rem environment isolation: clear Python path variables and point the CUDA variables at the environment
+set PYTHONNOUSERSITE=1
+set PYTHONPATH=
+set PYTHONHOME=
+set "CUDA_PATH=%INSTALL_ENV_DIR%"
+set "CUDA_HOME=%CUDA_PATH%"
+
+@rem activate installer env
+call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || ( echo. && echo Miniconda hook not found. && goto end )
+
+@rem install the Python requirements; report a failure the same way as the other steps
+call pip install -r requirements.txt || ( echo. && echo Failed to install requirements. && goto end )
+
+@rem below are functions for the script   next line skips these during normal execution
+goto end
+@rem PrintBigMessage: prints every argument on its own line, surrounding quotes removed, inside a starred banner
+:PrintBigMessage
+echo. && echo.
+echo *******************************************************************
+for %%M in (%*) do echo * %%~M
+echo *******************************************************************
+echo. && echo.
+exit /b
+@rem common exit point: pause so the window stays open until a key is pressed
+:end
+pause
--- a/Show More
+++ b/Show More