fix(deps): update github.com/ggerganov/whisper.cpp/bindings/go digest to 54c978c

Adding transcript subcommand (#1171 )
Adding the transcript subcommand to the localai binary This PR is related to #816
2026-06-02 05:06:34 -04:00 · 2023-10-15 12:54:26 +00:00 · 2023-10-15 09:17:41 +02:00 · 2023-10-14 12:29:22 +02:00 · 2023-10-14 12:28:58 +02:00 · 2023-10-14 12:27:35 +02:00
203 changed files with 8229 additions and 1314 deletions
--- a/.env
+++ b/.env
@@ -23,7 +23,16 @@ MODELS_PATH=/models
 ## Enable debug mode
 # DEBUG=true

+## Disables COMPEL (Diffusers)
+# COMPEL=0
+
+## Enable/Disable single backend (useful if only one GPU is available)
+# SINGLE_ACTIVE_BACKEND=true
+
 ## Specify a build type. Available: cublas, openblas, clblas.
+## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
+## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
+## clBLAS:   This is an open-source implementation of the BLAS library that uses OpenCL, a framework for writing programs that execute across heterogeneous platforms consisting of CPUs, GPUs, and other processors. clBLAS is designed to take advantage of the parallel computing power of GPUs but can also run on any hardware that supports OpenCL. This includes hardware from different vendors like Nvidia, AMD, and Intel.
 # BUILD_TYPE=openblas

 ## Uncomment and set to true to enable rebuilding from source
@@ -41,3 +50,20 @@ MODELS_PATH=/models

 ## Specify a default upload limit in MB (whisper)
 # UPLOAD_LIMIT
+
+## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
+# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+
+### Advanced settings ###
+### These are not used by LocalAI itself, but by other components in the stack ###
+##
+### Preload libraries
+# LD_PRELOAD=
+
+### Huggingface cache for models
+# HUGGINGFACE_HUB_CACHE=/usr/local/huggingface
+
+### Python backends GRPC max workers
+### Default number of workers for GRPC Python backends.
+### This actually controls whether a backend can process multiple requests or not.
+# PYTHON_GRPC_MAX_WORKERS=1
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.sh text eol=lf
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -8,16 +8,24 @@ This PR fixes #
 **[Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)**
 - [ ] Yes, I signed my commits.
 
-
 <!--
 Thank you for contributing to LocalAI! 

-Contributing Conventions:
+Contributing Conventions
+-------------------------

-1. Include descriptive PR titles with [<component-name>] prepended.
-2. Build and test your changes before submitting a PR. 
+The draft above helps to give a quick overview of your PR.
+
+Remember to remove this comment and to at least:
+
+1. Include descriptive PR titles with [<component-name>] prepended. We use [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/).
+2. Build and test your changes before submitting a PR (`make build`). 
 3. Sign your commits
+4. **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below).
+5. **X/Twitter handle:** we announce bigger features on X/Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out!

 By following the community's contribution conventions upfront, the review process will 
 be accelerated and your PR merged more quickly.
+
+If no one reviews your PR within a few days, please @-mention @mudler.
 -->
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,9 +12,6 @@ jobs:
          - repository: "go-skynet/go-llama.cpp"
            variable: "GOLLAMA_VERSION"
            branch: "master"
-          - repository: "go-skynet/go-llama.cpp"
-            variable: "GOLLAMA_GRAMMAR_VERSION"
-            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -96,7 +96,7 @@ jobs:

      - name: Docker meta
        id: meta
-        uses: docker/metadata-action@v4
+        uses: docker/metadata-action@v5
        with:
          images: quay.io/go-skynet/local-ai
          tags: |
@@ -118,14 +118,14 @@ jobs:

      - name: Login to DockerHub
        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}

      - name: Build and push
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -22,6 +22,9 @@ jobs:
        uses: actions/checkout@v3
        with:
          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
      - name: Dependencies
        run: |
          sudo apt-get update
@@ -60,6 +63,9 @@ jobs:
        uses: actions/checkout@v3
        with:
          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
      - name: Build
        id: build
        env:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,12 +16,53 @@ concurrency:
 jobs:
  ubuntu-latest:
    runs-on: ubuntu-latest
-
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
      - name: Clone
        uses: actions/checkout@v3
        with: 
          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
      - name: Dependencies
        run: |
          sudo apt-get update
@@ -43,8 +84,8 @@ jobs:
          mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
          curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v1.0.0/libpiper_phonemize-amd64.tar.gz" | \
          tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /lib64/ && \
          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
+          sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
      - name: Test
        run: |
@@ -52,13 +93,21 @@ jobs:

  macOS-latest:
    runs-on: macOS-latest
-
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
    steps:
      - name: Clone
        uses: actions/checkout@v3
        with: 
          submodules: true
-
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
      - name: Test
        run: |
          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # go-llama build artifacts
 go-llama
+go-llama-stable
 /gpt4all
 go-stable-diffusion
 go-piper
@@ -22,6 +23,8 @@ LocalAI
 local-ai
 # prevent above rules from omitting the helm chart
 !charts/*
+# prevent above rules from omitting the api/localai folder
+!api/localai

 # Ignore models
 models/*
@@ -36,5 +39,5 @@ release/

 # Generated during build
 backend-assets/
-
+prepare
 /ggml-metal.metal
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,72 @@
+# Contributing to localAI
+
+Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.
+
+## Table of Contents
+
+- [Getting Started](#getting-started)
+  - [Prerequisites](#prerequisites)
+  - [Setting up the Development Environment](#setting-up-the-development-environment-and-running-localai-in-the-local-environment)
+- [Contributing](#contributing)
+  - [Submitting an Issue](#submitting-an-issue)
+  - [Creating a Pull Request (PR)](#creating-a-pull-request-pr)
+- [Coding Guidelines](#coding-guidelines)
+- [Testing](#testing)
+- [Documentation](#documentation)
+- [Community and Communication](#community-and-communication)
+
+
+
+## Getting Started
+
+### Prerequisites
+
+- Golang [1.21]
+- Git
+- macOS/Linux
+
+### Setting up the Development Environment and running localAI in the local environment
+
+1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
+2. Navigate to the project directory: `cd LocalAI`
+3. Install the required dependencies: `make prepare`
+4. Run LocalAI: `make run`
+
+## Contributing
+
+We welcome contributions from everyone! To get started, follow these steps:
+
+### Submitting an Issue
+
+If you find a bug, have a feature request, or encounter any issues, please check the [issue tracker](https://github.com/go-skynet/LocalAI/issues) to see if a similar issue has already been reported. If not, feel free to [create a new issue](https://github.com/go-skynet/LocalAI/issues/new) and provide as much detail as possible.
+
+### Creating a Pull Request (PR)
+
+1. Fork the repository.
+2. Create a new branch with a descriptive name: `git checkout -b [branch name]`
+3. Make your changes and commit them.
+4. Push the changes to your fork: `git push origin [branch name]`
+5. Create a new pull request from your branch to the main project's `main` or `master` branch.
+6. Provide a clear description of your changes in the pull request.
+7. Make any requested changes during the review process.
+8. Once your PR is approved, it will be merged into the main project.
+
+## Coding Guidelines
+
+- No specific coding guidelines at the moment. Please make sure the code can be tested. Popular lint tools such as [`golangci-lint`](https://golangci-lint.run) can help you here.
+
+## Testing
+
+`make test` cannot cover all the models yet. Please be sure to add a test case for any new feature or any part that was changed.
+
+## Documentation
+
+- Contributions to the documentation are welcome; please open a new PR in the official documentation repo [localai-website](https://github.com/go-skynet/localai-website)
+
+## Community and Communication
+
+- You can reach out via the Github issue tracker.
+- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
+- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
+
+---
--- a/44
+++ b/44
@@ -1,4 +1,4 @@
-ARG GO_VERSION=1.20-bullseye
+ARG GO_VERSION=1.21-bullseye

 FROM golang:$GO_VERSION as requirements

@@ -11,15 +11,16 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py,vall-e-x:/build/extra/grpc/vall-e-x/ttsvalle.py,vllm:/build/extra/grpc/vllm/backend_vllm.py"
+ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tts"

 RUN apt-get update && \
    apt-get install -y ca-certificates cmake curl patch pip

-# Extras requirements
-COPY extra/requirements.txt /build/extra/requirements.txt
-RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+# Use the variables in subsequent instructions
+RUN echo "Target Architecture: $TARGETARCH"
+RUN echo "Target Variant: $TARGETVARIANT"

 # CuBLAS requirements
 RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
@@ -29,10 +30,26 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm -f cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
    ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}

+# Extras requirements
+COPY extra/requirements.txt /build/extra/requirements.txt
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN pip install --upgrade pip
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+    fi
+RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+    fi
+RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+
+# Vall-e-X
+RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
+
 WORKDIR /build

 # OpenBLAS requirements
@@ -42,9 +59,6 @@ RUN apt-get install -y libopenblas-dev
 RUN apt-get install -y libopencv-dev && \
    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

-# Use the variables in subsequent instructions
-RUN echo "Target Architecture: $TARGETARCH"
-RUN echo "Target Variant: $TARGETVARIANT"

 # piper requirements
 # Use pre-compiled Piper phonemization library (includes onnxruntime)
@@ -63,8 +77,8 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO
    mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
    curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH:-$(go env GOARCH)}${TARGETVARIANT}.tar.gz" | \
    tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /lib64/ && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
+    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
 # \
 #    ; fi
@@ -98,7 +112,10 @@ RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data
 FROM requirements

 ARG FFMPEG
+ARG BUILD_TYPE
+ARG TARGETARCH

+ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

@@ -117,6 +134,13 @@ COPY . .
 RUN make prepare-sources
 COPY --from=builder /build/local-ai ./

+# Copy VALLE-X as it's not a real "lib"
+RUN cp -rfv /usr/lib/vall-e-x/* ./
+
+# To resolve exllama import error
+RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
+        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
+    fi
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
--- a/87
+++ b/87
@@ -4,21 +4,13 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-# Temporarly pinned to https://github.com/go-skynet/go-llama.cpp/pull/124
-GOLLAMA_VERSION?=f3a6ee0ef53d667f110d28fcf9b808bdca741c07
+GOLLAMA_VERSION?=1676dcd7a139b6cdfbaea5fd67f46dc25d9d8bcf

-GOLLAMA_GRAMMAR_VERSION?=cb8d7cd4cb95725a04504a9e3a26dd72a12b69ac
-# Temporary set a specific version of llama.cpp
-# containing: https://github.com/ggerganov/llama.cpp/pull/1773 and
-# rebased on top of master.
-# This pin can be dropped when the PR above is merged, and go-llama has merged changes as well
-# Set empty to use the version pinned by go-llama
-LLAMA_CPP_GRAMMAR_REPO?=https://github.com/mudler/llama.cpp
-LLAMA_CPP_GRAMMAR_VERSION?=48ce8722a05a018681634af801fd0fd45b3a87cc
+GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
-GPT4ALL_VERSION?=8aba2c9009fb6bc723f623c614e265b41722e4e3
+GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8

 # go-ggml-transformers version
 GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a
@@ -46,6 +38,8 @@ STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
 GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b

 export BUILD_TYPE?=
+export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
+export CMAKE_ARGS?=
 CGO_LDFLAGS?=
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
@@ -72,9 +66,12 @@ ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif

-# workaround for rwkv.cpp
 ifeq ($(UNAME_S),Darwin)
-        CGO_LDFLAGS += -lcblas -framework Accelerate 
+	CGO_LDFLAGS += -lcblas -framework Accelerate
+ifneq ($(BUILD_TYPE),metal)
+    # explicit disable metal if on Darwin and metal is disabled
+	CMAKE_ARGS+=-DLLAMA_METAL=OFF
+endif
 endif

 ifeq ($(BUILD_TYPE),openblas)
@@ -86,6 +83,18 @@ ifeq ($(BUILD_TYPE),cublas)
 	export LLAMA_CUBLAS=1
 endif

+ifeq ($(BUILD_TYPE),hipblas)
+	ROCM_HOME ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	# Llama-stable has no hipblas support, so override it here.
+	export STABLE_BUILD_TYPE=
+	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
+	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link
+endif
+
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	export LLAMA_METAL=1
@@ -111,6 +120,8 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif

+GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+
 .PHONY: all test build vendor

 all: help
@@ -202,28 +213,23 @@ go-llama:
 	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
 	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

-go-llama-grammar:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-grammar
-	cd go-llama-grammar && git checkout -b build $(GOLLAMA_GRAMMAR_VERSION) && git submodule update --init --recursive --depth 1
-ifneq ($(LLAMA_CPP_GRAMMAR_REPO),)
-	cd go-llama-grammar && rm -rf llama.cpp && git clone $(LLAMA_CPP_GRAMMAR_REPO) llama.cpp && cd llama.cpp && git checkout -b build $(LLAMA_CPP_GRAMMAR_VERSION) && git submodule update --init --recursive --depth 1
-endif
+go-llama-stable:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
+	cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

 go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

-go-llama-grammar/libbinding.a: go-llama-grammar
-	$(MAKE) -C go-llama-grammar BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+go-llama-stable/libbinding.a: go-llama-stable
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

 go-piper/libpiper_binding.a:
 	$(MAKE) -C go-piper libpiper_binding.a example/main

-get-sources: go-llama go-ggllm go-llama-grammar go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
 	touch $@

 replace:
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp-grammar=$(shell pwd)/go-llama-grammar
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
@@ -241,7 +247,7 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C go-llama clean
-	$(MAKE) -C go-llama-grammar clean
+	$(MAKE) -C go-llama-stable clean
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C go-ggml-transformers clean
 	$(MAKE) -C go-rwkv clean
@@ -253,13 +259,15 @@ rebuild: ## Rebuilds the project
 	$(MAKE) -C go-ggllm clean
 	$(MAKE) build

-prepare: prepare-sources $(OPTIONAL_TARGETS) 
+prepare: prepare-sources $(OPTIONAL_TARGETS)
 	touch $@

 clean: ## Remove build related file
 	$(GOCMD) clean -cache
-	rm -fr ./go-llama
+	rm -f prepare
+	rm -rf ./go-llama
 	rm -rf ./gpt4all
+	rm -rf ./go-llama-stable
 	rm -rf ./go-gpt2
 	rm -rf ./go-stable-diffusion
 	rm -rf ./go-ggml-transformers
@@ -311,9 +319,10 @@ test: prepare test-models/testmodel grpcs
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
+	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

@@ -325,6 +334,10 @@ test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg

+test-llama-gguf: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
@@ -356,7 +369,13 @@ protogen-go:
    pkg/grpc/proto/backend.proto

 protogen-python:
-	python -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/autogptq/ --grpc_python_out=extra/grpc/autogptq/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto

 ## GRPC

@@ -368,6 +387,7 @@ backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/

 backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
 # TODO: every binary should have its own folder instead, so can have different metal implementations
@@ -375,9 +395,10 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

-backend-assets/grpc/llama-grammar: backend-assets/grpc go-llama-grammar/libbinding.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-grammar LIBRARY_PATH=$(shell pwd)/go-llama-grammar \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-grammar ./cmd/grpc/llama-grammar/
+backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a # headers and libbinding.a must both come from go-llama-stable
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama-stable \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/

 backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
@@ -442,4 +463,4 @@ backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/

-grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/llama-grammar backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+grpcs: prepare $(GRPC_BACKENDS)
--- a/README.md
+++ b/README.md
@@ -1,208 +1,135 @@
 <h1 align="center">
  <br>
-  <img height="300" src="https://user-images.githubusercontent.com/2420543/233147843-88697415-6dbf-4368-a862-ab217f9f7342.jpeg"> <br>
+  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
    LocalAI
 <br>
 </h1>

-[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml) [![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)
+<p align="center">
+<a href="https://github.com/go-skynet/LocalAI/fork" target="blank">
+<img src="https://img.shields.io/github/forks/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI forks"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/stargazers" target="blank">
+<img src="https://img.shields.io/github/stars/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI stars"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/pulls" target="blank">
+<img src="https://img.shields.io/github/issues-pr/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI pull-requests"/>
+</a>
+<a href='https://github.com/go-skynet/LocalAI/releases'>
+<img src='https://img.shields.io/github/release/go-skynet/LocalAI?&label=Latest&style=for-the-badge'>
+</a>
+</p>

-[![](https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted)](https://discord.gg/uJAeKSAGDy) 
+> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
+> 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/)

-[Documentation website](https://localai.io/)

-**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.
+[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-For a list of the supported model families, please see [the model compatibility table](https://localai.io/model-compatibility/index.html#model-compatibility-table).
+**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families that are compatible with the ggml format, PyTorch, and more. It does not require a GPU.
+
+<p align="center"><b>Follow LocalAI </b></p>
+
+<p align="center">
+<a href="https://twitter.com/LocalAI_API" target="blank">
+<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
+</a>
+<a href="https://discord.gg/uJAeKSAGDy" target="blank">
+<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
+</a>
+
+<p align="center"><b>Connect with the Creator </b></p>
+
+<p align="center">
+<a href="https://twitter.com/mudler_it" target="blank">
+<img src="https://img.shields.io/twitter/follow/mudler_it?label=Follow: mudler_it&style=social" alt="Follow mudler_it"/>
+</a>
+<a href='https://github.com/mudler'>
+<img alt="Follow on Github" src="https://img.shields.io/badge/Follow-mudler-black?logo=github&link=https%3A%2F%2Fgithub.com%2Fmudler">
+</a>
+</p>
+
+<p align="center"><b>Share LocalAI Repository</b></p>
+
+<p align="center">
+
+<a href="https://twitter.com/intent/tweet?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Lets%20you%20easily%20run%20LLMs%20locally.&url=https://github.com/go-skynet/LocalAI&hashtags=LocalAI,AI" target="blank">
+<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a> 
+<a href="https://t.me/share/url?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Lets%20you%20easily%20run%20LLMs%20locally.&url=https://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Telegram&logo=Telegram&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Telegram"/></a>
+<a href="https://api.whatsapp.com/send?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Lets%20you%20easily%20run%20LLMs%20locally.%20https://github.com/go-skynet/LocalAI"><img src="https://img.shields.io/twitter/url?label=whatsapp&logo=whatsapp&style=social&url=https://github.com/go-skynet/LocalAI" /></a>
+<a href="https://www.reddit.com/submit?url=https://github.com/go-skynet/LocalAI&title=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Lets%20you%20easily%20run%20LLMs%20locally." target="blank">
+<img src="https://img.shields.io/twitter/url?label=Reddit&logo=Reddit&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Reddit"/>
+</a> <a href="mailto:?subject=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Lets%20you%20easily%20run%20LLMs%20locally.%3A%0Ahttps://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Gmail&logo=Gmail&style=social&url=https://github.com/go-skynet/LocalAI"/></a> <a href="https://www.buymeacoffee.com/mudler" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="23" width="100" style="border-radius:1px"></a>
+
+</p>
+
+<hr>

 In a nutshell:

 - Local, OpenAI drop-in alternative REST API. You own your data.
 - NO GPU required. NO Internet access is required either
  - Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html). 
- Supports multiple models:
-  - 📖 Text generation with GPTs (`llama.cpp`, `gpt4all.cpp`, ... and more)
-  - 🗣 Text to Audio 🎺🆕
-  - 🔈 Audio to Text (Audio transcription with `whisper.cpp`)
-  - 🎨 Image generation with stable diffusion
+- Supports multiple models
 - 🏃 Once loaded the first time, it keep models loaded in memory for faster inference
- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance. 
+- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance.

 LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! 

-See the [Getting started](https://localai.io/basics/getting_started/index.html) and [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) sections to learn how to use LocalAI. For a list of curated models check out the [model gallery](https://localai.io/models/).
+Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
+
+## 🔥🔥 [Hot topics / Roadmap](https://localai.io/#-hot-topics--roadmap)
+
+## 🚀 [Features](https://localai.io/features/)
+
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
+- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
+- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕
+- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
+- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
+- 🖼️ [Download Models directly from Hugging Face](https://localai.io/models/)


-| [ChatGPT OSS alternative](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui)                                                                                                                | [Image generation](https://localai.io/api-endpoints/index.html#image-generation)                                                                                                              |
-|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
-|  ![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)            | ![b6441997879](https://github.com/go-skynet/LocalAI/assets/2420543/d50af51c-51b7-4f39-b6c2-bf04c403894c)                  |
-
-|                                                                    [Telegram bot](https://github.com/go-skynet/LocalAI/tree/master/examples/telegram-bot)   | [Flowise](https://github.com/go-skynet/LocalAI/tree/master/examples/flowise)                                                                                                                     |
-|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
-![Screenshot from 2023-06-09 00-36-26](https://github.com/go-skynet/LocalAI/assets/2420543/e98b4305-fa2d-41cf-9d2f-1bb2d75ca902)   |  ![Screenshot from 2023-05-30 18-01-03](https://github.com/go-skynet/LocalAI/assets/2420543/02458782-0549-4131-971c-95ee56ec1af8)|    |
-
-## Hot topics / Roadmap
-
- [x] Support for embeddings
- [x] Support for audio transcription with https://github.com/ggerganov/whisper.cpp
- [X] Support for text-to-audio
- [x] GPU/CUDA support ( https://github.com/go-skynet/LocalAI/issues/69 )
- [X] Enable automatic downloading of models from a curated gallery
- [X] Enable automatic downloading of models from HuggingFace
- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351) 
- [ ] Enable gallery management directly from the webui.
- [ ] 🔥 OpenAI functions: https://github.com/go-skynet/LocalAI/issues/588
-
-## News
-
- 🔥🔥🔥 28-06-2023: **v1.20.0**: Added text to audio and gallery huggingface repositories! [Release notes](https://localai.io/basics/news/index.html#-28-06-2023-__v1200__-) [Changelog](https://github.com/go-skynet/LocalAI/releases/tag/v1.20.0)
- 🔥🔥🔥 19-06-2023: **v1.19.0**: CUDA support! [Release notes](https://localai.io/basics/news/index.html#-19-06-2023-__v1190__-) [Changelog](https://github.com/go-skynet/LocalAI/releases/tag/v1.19.0)
- 🔥🔥🔥 06-06-2023: **v1.18.0**: Many updates, new features, and much more 🚀, check out the [Release notes](https://localai.io/basics/news/index.html#-06-06-2023-__v1180__-)!
- 29-05-2023: LocalAI now has a website, [https://localai.io](https://localai.io)! check the news in the [dedicated section](https://localai.io/basics/news/index.html)!
-
-For latest news, follow also on Twitter [@LocalAI_API](https://twitter.com/LocalAI_API) and [@mudler_it](https://twitter.com/mudler_it)
-
-## Media, Blogs, Social
+## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

 - [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
 - [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
 - [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
 - [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)

-## Contribute and help
+## 💻 Usage

-To help the project you can:
+Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

- [Hacker news post](https://news.ycombinator.com/item?id=35726934) - help us out by voting if you like this project.
+### 💡 Example: Use Luna-AI Llama model

- If you have technological skills and want to contribute to development, have a look at the open issues. If you are new you can have a look at the [good-first-issue](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and [help-wanted](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels.
+See the [documentation](https://localai.io/basics/getting_started)

- If you don't have technological skills you can still help improving documentation or add examples or share your user-stories with our community, any help and contribution is welcome!
+### 🔗 Resources

-## Usage
+- [How to build locally](https://localai.io/basics/build/index.html)
+- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
+- [Projects integrating LocalAI](https://localai.io/integrations/)
+- [How-tos section](https://localai.io/howtos/) (curated by our community)
+  
+## Citation

-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section. Here below you will find generic, quick instructions to get ready and use LocalAI.
-
-The easiest way to run LocalAI is by using `docker-compose` (to build locally, see [building LocalAI](https://localai.io/basics/build/index.html)):
-
-```bash
-
-git clone https://github.com/go-skynet/LocalAI
-
-cd LocalAI
-
-# (optional) Checkout a specific LocalAI tag
-# git checkout -b build <TAG>
-
-# copy your models to models/
-cp your-model.bin models/
-
-# (optional) Edit the .env file to set things like context size and threads
-# vim .env
-
-# start with docker-compose
-docker-compose up -d --pull always
-# or you can build the images with:
-# docker-compose up -d --build
-
-# Now API is accessible at localhost:8080
-curl http://localhost:8080/v1/models
-# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
-
-curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
-     "model": "your-model.bin",            
-     "prompt": "A long time ago in a galaxy far, far away",
-     "temperature": 0.7
-   }'
-```
-
-### Example: Use GPT4ALL-J model
-
-<details>
-
-```bash
-# Clone LocalAI
-git clone https://github.com/go-skynet/LocalAI
-
-cd LocalAI
-
-# (optional) Checkout a specific LocalAI tag
-# git checkout -b build <TAG>
-
-# Download gpt4all-j to models/
-wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
-
-# Use a template from the examples
-cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
-
-# (optional) Edit the .env file to set things like context size and threads
-# vim .env
-
-# start with docker-compose
-docker-compose up -d --pull always
-# or you can build the images with:
-# docker-compose up -d --build
-# Now API is accessible at localhost:8080
-curl http://localhost:8080/v1/models
-# {"object":"list","data":[{"id":"ggml-gpt4all-j","object":"model"}]}
-
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-gpt4all-j",
-     "messages": [{"role": "user", "content": "How are you?"}],
-     "temperature": 0.9 
-   }'
-
-# {"model":"ggml-gpt4all-j","choices":[{"message":{"role":"assistant","content":"I'm doing well, thanks. How about you?"}}]}
-```
-</details>
-
-
-### Build locally
-
-<details>
-
-In order to build the `LocalAI` container image locally you can use `docker`:
+If you use this repository or its data in a downstream project, please consider citing it with:

 ```
-# build the image
-docker build -t localai .
-docker run localai
+@misc{localai,
+  author = {Ettore Di Giacinto},
+  title = {LocalAI: The free, Open source OpenAI alternative},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/go-skynet/LocalAI}}}
 ```

-Or you can build the binary with `make`:
-
-```
-make build
-```
-
-</details>
-
-See the [build section](https://localai.io/basics/build/index.html) in our documentation for detailed instructions.
-
-### Run LocalAI in Kubernetes
-
-LocalAI can be installed inside Kubernetes with helm. See [installation instructions](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes).
-
-## Supported API endpoints
-
-See the [list of the LocalAI features](https://localai.io/features/index.html) for a full tour of the available API endpoints.
-
-## Frequently asked questions
-
-See [the FAQ](https://localai.io/faq/index.html) section for a list of common questions.
-
-## Projects already using LocalAI to run local models
-
-Feel free to open up a PR to get your project listed!
-
- [Kairos](https://github.com/kairos-io/kairos)
- [k8sgpt](https://github.com/k8sgpt-ai/k8sgpt#running-local-models)
- [Spark](https://github.com/cedriking/spark)
- [autogpt4all](https://github.com/aorumbayev/autogpt4all)
- [Mods](https://github.com/charmbracelet/mods)
- [Flowise](https://github.com/FlowiseAI/Flowise)
-
-## Sponsors
+## ❤️ Sponsors

 > Do you find LocalAI useful?

@@ -215,21 +142,22 @@ A huge thank you to our generous sponsors who support this project:
 |  [Spectro Cloud](https://www.spectrocloud.com/)  |  
 |  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs!  |

-## Star history
+And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project. 
+
+- [Sponsor list](https://github.com/sponsors/mudler)
+- JDAM00 (donating HW for the CI)
+
+## 🌟 Star history

 [![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)

-## License
+## 📖 License

 LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).

-MIT
+MIT - Author Ettore Di Giacinto

-## Author
-
-Ettore Di Giacinto and others
-
-## Acknowledgements
+## 🙇 Acknowledgements

 LocalAI couldn't have been built without the help of great software already available from the community. Thank you!

@@ -240,9 +168,12 @@ LocalAI couldn't have been built without the help of great software already avai
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/ggerganov/whisper.cpp
 - https://github.com/saharNooby/rwkv.cpp
+- https://github.com/rhasspy/piper
+- https://github.com/cmp-nct/ggllm.cpp

-## Contributors
+## 🤗 Contributors

+This is a community project; special thanks to our contributors! 🤗
 <a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
 </a>
--- a/api/api.go
+++ b/api/api.go
@@ -2,11 +2,14 @@ package api

 import (
 	"errors"
+	"fmt"
+	"strings"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/localai"
 	"github.com/go-skynet/LocalAI/api/openai"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/internal"
 	"github.com/go-skynet/LocalAI/pkg/assets"

@@ -18,7 +21,7 @@ import (
 	"github.com/rs/zerolog/log"
 )

-func App(opts ...options.AppOption) (*fiber.App, error) {
+func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader, error) {
 	options := options.NewOptions(opts...)

 	zerolog.SetGlobalLevel(zerolog.InfoLevel)
@@ -26,6 +29,65 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		zerolog.SetGlobalLevel(zerolog.DebugLevel)
 	}

+	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
+	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
+
+	cl := config.NewConfigLoader()
+	if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
+		log.Error().Msgf("error loading config files: %s", err.Error())
+	}
+
+	if options.ConfigFile != "" {
+		if err := cl.LoadConfigFile(options.ConfigFile); err != nil {
+			log.Error().Msgf("error loading config file: %s", err.Error())
+		}
+	}
+
+	if options.Debug {
+		for _, v := range cl.ListConfigs() {
+			cfg, _ := cl.GetConfig(v)
+			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
+		}
+	}
+
+	if options.AssetsDestination != "" {
+		// Extract files from the embedded FS
+		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
+		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
+		if err != nil {
+			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
+		}
+	}
+
+	if options.PreloadJSONModels != "" {
+		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	if options.PreloadModelsFromPath != "" {
+		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	// turn off any process that was started by GRPC if the context is canceled
+	go func() {
+		<-options.Context.Done()
+		log.Debug().Msgf("Context canceled, shutting down")
+		options.Loader.StopAllGRPC()
+	}()
+
+	return options, cl, nil
+}
+
+func App(opts ...options.AppOption) (*fiber.App, error) {
+
+	options, cl, err := Startup(opts...)
+	if err != nil {
+		return nil, fmt.Errorf("failed basic startup tasks with error %s", err.Error())
+	}
+
 	// Return errors as JSON responses
 	app := fiber.New(fiber.Config{
 		BodyLimit:             options.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
@@ -43,8 +105,8 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 			// Send custom error page
 			return ctx.Status(code).JSON(
-				openai.ErrorResponse{
-					Error: &openai.APIError{Message: err.Error(), Code: code},
+				schema.ErrorResponse{
+					Error: &schema.APIError{Message: err.Error(), Code: code},
 				},
 			)
 		},
@@ -56,49 +118,33 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 		}))
 	}

-	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
-	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
-
-	cm := config.NewConfigLoader()
-	if err := cm.LoadConfigs(options.Loader.ModelPath); err != nil {
-		log.Error().Msgf("error loading config files: %s", err.Error())
-	}
-
-	if options.ConfigFile != "" {
-		if err := cm.LoadConfigFile(options.ConfigFile); err != nil {
-			log.Error().Msgf("error loading config file: %s", err.Error())
-		}
-	}
-
-	if options.Debug {
-		for _, v := range cm.ListConfigs() {
-			cfg, _ := cm.GetConfig(v)
-			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
-		}
-	}
-
-	if options.AssetsDestination != "" {
-		// Extract files from the embedded FS
-		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
-		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
-		if err != nil {
-			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
-		}
-	}
-
 	// Default middleware config
 	app.Use(recover.New())

-	if options.PreloadJSONModels != "" {
-		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cm, options.Galleries); err != nil {
-			return nil, err
-		}
-	}
+	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
+	auth := func(c *fiber.Ctx) error {
+		if len(options.ApiKeys) > 0 {
+			authHeader := c.Get("Authorization")
+			if authHeader == "" {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
+			}
+			authHeaderParts := strings.Split(authHeader, " ")
+			if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
+			}

-	if options.PreloadModelsFromPath != "" {
-		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cm, options.Galleries); err != nil {
-			return nil, err
+			apiKey := authHeaderParts[1]
+			validApiKey := false
+			for _, key := range options.ApiKeys {
+				if apiKey == key {
+					validApiKey = true
+				}
+			}
+			if !validApiKey {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
+			}
 		}
+		return c.Next()
 	}

 	if options.CORS {
@@ -114,44 +160,49 @@ func App(opts ...options.AppOption) (*fiber.App, error) {

 	// LocalAI API endpoints
 	galleryService := localai.NewGalleryService(options.Loader.ModelPath)
-	galleryService.Start(options.Context, cm)
+	galleryService.Start(options.Context, cl)

-	app.Get("/version", func(c *fiber.Ctx) error {
+	app.Get("/version", auth, func(c *fiber.Ctx) error {
 		return c.JSON(struct {
 			Version string `json:"version"`
 		}{Version: internal.PrintableVersion()})
 	})

-	app.Post("/models/apply", localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cm, galleryService.C, options.Galleries))
-	app.Get("/models/available", localai.ListModelFromGalleryEndpoint(options.Galleries, options.Loader.ModelPath))
-	app.Get("/models/jobs/:uuid", localai.GetOpStatusEndpoint(galleryService))
+	modelGalleryService := localai.CreateModelGalleryService(options.Galleries, options.Loader.ModelPath, galleryService)
+	app.Post("/models/apply", auth, modelGalleryService.ApplyModelGalleryEndpoint())
+	app.Get("/models/available", auth, modelGalleryService.ListModelFromGalleryEndpoint())
+	app.Get("/models/galleries", auth, modelGalleryService.ListModelGalleriesEndpoint())
+	app.Post("/models/galleries", auth, modelGalleryService.AddModelGalleryEndpoint())
+	app.Delete("/models/galleries", auth, modelGalleryService.RemoveModelGalleryEndpoint())
+	app.Get("/models/jobs/:uuid", auth, modelGalleryService.GetOpStatusEndpoint())
+	app.Get("/models/jobs", auth, modelGalleryService.GetAllStatusEndpoint())

 	// openAI compatible API endpoint

 	// chat
-	app.Post("/v1/chat/completions", openai.ChatEndpoint(cm, options))
-	app.Post("/chat/completions", openai.ChatEndpoint(cm, options))
+	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, options))
+	app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, options))

 	// edit
-	app.Post("/v1/edits", openai.EditEndpoint(cm, options))
-	app.Post("/edits", openai.EditEndpoint(cm, options))
+	app.Post("/v1/edits", auth, openai.EditEndpoint(cl, options))
+	app.Post("/edits", auth, openai.EditEndpoint(cl, options))

 	// completion
-	app.Post("/v1/completions", openai.CompletionEndpoint(cm, options))
-	app.Post("/completions", openai.CompletionEndpoint(cm, options))
-	app.Post("/v1/engines/:model/completions", openai.CompletionEndpoint(cm, options))
+	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, options))
+	app.Post("/completions", auth, openai.CompletionEndpoint(cl, options))
+	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, options))

 	// embeddings
-	app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(cm, options))
-	app.Post("/embeddings", openai.EmbeddingsEndpoint(cm, options))
-	app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(cm, options))
+	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
+	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
+	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))

 	// audio
-	app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(cm, options))
-	app.Post("/tts", localai.TTSEndpoint(cm, options))
+	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, options))
+	app.Post("/tts", auth, localai.TTSEndpoint(cl, options))

 	// images
-	app.Post("/v1/images/generations", openai.ImageEndpoint(cm, options))
+	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, options))

 	if options.ImageDir != "" {
 		app.Static("/generated-images", options.ImageDir)
@@ -169,16 +220,14 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	app.Get("/healthz", ok)
 	app.Get("/readyz", ok)

-	// models
-	app.Get("/v1/models", openai.ListModelsEndpoint(options.Loader, cm))
-	app.Get("/models", openai.ListModelsEndpoint(options.Loader, cm))
+	// Experimental Backend Statistics Module
+	backendMonitor := localai.NewBackendMonitor(cl, options) // Split out for now
+	app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitor))
+	app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitor))

-	// turn off any process that was started by GRPC if the context is canceled
-	go func() {
-		<-options.Context.Done()
-		log.Debug().Msgf("Context canceled, shutting down")
-		options.Loader.StopGRPC()
-	}()
+	// models
+	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
+	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))

 	return app, nil
 }
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -8,7 +8,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"io/ioutil"
 	"net/http"
 	"os"
 	"path/filepath"
@@ -30,10 +29,10 @@ import (
 )

 type modelApplyRequest struct {
-	ID        string            `json:"id"`
-	URL       string            `json:"url"`
-	Name      string            `json:"name"`
-	Overrides map[string]string `json:"overrides"`
+	ID        string                 `json:"id"`
+	URL       string                 `json:"url"`
+	Name      string                 `json:"name"`
+	Overrides map[string]interface{} `json:"overrides"`
 }

 func getModelStatus(url string) (response map[string]interface{}) {
@@ -45,7 +44,7 @@ func getModelStatus(url string) (response map[string]interface{}) {
 	}
 	defer resp.Body.Close()

-	body, err := ioutil.ReadAll(resp.Body)
+	body, err := io.ReadAll(resp.Body)
 	if err != nil {
 		fmt.Println("Error reading response body:", err)
 		return
@@ -97,7 +96,7 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
 	}
 	defer resp.Body.Close()

-	body, err := ioutil.ReadAll(resp.Body)
+	body, err := io.ReadAll(resp.Body)
 	if err != nil {
 		fmt.Println("Error reading response body:", err)
 		return
@@ -153,7 +152,7 @@ var _ = Describe("API test", func() {
 			}
 			out, err := yaml.Marshal(g)
 			Expect(err).ToNot(HaveOccurred())
-			err = ioutil.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0644)
+			err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0644)
 			Expect(err).ToNot(HaveOccurred())

 			galleries := []gallery.Gallery{
@@ -243,7 +242,7 @@ var _ = Describe("API test", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:  "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
 					Name: "bert",
-					Overrides: map[string]string{
+					Overrides: map[string]interface{}{
 						"backend": "llama",
 					},
 				})
@@ -269,7 +268,7 @@ var _ = Describe("API test", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
 					Name:      "bert",
-					Overrides: map[string]string{},
+					Overrides: map[string]interface{}{},
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -297,7 +296,7 @@ var _ = Describe("API test", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 					Name:      "openllama_3b",
-					Overrides: map[string]string{"backend": "llama-grammar"},
+					Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -352,6 +351,82 @@ var _ = Describe("API test", func() {
 				Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
 				Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)

+				var res map[string]string
+				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(res["location"]).To(Equal("San Francisco, California, United States"), fmt.Sprint(res))
+				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+			})
+
+			It("runs openllama gguf", Label("llama-gguf"), func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+				modelName := "codellama"
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					URL:       "github:go-skynet/model-gallery/codellama-7b-instruct.yaml",
+					Name:      modelName,
+					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+
+				Eventually(func() bool {
+					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					return response["processed"].(bool)
+				}, "360s", "10s").Should(Equal(true))
+
+				By("testing chat")
+				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: modelName, Messages: []openai.ChatCompletionMessage{
+					{
+						Role:    "user",
+						Content: "How much is 2+2?",
+					},
+				}})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices)).To(Equal(1))
+				Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")))
+
+				By("testing functions")
+				resp2, err := client.CreateChatCompletion(
+					context.TODO(),
+					openai.ChatCompletionRequest{
+						Model: modelName,
+						Messages: []openai.ChatCompletionMessage{
+							{
+								Role:    "user",
+								Content: "What is the weather like in San Francisco (celsius)?",
+							},
+						},
+						Functions: []openai.FunctionDefinition{
+							openai.FunctionDefinition{
+								Name:        "get_current_weather",
+								Description: "Get the current weather",
+								Parameters: jsonschema.Definition{
+									Type: jsonschema.Object,
+									Properties: map[string]jsonschema.Definition{
+										"location": {
+											Type:        jsonschema.String,
+											Description: "The city and state, e.g. San Francisco, CA",
+										},
+										"unit": {
+											Type: jsonschema.String,
+											Enum: []string{"celcius", "fahrenheit"},
+										},
+									},
+									Required: []string{"location"},
+								},
+							},
+						},
+					})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp2.Choices)).To(Equal(1))
+				Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+				Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
 				var res map[string]string
 				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
 				Expect(err).ToNot(HaveOccurred())
@@ -366,9 +441,8 @@ var _ = Describe("API test", func() {
 				}

 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					URL:       "github:go-skynet/model-gallery/gpt4all-j.yaml",
-					Name:      "gpt4all-j",
-					Overrides: map[string]string{},
+					URL:  "github:go-skynet/model-gallery/gpt4all-j.yaml",
+					Name: "gpt4all-j",
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -472,6 +546,9 @@ var _ = Describe("API test", func() {

 			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 				ID: "model-gallery@stablediffusion",
+				Overrides: map[string]interface{}{
+					"parameters": map[string]interface{}{"model": "stablediffusion_assets"},
+				},
 			})

 			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -536,7 +613,7 @@ var _ = Describe("API test", func() {
 		It("returns the models list", func() {
 			models, err := client.ListModels(context.TODO())
 			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(11))
+			Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
 		})
 		It("can generate completions", func() {
 			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: "abcdedfghikl"})
@@ -740,19 +817,14 @@ var _ = Describe("API test", func() {
 			cancel()
 			app.Shutdown()
 		})
-		It("can generate chat completions from config file", func() {
-			models, err := client.ListModels(context.TODO())
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(13))
-		})
-		It("can generate chat completions from config file", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+		It("can generate chat completions from config file (list1)", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
 		})
-		It("can generate chat completions from config file", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+		It("can generate chat completions from config file (list2)", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
--- a/api/backend/embeddings.go
+++ b/api/backend/embeddings.go
@@ -2,7 +2,6 @@ package backend

 import (
 	"fmt"
-	"sync"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
@@ -22,17 +21,13 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
 	var inferenceModel interface{}
 	var err error

-	opts := []model.Option{
-		model.WithLoadGRPCLLMModelOpts(grpcOpts),
+	opts := modelOpts(c, o, []model.Option{
+		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(c.Threads)),
 		model.WithAssetDir(o.AssetsDestination),
-		model.WithModelFile(modelFile),
+		model.WithModel(modelFile),
 		model.WithContext(o.Context),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
+	})

 	if c.Backend == "" {
 		inferenceModel, err = loader.GreedyLoader(opts...)
@@ -80,18 +75,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
 	}

 	return func() ([]float32, error) {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[modelFile]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[modelFile] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
 		embeds, err := fn()
 		if err != nil {
 			return embeds, err
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -1,31 +1,33 @@
 package backend

 import (
-	"fmt"
-	"sync"
-
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
-	if c.Backend != model.StableDiffusionBackend {
-		return nil, fmt.Errorf("endpoint only working with stablediffusion models")
-	}
+func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {

-	opts := []model.Option{
+	opts := modelOpts(c, o, []model.Option{
 		model.WithBackendString(c.Backend),
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithThreads(uint32(c.Threads)),
 		model.WithContext(o.Context),
-		model.WithModelFile(c.ImageGenerationAssets),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
+		model.WithModel(c.Model),
+		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
+			CUDA:          c.Diffusers.CUDA,
+			SchedulerType: c.Diffusers.SchedulerType,
+			PipelineType:  c.Diffusers.PipelineType,
+			CFGScale:      c.Diffusers.CFGScale,
+			LoraAdapter:   c.LoraAdapter,
+			LoraBase:      c.LoraBase,
+			IMG2IMG:       c.Diffusers.IMG2IMG,
+			CLIPModel:     c.Diffusers.ClipModel,
+			CLIPSubfolder: c.Diffusers.ClipSubFolder,
+			CLIPSkip:      int32(c.Diffusers.ClipSkip),
+		}),
+	})

 	inferenceModel, err := loader.BackendLoader(
 		opts...,
@@ -38,31 +40,20 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 		_, err := inferenceModel.GenerateImage(
 			o.Context,
 			&proto.GenerateImageRequest{
-				Height:         int32(height),
-				Width:          int32(width),
-				Mode:           int32(mode),
-				Step:           int32(step),
-				Seed:           int32(seed),
-				PositivePrompt: positive_prompt,
-				NegativePrompt: negative_prompt,
-				Dst:            dst,
+				Height:           int32(height),
+				Width:            int32(width),
+				Mode:             int32(mode),
+				Step:             int32(step),
+				Seed:             int32(seed),
+				CLIPSkip:         int32(c.Diffusers.ClipSkip),
+				PositivePrompt:   positive_prompt,
+				NegativePrompt:   negative_prompt,
+				Dst:              dst,
+				Src:              src,
+				EnableParameters: c.Diffusers.EnableParameters,
 			})
 		return err
 	}

-	return func() error {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[c.Backend]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[c.Backend] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
-		return fn()
-	}, nil
+	return fn, nil
 }
--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -1,10 +1,12 @@
 package backend

 import (
+	"context"
 	"os"
 	"regexp"
 	"strings"
 	"sync"
+	"unicode/utf8"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
@@ -14,7 +16,17 @@ import (
 	"github.com/go-skynet/LocalAI/pkg/utils"
 )

-func ModelInference(s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string) bool) (func() (string, error), error) {
+type LLMResponse struct {
+	Response string // should this be []byte?
+	Usage    TokenUsage
+}
+
+type TokenUsage struct {
+	Prompt     int
+	Completion int
+}
+
+func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model

 	grpcOpts := gRPCModelOpts(c)
@@ -22,17 +34,13 @@ func ModelInference(s string, loader *model.ModelLoader, c config.Config, o *opt
 	var inferenceModel *grpc.Client
 	var err error

-	opts := []model.Option{
-		model.WithLoadGRPCLLMModelOpts(grpcOpts),
+	opts := modelOpts(c, o, []model.Option{
+		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
-		model.WithModelFile(modelFile),
+		model.WithModel(modelFile),
 		model.WithContext(o.Context),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
+	})

 	if c.Backend != "" {
 		opts = append(opts, model.WithBackendString(c.Backend))
@@ -61,40 +69,71 @@ func ModelInference(s string, loader *model.ModelLoader, c config.Config, o *opt
 	}

 	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
-	fn := func() (string, error) {
+	fn := func() (LLMResponse, error) {
 		opts := gRPCPredictOpts(c, loader.ModelPath)
 		opts.Prompt = s
+
+		tokenUsage := TokenUsage{}
+
+		// check the per-model feature flag for usage, since tokenCallback may have a cost.
+		// Defaults to off as for now it is still experimental
+		if c.FeatureFlag.Enabled("usage") {
+			userTokenCallback := tokenCallback
+			if userTokenCallback == nil {
+				userTokenCallback = func(token string, usage TokenUsage) bool {
+					return true
+				}
+			}
+
+			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
+			if pErr == nil && promptInfo.Length > 0 {
+				tokenUsage.Prompt = int(promptInfo.Length)
+			}
+
+			tokenCallback = func(token string, usage TokenUsage) bool {
+				tokenUsage.Completion++
+				return userTokenCallback(token, tokenUsage)
+			}
+		}
+
 		if tokenCallback != nil {
 			ss := ""
-			err := inferenceModel.PredictStream(o.Context, opts, func(s string) {
-				tokenCallback(s)
-				ss += s
+
+			var partialRune []byte
+			err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
+				partialRune = append(partialRune, chars...)
+
+				for len(partialRune) > 0 {
+					if !utf8.FullRune(partialRune) {
+						// incomplete rune, wait for more bytes
+						break
+					}
+					// Invalid bytes and a literal U+FFFD both decode to RuneError: emit and advance instead of stalling the stream.
+					r, size := utf8.DecodeRune(partialRune)
+
+					tokenCallback(string(r), tokenUsage)
+					ss += string(r)
+					partialRune = partialRune[size:]
+				}
+			})
-			return ss, err
+			return LLMResponse{
+				Response: ss,
+				Usage:    tokenUsage,
+			}, err
 		} else {
-			reply, err := inferenceModel.Predict(o.Context, opts)
+			// TODO: Is the chicken bit the only way to get here? is that acceptable?
+			reply, err := inferenceModel.Predict(ctx, opts)
 			if err != nil {
-				return "", err
+				return LLMResponse{}, err
 			}
-			return reply.Message, err
+			return LLMResponse{
+				Response: string(reply.Message),
+				Usage:    tokenUsage,
+			}, err
 		}
 	}

-	return func() (string, error) {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[modelFile]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[modelFile] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
-		return fn()
-	}, nil
+	return fn, nil
 }

 var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
--- a/api/backend/lock.go
+++ b/api/backend/lock.go
@@ -1,22 +0,0 @@
-package backend
-
-import "sync"
-
-// mutex still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-var mutexMap sync.Mutex
-var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)
-
-func Lock(s string) *sync.Mutex {
-	// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-	mutexMap.Lock()
-	l, ok := mutexes[s]
-	if !ok {
-		m := &sync.Mutex{}
-		mutexes[s] = m
-		l = m
-	}
-	mutexMap.Unlock()
-	l.Lock()
-
-	return l
-}
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -5,29 +5,69 @@ import (
 	"path/filepath"

 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	model "github.com/go-skynet/LocalAI/pkg/model"

 	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
 )

+// modelOpts appends the loader options derived from o and c to opts and returns the extended slice.
+func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.Option {
+	if o.SingleBackend {
+		opts = append(opts, model.WithSingleActiveBackend())
+	}
+	// The gRPC retry options are only added when the model config sets them to a non-zero value.
+	if attempts := c.GRPC.Attempts; attempts != 0 {
+		opts = append(opts, model.WithGRPCAttempts(attempts))
+	}
+	if delay := c.GRPC.AttemptsSleepTime; delay != 0 {
+		opts = append(opts, model.WithGRPCAttemptsDelay(delay))
+	}
+	// Every entry of the external gRPC backend map becomes its own option.
+	for name, target := range o.ExternalGRPCBackends {
+		opts = append(opts, model.WithExternalBackend(name, target))
+	}
+
+	return opts
+}
+
 func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 	b := 512
 	if c.Batch != 0 {
 		b = c.Batch
 	}
+
 	return &pb.ModelOptions{
-		ContextSize: int32(c.ContextSize),
-		Seed:        int32(c.Seed),
-		NBatch:      int32(b),
-		F16Memory:   c.F16,
-		MLock:       c.MMlock,
-		NUMA:        c.NUMA,
-		Embeddings:  c.Embeddings,
-		LowVRAM:     c.LowVRAM,
-		NGPULayers:  int32(c.NGPULayers),
-		MMap:        c.MMap,
-		MainGPU:     c.MainGPU,
-		Threads:     int32(c.Threads),
-		TensorSplit: c.TensorSplit,
+		ContextSize:   int32(c.ContextSize),
+		Seed:          int32(c.Seed),
+		NBatch:        int32(b),
+		NoMulMatQ:     c.NoMulMatQ,
+		DraftModel:    c.DraftModel,
+		AudioPath:     c.VallE.AudioPath,
+		Quantization:  c.Quantization,
+		LoraAdapter:   c.LoraAdapter,
+		LoraBase:      c.LoraBase,
+		NGQA:          c.NGQA,
+		RMSNormEps:    c.RMSNormEps,
+		F16Memory:     c.F16,
+		MLock:         c.MMlock,
+		RopeFreqBase:  c.RopeFreqBase,
+		RopeFreqScale: c.RopeFreqScale,
+		NUMA:          c.NUMA,
+		Embeddings:    c.Embeddings,
+		LowVRAM:       c.LowVRAM,
+		NGPULayers:    int32(c.NGPULayers),
+		MMap:          c.MMap,
+		MainGPU:       c.MainGPU,
+		Threads:       int32(c.Threads),
+		TensorSplit:   c.TensorSplit,
+		// AutoGPTQ
+		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
+		Device:           c.AutoGPTQ.Device,
+		UseTriton:        c.AutoGPTQ.Triton,
+		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
+		// RWKV
+		Tokenizer: c.Tokenizer,
 	}
 }

@@ -39,34 +79,38 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
 		promptCachePath = p
 	}
 	return &pb.PredictOptions{
-		Temperature:     float32(c.Temperature),
-		TopP:            float32(c.TopP),
-		TopK:            int32(c.TopK),
-		Tokens:          int32(c.Maxtokens),
-		Threads:         int32(c.Threads),
-		PromptCacheAll:  c.PromptCacheAll,
-		PromptCacheRO:   c.PromptCacheRO,
-		PromptCachePath: promptCachePath,
-		F16KV:           c.F16,
-		DebugMode:       c.Debug,
-		Grammar:         c.Grammar,
-
-		Mirostat:          int32(c.Mirostat),
-		MirostatETA:       float32(c.MirostatETA),
-		MirostatTAU:       float32(c.MirostatTAU),
-		Debug:             c.Debug,
-		StopPrompts:       c.StopWords,
-		Repeat:            int32(c.RepeatPenalty),
-		NKeep:             int32(c.Keep),
-		Batch:             int32(c.Batch),
-		IgnoreEOS:         c.IgnoreEOS,
-		Seed:              int32(c.Seed),
-		FrequencyPenalty:  float32(c.FrequencyPenalty),
-		MLock:             c.MMlock,
-		MMap:              c.MMap,
-		MainGPU:           c.MainGPU,
-		TensorSplit:       c.TensorSplit,
-		TailFreeSamplingZ: float32(c.TFZ),
-		TypicalP:          float32(c.TypicalP),
+		Temperature:         float32(c.Temperature),
+		TopP:                float32(c.TopP),
+		NDraft:              c.NDraft,
+		TopK:                int32(c.TopK),
+		Tokens:              int32(c.Maxtokens),
+		Threads:             int32(c.Threads),
+		PromptCacheAll:      c.PromptCacheAll,
+		PromptCacheRO:       c.PromptCacheRO,
+		PromptCachePath:     promptCachePath,
+		F16KV:               c.F16,
+		DebugMode:           c.Debug,
+		Grammar:             c.Grammar,
+		NegativePromptScale: c.NegativePromptScale,
+		RopeFreqBase:        c.RopeFreqBase,
+		RopeFreqScale:       c.RopeFreqScale,
+		NegativePrompt:      c.NegativePrompt,
+		Mirostat:            int32(c.LLMConfig.Mirostat),
+		MirostatETA:         float32(c.LLMConfig.MirostatETA),
+		MirostatTAU:         float32(c.LLMConfig.MirostatTAU),
+		Debug:               c.Debug,
+		StopPrompts:         c.StopWords,
+		Repeat:              int32(c.RepeatPenalty),
+		NKeep:               int32(c.Keep),
+		Batch:               int32(c.Batch),
+		IgnoreEOS:           c.IgnoreEOS,
+		Seed:                int32(c.Seed),
+		FrequencyPenalty:    float32(c.FrequencyPenalty),
+		MLock:               c.MMlock,
+		MMap:                c.MMap,
+		MainGPU:             c.MainGPU,
+		TensorSplit:         c.TensorSplit,
+		TailFreeSamplingZ:   float32(c.TFZ),
+		TypicalP:            float32(c.TypicalP),
 	}
 }
--- a/api/backend/transcript.go
+++ b/api/backend/transcript.go
@@ -5,25 +5,22 @@ import (
 	"fmt"

 	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"

 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*api.Result, error) {
-	opts := []model.Option{
+func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*schema.Result, error) {
+
+	opts := modelOpts(c, o, []model.Option{
 		model.WithBackendString(model.WhisperBackend),
-		model.WithModelFile(c.Model),
+		model.WithModel(c.Model),
 		model.WithContext(o.Context),
 		model.WithThreads(uint32(c.Threads)),
 		model.WithAssetDir(o.AssetsDestination),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
+	})

 	whisperModel, err := o.Loader.BackendLoader(opts...)
 	if err != nil {
--- a/api/backend/tts.go
+++ b/api/backend/tts.go
@@ -6,6 +6,7 @@ import (
 	"os"
 	"path/filepath"

+	api_config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	model "github.com/go-skynet/LocalAI/pkg/model"
@@ -28,18 +29,17 @@ func generateUniqueFileName(dir, baseName, ext string) string {
 	}
 }

-func ModelTTS(text, modelFile string, loader *model.ModelLoader, o *options.Option) (string, *proto.Result, error) {
-	opts := []model.Option{
-		model.WithBackendString(model.PiperBackend),
-		model.WithModelFile(modelFile),
+func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option) (string, *proto.Result, error) {
+	bb := backend
+	if bb == "" {
+		bb = model.PiperBackend
+	}
+	opts := modelOpts(api_config.Config{}, o, []model.Option{
+		model.WithBackendString(bb),
+		model.WithModel(modelFile),
 		model.WithContext(o.Context),
 		model.WithAssetDir(o.AssetsDestination),
-	}
-
-	for k, v := range o.ExternalGRPCBackends {
-		opts = append(opts, model.WithExternalBackend(k, v))
-	}
-
+	})
 	piperModel, err := o.Loader.BackendLoader(opts...)
 	if err != nil {
 		return "", nil, err
@@ -56,10 +56,13 @@ func ModelTTS(text, modelFile string, loader *model.ModelLoader, o *options.Opti
 	fileName := generateUniqueFileName(o.AudioDir, "piper", ".wav")
 	filePath := filepath.Join(o.AudioDir, fileName)

-	modelPath := filepath.Join(o.Loader.ModelPath, modelFile)
-
-	if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
-		return "", nil, err
+	// If the model file is not empty, we pass it joined with the model path
+	modelPath := ""
+	if modelFile != "" {
+		modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
+		if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
+			return "", nil, err
+		}
 	}

 	res, err := piperModel.TTS(context.Background(), &proto.TTSRequest{
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -13,44 +13,104 @@ import (

 type Config struct {
 	PredictionOptions `yaml:"parameters"`
-	Name              string            `yaml:"name"`
-	StopWords         []string          `yaml:"stopwords"`
-	Cutstrings        []string          `yaml:"cutstrings"`
-	TrimSpace         []string          `yaml:"trimspace"`
-	ContextSize       int               `yaml:"context_size"`
-	F16               bool              `yaml:"f16"`
-	NUMA              bool              `yaml:"numa"`
-	Threads           int               `yaml:"threads"`
-	Debug             bool              `yaml:"debug"`
-	Roles             map[string]string `yaml:"roles"`
-	Embeddings        bool              `yaml:"embeddings"`
-	Backend           string            `yaml:"backend"`
-	TemplateConfig    TemplateConfig    `yaml:"template"`
-	MirostatETA       float64           `yaml:"mirostat_eta"`
-	MirostatTAU       float64           `yaml:"mirostat_tau"`
-	Mirostat          int               `yaml:"mirostat"`
-	NGPULayers        int               `yaml:"gpu_layers"`
-	MMap              bool              `yaml:"mmap"`
-	MMlock            bool              `yaml:"mmlock"`
-	LowVRAM           bool              `yaml:"low_vram"`
+	Name              string `yaml:"name"`

-	TensorSplit           string `yaml:"tensor_split"`
-	MainGPU               string `yaml:"main_gpu"`
-	ImageGenerationAssets string `yaml:"asset_dir"`
+	F16            bool              `yaml:"f16"`
+	Threads        int               `yaml:"threads"`
+	Debug          bool              `yaml:"debug"`
+	Roles          map[string]string `yaml:"roles"`
+	Embeddings     bool              `yaml:"embeddings"`
+	Backend        string            `yaml:"backend"`
+	TemplateConfig TemplateConfig    `yaml:"template"`

-	PromptCachePath string `yaml:"prompt_cache_path"`
-	PromptCacheAll  bool   `yaml:"prompt_cache_all"`
-	PromptCacheRO   bool   `yaml:"prompt_cache_ro"`
-
-	Grammar string `yaml:"grammar"`
-
-	PromptStrings, InputStrings                []string
-	InputToken                                 [][]int
-	functionCallString, functionCallNameString string
+	PromptStrings, InputStrings                []string `yaml:"-"`
+	InputToken                                 [][]int  `yaml:"-"`
+	functionCallString, functionCallNameString string   `yaml:"-"`

 	FunctionsConfig Functions `yaml:"function"`

-	SystemPrompt string `yaml:"system_prompt"`
+	FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
+	// LLM configs (GPT4ALL, Llama.cpp, ...)
+	LLMConfig `yaml:",inline"`
+
+	// AutoGPTQ specifics
+	AutoGPTQ AutoGPTQ `yaml:"autogptq"`
+
+	// Diffusers
+	Diffusers Diffusers `yaml:"diffusers"`
+
+	Step int `yaml:"step"`
+
+	// GRPC Options
+	GRPC GRPC `yaml:"grpc"`
+
+	// Vall-e-x
+	VallE VallE `yaml:"vall-e"`
+}
+
+type VallE struct {
+	AudioPath string `yaml:"audio_path"`
+}
+
+type FeatureFlag map[string]*bool
+
+// Enabled reports whether flag s is present, non-nil and set to true.
+func (ff FeatureFlag) Enabled(s string) bool {
+	return ff[s] != nil && *ff[s]
+}
+
+type GRPC struct {
+	Attempts          int `yaml:"attempts"`
+	AttemptsSleepTime int `yaml:"attempts_sleep_time"`
+}
+
+type Diffusers struct {
+	PipelineType     string  `yaml:"pipeline_type"`
+	SchedulerType    string  `yaml:"scheduler_type"`
+	CUDA             bool    `yaml:"cuda"`
+	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
+	CFGScale         float32 `yaml:"cfg_scale"`         // Classifier-Free Guidance Scale
+	IMG2IMG          bool    `yaml:"img2img"`           // Image to Image Diffuser
+	ClipSkip         int     `yaml:"clip_skip"`         // Number of CLIP layers to skip — NOTE(review): presumed from diffusers' clip_skip; confirm
+	ClipModel        string  `yaml:"clip_model"`        // Clip model to use
+	ClipSubFolder    string  `yaml:"clip_subfolder"`    // Subfolder to use for clip model
+}
+
+type LLMConfig struct {
+	SystemPrompt    string   `yaml:"system_prompt"`
+	TensorSplit     string   `yaml:"tensor_split"`
+	MainGPU         string   `yaml:"main_gpu"`
+	RMSNormEps      float32  `yaml:"rms_norm_eps"`
+	NGQA            int32    `yaml:"ngqa"`
+	PromptCachePath string   `yaml:"prompt_cache_path"`
+	PromptCacheAll  bool     `yaml:"prompt_cache_all"`
+	PromptCacheRO   bool     `yaml:"prompt_cache_ro"`
+	MirostatETA     float64  `yaml:"mirostat_eta"`
+	MirostatTAU     float64  `yaml:"mirostat_tau"`
+	Mirostat        int      `yaml:"mirostat"`
+	NGPULayers      int      `yaml:"gpu_layers"`
+	MMap            bool     `yaml:"mmap"`
+	MMlock          bool     `yaml:"mmlock"`
+	LowVRAM         bool     `yaml:"low_vram"`
+	Grammar         string   `yaml:"grammar"`
+	StopWords       []string `yaml:"stopwords"`
+	Cutstrings      []string `yaml:"cutstrings"`
+	TrimSpace       []string `yaml:"trimspace"`
+	ContextSize     int      `yaml:"context_size"`
+	NUMA            bool     `yaml:"numa"`
+	LoraAdapter     string   `yaml:"lora_adapter"`
+	LoraBase        string   `yaml:"lora_base"`
+	NoMulMatQ       bool     `yaml:"no_mulmatq"`
+	DraftModel      string   `yaml:"draft_model"`
+	NDraft          int32    `yaml:"n_draft"`
+	Quantization    string   `yaml:"quantization"`
+}
+
+type AutoGPTQ struct {
+	ModelBaseName    string `yaml:"model_base_name"`
+	Device           string `yaml:"device"`
+	Triton           bool   `yaml:"triton"`
+	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
 }

 type Functions struct {
@@ -172,6 +232,16 @@ func (cm *ConfigLoader) GetConfig(m string) (Config, bool) {
 	return v, exists
 }

+func (cm *ConfigLoader) GetAllConfigs() []Config {
+	cm.Lock()
+	defer cm.Unlock()
+	var all []Config // stays nil when no configs are registered
+	for name := range cm.configs {
+		all = append(all, cm.configs[name])
+	}
+	return all
+}
+
 func (cm *ConfigLoader) ListConfigs() []string {
 	cm.Lock()
 	defer cm.Unlock()
--- a/api/config/prediction.go
+++ b/api/config/prediction.go
@@ -34,4 +34,17 @@ type PredictionOptions struct {

 	TypicalP float64 `json:"typical_p" yaml:"typical_p"`
 	Seed     int     `json:"seed" yaml:"seed"`
+
+	NegativePrompt      string  `json:"negative_prompt" yaml:"negative_prompt"`
+	RopeFreqBase        float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
+	RopeFreqScale       float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
+	NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
+	// AutoGPTQ
+	UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
+
+	// Diffusers
+	ClipSkip int `json:"clip_skip" yaml:"clip_skip"`
+
+	// RWKV (?)
+	Tokenizer string `json:"tokenizer" yaml:"tokenizer"`
 }
--- a/api/localai/backend_monitor.go
+++ b/api/localai/backend_monitor.go
@@ -0,0 +1,163 @@
+package localai
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+
+	gopsutil "github.com/shirou/gopsutil/v3/process"
+)
+
+// BackendMonitorRequest is the request body parsed by the backend monitor and
+// shutdown endpoints; Model names the model whose backend is targeted.
+type BackendMonitorRequest struct {
+	Model string `json:"model" yaml:"model"`
+}
+
+// BackendMonitorResponse carries the process statistics sampled through
+// gopsutil by SampleLocalBackendProcess.
+type BackendMonitorResponse struct {
+	MemoryInfo    *gopsutil.MemoryInfoStat // result of Process.MemoryInfo()
+	MemoryPercent float32                  // result of Process.MemoryPercent()
+	CPUPercent    float64                  // result of Process.CPUPercent()
+}
+
+// BackendMonitor resolves model names to loaded backends and inspects them.
+// It uses the config loader to map a model name to its configured model file
+// and the options' model loader to reach the backend process.
+type BackendMonitor struct {
+	configLoader *config.ConfigLoader
+	options      *options.Option // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
+}
+
+func NewBackendMonitor(configLoader *config.ConfigLoader, options *options.Option) BackendMonitor {
+	return BackendMonitor{
+		configLoader: configLoader,
+		options:      options,
+	}
+}
+
+func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*BackendMonitorResponse, error) {
+	config, exists := bm.configLoader.GetConfig(model)
+	var backend string
+	if exists {
+		backend = config.Model
+	} else {
+		// Last ditch effort: use it raw, see if a backend happens to match.
+		backend = model
+	}
+
+	if !strings.HasSuffix(backend, ".bin") {
+		backend = fmt.Sprintf("%s.bin", backend)
+	}
+
+	pid, err := bm.options.Loader.GetGRPCPID(backend)
+
+	if err != nil {
+		log.Error().Msgf("model %s : failed to find pid %+v", model, err)
+		return nil, err
+	}
+
+	// Name is slightly frightening but this does _not_ create a new process, rather it looks up an existing process by PID.
+	backendProcess, err := gopsutil.NewProcess(int32(pid))
+
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
+		return nil, err
+	}
+
+	memInfo, err := backendProcess.MemoryInfo()
+
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
+		return nil, err
+	}
+
+	memPercent, err := backendProcess.MemoryPercent()
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
+		return nil, err
+	}
+
+	cpuPercent, err := backendProcess.CPUPercent()
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
+		return nil, err
+	}
+
+	return &BackendMonitorResponse{
+		MemoryInfo:    memInfo,
+		MemoryPercent: memPercent,
+		CPUPercent:    cpuPercent,
+	}, nil
+}
+
+// getModelLoaderIDFromCtx parses a BackendMonitorRequest from the request body
+// and returns the identifier under which the model loader knows the backend:
+// the configured model file when a config exists for the requested model, the
+// raw model name otherwise, always ending in ".bin".
+func (bm BackendMonitor) getModelLoaderIDFromCtx(c *fiber.Ctx) (string, error) {
+	var req BackendMonitorRequest
+	// Read the request payload
+	if err := c.BodyParser(&req); err != nil {
+		return "", err
+	}
+
+	// Fall back to the raw name as a last-ditch effort, in case a backend
+	// happens to match it.
+	id := req.Model
+	if cfg, found := bm.configLoader.GetConfig(req.Model); found {
+		id = cfg.Model
+	}
+
+	if strings.HasSuffix(id, ".bin") {
+		return id, nil
+	}
+	return id + ".bin", nil
+}
+
+// BackendMonitorEndpoint returns a handler that reports the status of the
+// backend serving the model named in the request body.
+//
+// The handler asks the loaded backend for its status over gRPC. When that call
+// fails, it falls back to sampling the local backend process and replies with
+// an ERROR state whose memory total is the process VMS and whose breakdown
+// carries the RSS under "gopsutil-RSS". An error is returned when the backend
+// is not loaded, or when both the RPC and the local sample fail.
+func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
+	handler := func(c *fiber.Ctx) error {
+		backendId, err := bm.getModelLoaderIDFromCtx(c)
+		if err != nil {
+			return err
+		}
+
+		client := bm.options.Loader.CheckIsLoaded(backendId)
+		if client == nil {
+			return fmt.Errorf("backend %s is not currently loaded", backendId)
+		}
+
+		status, rpcErr := client.Status(context.TODO())
+		if rpcErr == nil {
+			return c.JSON(status)
+		}
+
+		// The RPC failed: try to describe the backend from its OS process instead.
+		log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
+		sample, slbErr := bm.SampleLocalBackendProcess(backendId)
+		if slbErr != nil {
+			return fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
+		}
+
+		breakdown := map[string]uint64{
+			"gopsutil-RSS": sample.MemoryInfo.RSS,
+		}
+		return c.JSON(proto.StatusResponse{
+			State: proto.StatusResponse_ERROR,
+			Memory: &proto.MemoryUsageData{
+				Total:     sample.MemoryInfo.VMS,
+				Breakdown: breakdown,
+			},
+		})
+	}
+	return handler
+}
+
+// BackendShutdownEndpoint returns a handler that resolves the backend for the
+// model named in the request body and asks the model loader to shut it down.
+// The handler's result is the parse error, if any, or whatever ShutdownModel
+// returns.
+func BackendShutdownEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		id, err := bm.getModelLoaderIDFromCtx(c)
+		if err == nil {
+			err = bm.options.Loader.ShutdownModel(id)
+		}
+		return err
+	}
+}
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -4,10 +4,12 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"slices"
 	"strings"
 	"sync"

 	json "github.com/json-iterator/go"
+	"gopkg.in/yaml.v3"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
@@ -26,6 +28,7 @@ type galleryOp struct {
 }

 type galleryOpStatus struct {
+	FileName           string  `json:"file_name"`
 	Error              error   `json:"error"`
 	Processed          bool    `json:"processed"`
 	Message            string  `json:"message"`
@@ -49,7 +52,6 @@ func NewGalleryService(modelPath string) *galleryApplier {
 	}
 }

-// prepareModel applies a
 func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {

 	config, err := gallery.GetGalleryConfigFromURL(req.URL)
@@ -75,6 +77,13 @@ func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
 	return g.statuses[s]
 }

+// getAllStatus returns a snapshot of every known operation status, keyed by
+// operation ID.
+//
+// The returned map is a shallow copy built while holding the lock. Handing out
+// the internal map would let callers (e.g. a JSON encoder) iterate it after
+// the lock is released, racing with concurrent status updates. The
+// *galleryOpStatus values themselves are shared with the applier, not copied.
+func (g *galleryApplier) getAllStatus() map[string]*galleryOpStatus {
+	g.Lock()
+	defer g.Unlock()
+
+	// Keep a nil map nil so it serializes the same way as before.
+	if g.statuses == nil {
+		return nil
+	}
+
+	snapshot := make(map[string]*galleryOpStatus, len(g.statuses))
+	for id, status := range g.statuses {
+		snapshot[id] = status
+	}
+	return snapshot
+}
+
 func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
 	go func() {
 		for {
@@ -93,7 +102,7 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {

 				// displayDownload displays the download progress
 				progressCallback := func(fileName string, current string, total string, percentage float64) {
-					g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
+					g.updateStatus(op.id, &galleryOpStatus{Message: "processing", FileName: fileName, Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
 					utils.DisplayDownloadFunction(fileName, current, total, percentage)
 				}

@@ -128,8 +137,27 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
 }

 type galleryModel struct {
-	gallery.GalleryModel
-	ID string `json:"id"`
+	gallery.GalleryModel `yaml:",inline"` // https://github.com/go-yaml/yaml/issues/63
+	ID                   string           `json:"id"`
+}
+
+// processRequests installs every requested model and reports the first failure.
+//
+// For each request:
+//   - an empty ID means the model is prepared directly from the request via prepareModel;
+//   - an ID containing "@" is installed with gallery.InstallModelFromGallery;
+//   - any other ID is installed with gallery.InstallModelFromGalleryByName.
+//
+// All requests are attempted even when one fails. The first error encountered
+// is returned, so an early failure is no longer masked by a later success.
+// The s parameter is not used by the body.
+func processRequests(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery, requests []galleryModel) error {
+	var firstErr error
+	for _, r := range requests {
+		utils.ResetDownloadTimers()
+
+		var err error
+		switch {
+		case r.ID == "":
+			err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
+		case strings.Contains(r.ID, "@"):
+			err = gallery.InstallModelFromGallery(
+				galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
+		default:
+			err = gallery.InstallModelFromGalleryByName(
+				galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
+		}
+
+		// Remember the earliest failure but keep going with the remaining requests.
+		if err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
 }

 func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
@@ -137,7 +165,13 @@ func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, gallerie
 	if err != nil {
 		return err
 	}
-	return ApplyGalleryFromString(modelPath, string(dat), cm, galleries)
+	var requests []galleryModel
+
+	if err := yaml.Unmarshal(dat, &requests); err != nil {
+		return err
+	}
+
+	return processRequests(modelPath, s, cm, galleries, requests)
 }

 func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
@@ -147,30 +181,15 @@ func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galler
 		return err
 	}

-	for _, r := range requests {
-		utils.ResetDownloadTimers()
-		if r.ID == "" {
-			err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
-		} else {
-			err = gallery.InstallModelFromGallery(galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
-		}
-	}
-
-	return err
+	return processRequests(modelPath, s, cm, galleries, requests)
 }

-/// Endpoints
+/// Endpoint Service

-func GetOpStatusEndpoint(g *galleryApplier) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-
-		status := g.getStatus(c.Params("uuid"))
-		if status == nil {
-			return fmt.Errorf("could not find any status for ID")
-		}
-
-		return c.JSON(status)
-	}
+// ModelGalleryService bundles the state shared by the model gallery HTTP
+// endpoints.
+type ModelGalleryService struct {
+	galleries      []gallery.Gallery // registered galleries; modified by the add/remove gallery endpoints
+	modelPath      string            // passed to gallery.AvailableGalleryModels when listing models
+	galleryApplier *galleryApplier   // receives install operations and holds their statuses
 }

 type GalleryModel struct {
@@ -178,7 +197,31 @@ type GalleryModel struct {
 	gallery.GalleryModel
 }

-func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan galleryOp, galleries []gallery.Gallery) func(c *fiber.Ctx) error {
+func CreateModelGalleryService(galleries []gallery.Gallery, modelPath string, galleryApplier *galleryApplier) ModelGalleryService {
+	return ModelGalleryService{
+		galleries:      galleries,
+		modelPath:      modelPath,
+		galleryApplier: galleryApplier,
+	}
+}
+
+// GetOpStatusEndpoint returns a handler that replies with the JSON status of
+// the gallery operation identified by the "uuid" route parameter, or an error
+// when no status is recorded for it.
+func (mgs *ModelGalleryService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		if status := mgs.galleryApplier.getStatus(c.Params("uuid")); status != nil {
+			return c.JSON(status)
+		}
+		return fmt.Errorf("could not find any status for ID")
+	}
+}
+
+// GetAllStatusEndpoint returns a handler that replies with the JSON encoding
+// of every gallery operation status known to the applier.
+func (mgs *ModelGalleryService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		statuses := mgs.galleryApplier.getAllStatus()
+		return c.JSON(statuses)
+	}
+}
+
+func (mgs *ModelGalleryService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(GalleryModel)
 		// Get input data from the request body
@@ -190,11 +233,11 @@ func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan
 		if err != nil {
 			return err
 		}
-		g <- galleryOp{
+		mgs.galleryApplier.C <- galleryOp{
 			req:         input.GalleryModel,
 			id:          uuid.String(),
 			galleryName: input.ID,
-			galleries:   galleries,
+			galleries:   mgs.galleries,
 		}
 		return c.JSON(struct {
 			ID        string `json:"uuid"`
@@ -203,11 +246,11 @@ func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan
 	}
 }

-func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string) func(c *fiber.Ctx) error {
+func (mgs *ModelGalleryService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		log.Debug().Msgf("Listing models from galleries: %+v", galleries)
+		log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)

-		models, err := gallery.AvailableGalleryModels(galleries, basePath)
+		models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
 		if err != nil {
 			return err
 		}
@@ -222,3 +265,56 @@ func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string)
 		return c.Send(dat)
 	}
 }
+
+// ListModelGalleriesEndpoint returns a handler that replies with the JSON list
+// of registered model galleries.
+//
+// NOTE: unlike ListModelFromGalleryEndpoint, this only lists the galleries
+// themselves, not the models they contain.
+func (mgs *ModelGalleryService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
+
+		payload, err := json.Marshal(mgs.galleries)
+		if err == nil {
+			return c.Send(payload)
+		}
+		return err
+	}
+}
+
+// AddModelGalleryEndpoint returns a handler that registers the gallery parsed
+// from the request body and replies with the JSON-encoded gallery list.
+//
+// A gallery whose Name is already registered is rejected with an error. The
+// reply is serialized from the list that already includes the new entry
+// (previously it was serialized before the append and so omitted it). The
+// service's list is only replaced once serialization has succeeded.
+func (mgs *ModelGalleryService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(gallery.Gallery)
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+		if slices.ContainsFunc(mgs.galleries, func(known gallery.Gallery) bool {
+			return known.Name == input.Name
+		}) {
+			return fmt.Errorf("%s already exists", input.Name)
+		}
+
+		// Build the candidate list first so the response reflects the addition,
+		// and commit it only if it can be serialized.
+		updated := append(mgs.galleries, *input)
+		dat, err := json.Marshal(updated)
+		if err != nil {
+			return err
+		}
+		log.Debug().Msgf("Adding %+v to gallery list", *input)
+		mgs.galleries = updated
+		return c.Send(dat)
+	}
+}
+
+func (mgs *ModelGalleryService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(gallery.Gallery)
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+		if !slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
+			return gallery.Name == input.Name
+		}) {
+			return fmt.Errorf("%s is not currently registered", input.Name)
+		}
+		mgs.galleries = slices.DeleteFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
+			return gallery.Name == input.Name
+		})
+		return c.Send(nil)
+	}
+}
--- a/api/localai/localai.go
+++ b/api/localai/localai.go
@@ -9,8 +9,9 @@ import (
 )

 type TTSRequest struct {
-	Model string `json:"model" yaml:"model"`
-	Input string `json:"input" yaml:"input"`
+	Model   string `json:"model" yaml:"model"`
+	Input   string `json:"input" yaml:"input"`
+	Backend string `json:"backend" yaml:"backend"`
 }

 func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
@@ -22,7 +23,7 @@ func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			return err
 		}

-		filePath, _, err := backend.ModelTTS(input.Input, input.Model, o.Loader, o)
+		filePath, _, err := backend.ModelTTS(input.Backend, input.Input, input.Model, o.Loader, o)
 		if err != nil {
 			return err
 		}
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -6,33 +6,48 @@ import (
 	"encoding/json"
 	"fmt"
 	"strings"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	"github.com/go-skynet/LocalAI/pkg/grammar"
 	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )

 func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	emptyMessage := ""
+	id := uuid.New().String()
+	created := int(time.Now().Unix())

-	process := func(s string, req *OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
-		initialMessage := OpenAIResponse{
+	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+		initialMessage := schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: []Choice{{Delta: &Message{Role: "assistant", Content: &emptyMessage}}},
+			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
 			Object:  "chat.completion.chunk",
 		}
 		responses <- initialMessage

-		ComputeChoices(s, req.N, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
-			resp := OpenAIResponse{
+		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+			resp := schema.OpenAIResponse{
+				ID:      id,
+				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []Choice{{Delta: &Message{Content: &s}, Index: 0}},
+				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
 				Object:  "chat.completion.chunk",
+				Usage: schema.OpenAIUsage{
+					PromptTokens:     usage.Prompt,
+					CompletionTokens: usage.Completion,
+					TotalTokens:      usage.Prompt + usage.Completion,
+				},
 			}

 			responses <- resp
@@ -43,7 +58,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 	return func(c *fiber.Ctx) error {
 		processFunctions := false
 		funcs := grammar.Functions{}
-		modelFile, input, err := readInput(c, o.Loader, true)
+		modelFile, input, err := readInput(c, o, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -109,6 +124,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		var predInput string

+		suppressConfigSystemPrompt := false
 		mess := []string{}
 		for messageIndex, i := range input.Messages {
 			var content string
@@ -146,7 +162,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					content = templatedChatMessage
 				}
 			}
-			// If this model doesn't have such a template, or if
+			// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
 			if content == "" {
 				if r != "" {
 					if contentExists {
@@ -177,6 +193,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 						}
 					}
 				}
+				// Special Handling: System. We care if it was printed at all, not the r branch, so check separately
+				if contentExists && role == "system" {
+					suppressConfigSystemPrompt = true
+				}
 			}

 			mess = append(mess, content)
@@ -207,8 +227,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 		templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
-			Input:     predInput,
-			Functions: funcs,
+			SystemPrompt:         config.SystemPrompt,
+			SuppressSystemPrompt: suppressConfigSystemPrompt,
+			Input:                predInput,
+			Functions:            funcs,
 		})
 		if err == nil {
 			predInput = templatedInput
@@ -223,31 +245,41 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		}

 		if toStream {
-			responses := make(chan OpenAIResponse)
+			responses := make(chan schema.OpenAIResponse)

 			go process(predInput, input, config, o.Loader, responses)

 			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {

+				usage := &schema.OpenAIUsage{}
+
 				for ev := range responses {
+					usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
 					var buf bytes.Buffer
 					enc := json.NewEncoder(&buf)
 					enc.Encode(ev)
-
 					log.Debug().Msgf("Sending chunk: %s", buf.String())
-					fmt.Fprintf(w, "data: %v\n", buf.String())
+					_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
+					if err != nil {
+						log.Debug().Msgf("Sending chunk failed: %v", err)
+						input.Cancel()
+						break
+					}
 					w.Flush()
 				}

-				resp := &OpenAIResponse{
-					Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []Choice{
+				resp := &schema.OpenAIResponse{
+					ID:      id,
+					Created: created,
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					Choices: []schema.Choice{
 						{
 							FinishReason: "stop",
 							Index:        0,
-							Delta:        &Message{Content: &emptyMessage},
+							Delta:        &schema.Message{Content: &emptyMessage},
 						}},
 					Object: "chat.completion.chunk",
+					Usage:  *usage,
 				}
 				respData, _ := json.Marshal(resp)

@@ -258,10 +290,12 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			return nil
 		}

-		result, err := ComputeChoices(predInput, input.N, config, o, o.Loader, func(s string, c *[]Choice) {
+		result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) {
 			if processFunctions {
 				// As we have to change the result before processing, we can't stream the answer (yet?)
 				ss := map[string]interface{}{}
+				// This prevents newlines from breaking JSON parsing for clients
+				s = utils.EscapeNewLines(s)
 				json.Unmarshal([]byte(s), &ss)
 				log.Debug().Msgf("Function return: %s %+v", s, ss)

@@ -290,7 +324,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 								message = backend.Finetune(*config, predInput, message)
 								log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)

-								*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: &message}})
+								*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}})
 								return
 							}
 						}
@@ -300,7 +334,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					// Otherwise ask the LLM to understand the JSON output and the context, and return a message
 					// Note: This costs (in term of CPU) another computation
 					config.Grammar = ""
-					predFunc, err := backend.ModelInference(predInput, o.Loader, *config, o, nil)
+					predFunc, err := backend.ModelInference(input.Context, predInput, o.Loader, *config, o, nil)
 					if err != nil {
 						log.Error().Msgf("inference error: %s", err.Error())
 						return
@@ -312,28 +346,35 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 						return
 					}

-					prediction = backend.Finetune(*config, predInput, prediction)
-					*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: &prediction}})
+					fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response)
+					*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}})
 				} else {
 					// otherwise reply with the function call
-					*c = append(*c, Choice{
+					*c = append(*c, schema.Choice{
 						FinishReason: "function_call",
-						Message:      &Message{Role: "assistant", FunctionCall: ss},
+						Message:      &schema.Message{Role: "assistant", FunctionCall: ss},
 					})
 				}

 				return
 			}
-			*c = append(*c, Choice{FinishReason: "stop", Index: 0, Message: &Message{Role: "assistant", Content: &s}})
+			*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
 		}, nil)
 		if err != nil {
 			return err
 		}

-		resp := &OpenAIResponse{
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "chat.completion",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     tokenUsage.Prompt,
+				CompletionTokens: tokenUsage.Completion,
+				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
+			},
 		}
 		respData, _ := json.Marshal(resp)
 		log.Debug().Msgf("Response: %s", respData)
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -6,28 +6,42 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"time"

+	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )

 // https://platform.openai.com/docs/api-reference/completions
 func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
-	process := func(s string, req *OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
-		ComputeChoices(s, req.N, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
-			resp := OpenAIResponse{
-				Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []Choice{
+	id := uuid.New().String()
+	created := int(time.Now().Unix())
+
+	process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+			resp := schema.OpenAIResponse{
+				ID:      id,
+				Created: created,
+				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
+				Choices: []schema.Choice{
 					{
 						Index: 0,
 						Text:  s,
 					},
 				},
 				Object: "text_completion",
+				Usage: schema.OpenAIUsage{
+					PromptTokens:     usage.Prompt,
+					CompletionTokens: usage.Completion,
+					TotalTokens:      usage.Prompt + usage.Completion,
+				},
 			}
 			log.Debug().Msgf("Sending goroutine: %s", s)

@@ -38,7 +52,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 	}

 	return func(c *fiber.Ctx) error {
-		modelFile, input, err := readInput(c, o.Loader, true)
+		modelFile, input, err := readInput(c, o, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -84,7 +98,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 				log.Debug().Msgf("Template found, input modified to: %s", predInput)
 			}

-			responses := make(chan OpenAIResponse)
+			responses := make(chan schema.OpenAIResponse)

 			go process(predInput, input, config, o.Loader, responses)

@@ -100,9 +114,11 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 					w.Flush()
 				}

-				resp := &OpenAIResponse{
-					Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []Choice{
+				resp := &schema.OpenAIResponse{
+					ID:      id,
+					Created: created,
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					Choices: []schema.Choice{
 						{
 							Index:        0,
 							FinishReason: "stop",
@@ -119,31 +135,46 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			return nil
 		}

-		var result []Choice
+		var result []schema.Choice
+
+		totalTokenUsage := backend.TokenUsage{}
+
 		for k, i := range config.PromptStrings {
 			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
-				Input: i,
+				SystemPrompt: config.SystemPrompt,
+				Input:        i,
 			})
 			if err == nil {
 				i = templatedInput
 				log.Debug().Msgf("Template found, input modified to: %s", i)
 			}

-			r, err := ComputeChoices(i, input.N, config, o, o.Loader, func(s string, c *[]Choice) {
-				*c = append(*c, Choice{Text: s, FinishReason: "stop", Index: k})
-			}, nil)
+			r, tokenUsage, err := ComputeChoices(
+				input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+					*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
+				}, nil)
 			if err != nil {
 				return err
 			}

+			totalTokenUsage.Prompt += tokenUsage.Prompt
+			totalTokenUsage.Completion += tokenUsage.Completion
+
 			result = append(result, r...)
 		}

-		resp := &OpenAIResponse{
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "text_completion",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     totalTokenUsage.Prompt,
+				CompletionTokens: totalTokenUsage.Completion,
+				TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
+			},
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -3,17 +3,22 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
+	"time"

+	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
+
 	"github.com/rs/zerolog/log"
 )

 func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		modelFile, input, err := readInput(c, o.Loader, true)
+		modelFile, input, err := readInput(c, o, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -31,32 +36,47 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			templateFile = config.TemplateConfig.Edit
 		}

-		var result []Choice
+		var result []schema.Choice
+		totalTokenUsage := backend.TokenUsage{}
+
 		for _, i := range config.InputStrings {
 			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
-				Input:       i,
-				Instruction: input.Instruction,
+				Input:        i,
+				Instruction:  input.Instruction,
+				SystemPrompt: config.SystemPrompt,
 			})
 			if err == nil {
 				i = templatedInput
 				log.Debug().Msgf("Template found, input modified to: %s", i)
 			}

-			r, err := ComputeChoices(i, input.N, config, o, o.Loader, func(s string, c *[]Choice) {
-				*c = append(*c, Choice{Text: s})
+			r, tokenUsage, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+				*c = append(*c, schema.Choice{Text: s})
 			}, nil)
 			if err != nil {
 				return err
 			}

+			totalTokenUsage.Prompt += tokenUsage.Prompt
+			totalTokenUsage.Completion += tokenUsage.Completion
+
 			result = append(result, r...)
 		}

-		resp := &OpenAIResponse{
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
 			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: result,
 			Object:  "edit",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     totalTokenUsage.Prompt,
+				CompletionTokens: totalTokenUsage.Completion,
+				TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
+			},
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/embeddings.go
+++ b/api/openai/embeddings.go
@@ -3,9 +3,13 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
+	"time"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/google/uuid"
+
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
@@ -14,7 +18,7 @@ import (
 // https://platform.openai.com/docs/api-reference/embeddings
 func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		model, input, err := readInput(c, o.Loader, true)
+		model, input, err := readInput(c, o, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -25,7 +29,7 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		}

 		log.Debug().Msgf("Parameter Config: %+v", config)
-		items := []Item{}
+		items := []schema.Item{}

 		for i, s := range config.InputToken {
 			// get the model function to call for the result
@@ -38,7 +42,7 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			if err != nil {
 				return err
 			}
-			items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
+			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
 		}

 		for i, s := range config.InputStrings {
@@ -52,13 +56,17 @@ func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 			if err != nil {
 				return err
 			}
-			items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
+			items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
 		}

-		resp := &OpenAIResponse{
-			Model:  input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Data:   items,
-			Object: "list",
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Data:    items,
+			Object:  "list",
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -1,14 +1,18 @@
 package openai

 import (
+	"bufio"
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
-	"io/ioutil"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"
+
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/google/uuid"

 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
@@ -35,7 +39,7 @@ import (
 */
 func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		m, input, err := readInput(c, o.Loader, false)
+		m, input, err := readInput(c, o, false)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -50,6 +54,31 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}

+		src := ""
+		if input.File != "" {
+			// Base64-decode the source image sent in the request and stage it
+			// in a temporary file that is removed when the handler returns.
+			decoded, err := base64.StdEncoding.DecodeString(input.File)
+			if err != nil {
+				return err
+			}
+			// Create a temporary file
+			outputFile, err := os.CreateTemp(o.ImageDir, "b64")
+			if err != nil {
+				return err
+			}
+			src = outputFile.Name()
+			defer os.RemoveAll(src) // registered before writing so a failed write does not leak the file
+			// Write then Flush: bufio errors are sticky, and unflushed bytes never reach the file.
+			writer := bufio.NewWriter(outputFile)
+			_, _ = writer.Write(decoded)
+			if err = writer.Flush(); err != nil {
+				outputFile.Close()
+				return err
+			}
+			outputFile.Close()
+		}
+
 		log.Debug().Msgf("Parameter Config: %+v", config)

 		// XXX: Only stablediffusion is supported for now
@@ -74,8 +103,8 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 		if input.ResponseFormat == "b64_json" {
 			b64JSON = true
 		}
-
-		var result []Item
+		// src and clip_skip
+		var result []schema.Item
 		for _, i := range config.PromptStrings {
 			n := input.N
 			if input.N == 0 {
@@ -90,7 +119,10 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 				}

 				mode := 0
-				step := 15
+				step := config.Step
+				if step == 0 {
+					step = 15
+				}

 				if input.Mode != 0 {
 					mode = input.Mode
@@ -105,7 +137,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 					tempDir = o.ImageDir
 				}
 				// Create a temporary file
-				outputFile, err := ioutil.TempFile(tempDir, "b64")
+				outputFile, err := os.CreateTemp(tempDir, "b64")
 				if err != nil {
 					return err
 				}
@@ -119,7 +151,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx

 				baseURL := c.BaseURL()

-				fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, output, o.Loader, *config, o)
+				fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, src, output, o.Loader, *config, o)
 				if err != nil {
 					return err
 				}
@@ -127,7 +159,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 					return err
 				}

-				item := &Item{}
+				item := &schema.Item{}

 				if b64JSON {
 					defer os.RemoveAll(output)
@@ -145,8 +177,12 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 			}
 		}

-		resp := &OpenAIResponse{
-			Data: result,
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Data:    result,
 		}

 		jsonResult, _ := json.Marshal(resp)
--- a/api/openai/inference.go
+++ b/api/openai/inference.go
@@ -4,33 +4,47 @@ import (
 	"github.com/go-skynet/LocalAI/api/backend"
 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ComputeChoices(predInput string, n int, config *config.Config, o *options.Option, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
-	result := []Choice{}
+func ComputeChoices(
+	req *schema.OpenAIRequest,
+	predInput string,
+	config *config.Config,
+	o *options.Option,
+	loader *model.ModelLoader,
+	cb func(string, *[]schema.Choice),
+	tokenCallback func(string, backend.TokenUsage) bool) ([]schema.Choice, backend.TokenUsage, error) {
+	n := req.N // number of completions to return
+	result := []schema.Choice{}

 	if n == 0 {
 		n = 1
 	}

 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(predInput, loader, *config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, loader, *config, o, tokenCallback)
 	if err != nil {
-		return result, err
+		return result, backend.TokenUsage{}, err
 	}

+	tokenUsage := backend.TokenUsage{}
+
 	for i := 0; i < n; i++ {
 		prediction, err := predFunc()
 		if err != nil {
-			return result, err
+			return result, backend.TokenUsage{}, err
 		}

-		prediction = backend.Finetune(*config, predInput, prediction)
-		cb(prediction, &result)
+		tokenUsage.Prompt += prediction.Usage.Prompt
+		tokenUsage.Completion += prediction.Usage.Completion
+
+		finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
+		cb(finetunedResponse, &result)

 		//result = append(result, Choice{Text: prediction})

 	}
-	return result, err
+	return result, tokenUsage, err
 }
--- a/api/openai/list.go
+++ b/api/openai/list.go
@@ -1,7 +1,10 @@
 package openai

 import (
+	"regexp"
+
 	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 )
@@ -14,21 +17,50 @@ func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func
 		}
 		var mm map[string]interface{} = map[string]interface{}{}

-		dataModels := []OpenAIModel{}
-		for _, m := range models {
-			mm[m] = nil
-			dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
+		dataModels := []schema.OpenAIModel{}
+
+		var filterFn func(name string) bool
+		filter := c.Query("filter")
+
+		// If filter is not specified, do not filter the list by model name
+		if filter == "" {
+			filterFn = func(_ string) bool { return true }
+		} else {
+			// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
+			rxp, err := regexp.Compile(filter)
+			if err != nil {
+				return err
+			}
+			filterFn = func(name string) bool {
+				return rxp.MatchString(name)
+			}
 		}

-		for _, k := range cm.ListConfigs() {
-			if _, exists := mm[k]; !exists {
-				dataModels = append(dataModels, OpenAIModel{ID: k, Object: "model"})
+		// By default, exclude any loose files that are already referenced by a configuration file.
+		excludeConfigured := c.QueryBool("excludeConfigured", true)
+
+		// Start with the known configurations
+		for _, c := range cm.GetAllConfigs() {
+			if excludeConfigured {
+				mm[c.Model] = nil
+			}
+
+			if filterFn(c.Name) {
+				dataModels = append(dataModels, schema.OpenAIModel{ID: c.Name, Object: "model"})
+			}
+		}
+
+		// Then iterate through the loose files:
+		for _, m := range models {
+			// And only add them if they shouldn't be skipped.
+			if _, exists := mm[m]; !exists && filterFn(m) {
+				dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
 			}
 		}

 		return c.JSON(struct {
-			Object string        `json:"object"`
-			Data   []OpenAIModel `json:"data"`
+			Object string               `json:"object"`
+			Data   []schema.OpenAIModel `json:"data"`
 		}{
 			Object: "list",
 			Data:   dataModels,
--- a/api/openai/request.go
+++ b/api/openai/request.go
@@ -1,6 +1,7 @@
 package openai

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"os"
@@ -8,13 +9,19 @@ import (
 	"strings"

 	config "github.com/go-skynet/LocalAI/api/config"
+	options "github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
 )

-func readInput(c *fiber.Ctx, loader *model.ModelLoader, randomModel bool) (string, *OpenAIRequest, error) {
-	input := new(OpenAIRequest)
+func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *schema.OpenAIRequest, error) {
+	loader := o.Loader
+	input := new(schema.OpenAIRequest)
+	ctx, cancel := context.WithCancel(o.Context)
+	input.Context = ctx
+	input.Cancel = cancel
 	// Get input data from the request body
 	if err := c.BodyParser(input); err != nil {
 		return "", nil, err
@@ -54,7 +61,7 @@ func readInput(c *fiber.Ctx, loader *model.ModelLoader, randomModel bool) (strin
 	return modelFile, input, nil
 }

-func updateConfig(config *config.Config, input *OpenAIRequest) {
+func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
 	if input.Echo {
 		config.Echo = input.Echo
 	}
@@ -65,6 +72,38 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 		config.TopP = input.TopP
 	}

+	if input.Backend != "" {
+		config.Backend = input.Backend
+	}
+
+	if input.ClipSkip != 0 {
+		config.Diffusers.ClipSkip = input.ClipSkip
+	}
+
+	if input.ModelBaseName != "" {
+		config.AutoGPTQ.ModelBaseName = input.ModelBaseName
+	}
+
+	if input.NegativePromptScale != 0 {
+		config.NegativePromptScale = input.NegativePromptScale
+	}
+
+	if input.UseFastTokenizer {
+		config.UseFastTokenizer = input.UseFastTokenizer
+	}
+
+	if input.NegativePrompt != "" {
+		config.NegativePrompt = input.NegativePrompt
+	}
+
+	if input.RopeFreqBase != 0 {
+		config.RopeFreqBase = input.RopeFreqBase
+	}
+
+	if input.RopeFreqScale != 0 {
+		config.RopeFreqScale = input.RopeFreqScale
+	}
+
 	if input.Grammar != "" {
 		config.Grammar = input.Grammar
 	}
@@ -115,15 +154,15 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 	}

 	if input.Mirostat != 0 {
-		config.Mirostat = input.Mirostat
+		config.LLMConfig.Mirostat = input.Mirostat
 	}

 	if input.MirostatETA != 0 {
-		config.MirostatETA = input.MirostatETA
+		config.LLMConfig.MirostatETA = input.MirostatETA
 	}

 	if input.MirostatTAU != 0 {
-		config.MirostatTAU = input.MirostatTAU
+		config.LLMConfig.MirostatTAU = input.MirostatTAU
 	}

 	if input.TypicalP != 0 {
@@ -161,7 +200,7 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 		n, exists := fnc["name"]
 		if exists {
 			nn, e := n.(string)
-			if !e {
+			if e {
 				name = nn
 			}
 		}
@@ -180,7 +219,7 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 	}
 }

-func readConfig(modelFile string, input *OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *OpenAIRequest, error) {
+func readConfig(modelFile string, input *schema.OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *schema.OpenAIRequest, error) {
 	// Load a config file if present after the model name
 	modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")

--- a/api/openai/transcription.go
+++ b/api/openai/transcription.go
@@ -19,7 +19,7 @@ import (
 // https://platform.openai.com/docs/api-reference/audio/create
 func TranscriptEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		m, input, err := readInput(c, o.Loader, false)
+		m, input, err := readInput(c, o, false)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
--- a/api/options/options.go
+++ b/api/options/options.go
@@ -23,6 +23,7 @@ type Option struct {
 	PreloadJSONModels                   string
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
+	ApiKeys                             []string

 	Galleries []gallery.Gallery

@@ -32,6 +33,8 @@ type Option struct {
 	ExternalGRPCBackends map[string]string

 	AutoloadGalleries bool
+
+	SingleBackend bool
 }

 type AppOption func(*Option)
@@ -57,6 +60,10 @@ func WithCors(b bool) AppOption {
 	}
 }

+var EnableSingleBackend = func(opt *Option) {
+	opt.SingleBackend = true // option function: switches the SingleBackend flag on
+}
+
 var EnableGalleriesAutoload = func(o *Option) {
 	o.AutoloadGalleries = true
 }
@@ -92,6 +99,7 @@ func WithStringGalleries(galls string) AppOption {
 	return func(o *Option) {
 		if galls == "" {
 			log.Debug().Msgf("no galleries to load")
+			o.Galleries = []gallery.Gallery{}
 			return
 		}
 		var galleries []gallery.Gallery
@@ -184,3 +192,9 @@ func WithImageDir(imageDir string) AppOption {
 		o.ImageDir = imageDir
 	}
 }
+
+func WithApiKeys(apiKeys []string) AppOption {
+	return func(opt *Option) { // option function: records the given keys on the Option
+		opt.ApiKeys = apiKeys
+	}
+}
--- a/api/schema/openai.go
+++ b/api/schema/openai.go
@@ -1,6 +1,8 @@
-package openai
+package schema

 import (
+	"context"
+
 	config "github.com/go-skynet/LocalAI/api/config"

 	"github.com/go-skynet/LocalAI/pkg/grammar"
@@ -70,6 +72,9 @@ type OpenAIModel struct {
 type OpenAIRequest struct {
 	config.PredictionOptions

+	Context context.Context
+	Cancel  context.CancelFunc
+
 	// whisper
 	File string `json:"file" validate:"required"`
 	//whisper/image
@@ -102,4 +107,9 @@ type OpenAIRequest struct {
 	Grammar string `json:"grammar" yaml:"grammar"`

 	JSONFunctionGrammarObject *grammar.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"`
+
+	Backend string `json:"backend" yaml:"backend"`
+
+	// AutoGPTQ
+	ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
 }
--- a/pkg/grpc/whisper/api/api.go
+++ b/pkg/grpc/whisper/api/api.go
@@ -1,4 +1,4 @@
-package api
+package schema

 import "time"

--- a/cmd/grpc/bert-embeddings/main.go
+++ b/cmd/grpc/bert-embeddings/main.go
@@ -5,8 +5,8 @@ package main
 import (
 	"flag"

+	bert "github.com/go-skynet/LocalAI/pkg/backend/llm/bert"
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-	bert "github.com/go-skynet/LocalAI/pkg/grpc/llm/bert"
 )

 var (
--- a/cmd/grpc/bloomz/main.go
+++ b/cmd/grpc/bloomz/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	bloomz "github.com/go-skynet/LocalAI/pkg/grpc/llm/bloomz"
+	bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/dolly/main.go
+++ b/cmd/grpc/dolly/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/falcon-ggml/main.go
+++ b/cmd/grpc/falcon-ggml/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/falcon/main.go
+++ b/cmd/grpc/falcon/main.go
@@ -7,7 +7,7 @@ package main
 import (
 	"flag"

-	falcon "github.com/go-skynet/LocalAI/pkg/grpc/llm/falcon"
+	falcon "github.com/go-skynet/LocalAI/pkg/backend/llm/falcon"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/gpt2/main.go
+++ b/cmd/grpc/gpt2/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/gpt4all/main.go
+++ b/cmd/grpc/gpt4all/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	gpt4all "github.com/go-skynet/LocalAI/pkg/grpc/llm/gpt4all"
+	gpt4all "github.com/go-skynet/LocalAI/pkg/backend/llm/gpt4all"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/gptj/main.go
+++ b/cmd/grpc/gptj/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/gptneox/main.go
+++ b/cmd/grpc/gptneox/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/langchain-huggingface/main.go
+++ b/cmd/grpc/langchain-huggingface/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	langchain "github.com/go-skynet/LocalAI/pkg/grpc/llm/langchain"
+	langchain "github.com/go-skynet/LocalAI/pkg/backend/llm/langchain"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/llama-grammar/main.go
+++ b/cmd/grpc/llama-grammar/main.go
@@ -1,13 +1,9 @@
 package main

-// GRPC Falcon server
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
 import (
 	"flag"

-	llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama-grammar"
+	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/llama/main.go
+++ b/cmd/grpc/llama/main.go
@@ -7,7 +7,7 @@ package main
 import (
 	"flag"

-	llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama"
+	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/mpt/main.go
+++ b/cmd/grpc/mpt/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/piper/main.go
+++ b/cmd/grpc/piper/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	tts "github.com/go-skynet/LocalAI/pkg/grpc/tts"
+	tts "github.com/go-skynet/LocalAI/pkg/backend/tts"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/replit/main.go
+++ b/cmd/grpc/replit/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/rwkv/main.go
+++ b/cmd/grpc/rwkv/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	rwkv "github.com/go-skynet/LocalAI/pkg/grpc/llm/rwkv"
+	rwkv "github.com/go-skynet/LocalAI/pkg/backend/llm/rwkv"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/stablediffusion/main.go
+++ b/cmd/grpc/stablediffusion/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	image "github.com/go-skynet/LocalAI/pkg/grpc/image"
+	image "github.com/go-skynet/LocalAI/pkg/backend/image"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/starcoder/main.go
+++ b/cmd/grpc/starcoder/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transformers "github.com/go-skynet/LocalAI/pkg/grpc/llm/transformers"
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/cmd/grpc/whisper/main.go
+++ b/cmd/grpc/whisper/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	transcribe "github.com/go-skynet/LocalAI/pkg/grpc/transcribe"
+	transcribe "github.com/go-skynet/LocalAI/pkg/backend/transcribe"

 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
 )
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -12,4 +12,5 @@ services:
      - .env
    volumes:
      - ./models:/models:cached
+      - ./images/:/tmp/generated/images/
    command: ["/usr/bin/local-ai" ]
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -16,6 +16,25 @@ else
 	echo "see the documentation at: https://localai.io/basics/build/index.html"
 	echo "Note: See also https://github.com/go-skynet/LocalAI/issues/288"
 	echo "@@@@@"
+	echo "CPU info:"
+	grep -e "model[[:space:]]name" /proc/cpuinfo | head -1
+	grep -e "flags" /proc/cpuinfo | head -1
+	if grep -q -w -e "avx" /proc/cpuinfo ; then  # -w: whole-word match, also when the flag ends the line
+		echo "CPU:    AVX    found OK"
+	else
+		echo "CPU: no AVX    found"
+	fi
+	if grep -q -w -e "avx2" /proc/cpuinfo ; then  # whole word, so "avx" above does not match "avx2"
+		echo "CPU:    AVX2   found OK"
+	else
+		echo "CPU: no AVX2   found"
+	fi
+	if grep -q -e "[[:space:]]avx512" /proc/cpuinfo ; then  # prefix match: any flag starting with avx512
+		echo "CPU:    AVX512 found OK"
+	else
+		echo "CPU: no AVX512 found"
+	fi
+	echo "@@@@@"
 fi

 ./local-ai "$@"
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,7 +1,16 @@
 # Examples

+| [ChatGPT OSS alternative](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui)                                                                                                                | [Image generation](https://localai.io/api-endpoints/index.html#image-generation)                                                                                                              |
+|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
+|  ![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)            | ![b6441997879](https://github.com/go-skynet/LocalAI/assets/2420543/d50af51c-51b7-4f39-b6c2-bf04c403894c)                  |
+
+|                                                                    [Telegram bot](https://github.com/go-skynet/LocalAI/tree/master/examples/telegram-bot)   | [Flowise](https://github.com/go-skynet/LocalAI/tree/master/examples/flowise)                                                                                                                     |
+|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
+| ![Screenshot from 2023-06-09 00-36-26](https://github.com/go-skynet/LocalAI/assets/2420543/e98b4305-fa2d-41cf-9d2f-1bb2d75ca902) | ![Screenshot from 2023-05-30 18-01-03](https://github.com/go-skynet/LocalAI/assets/2420543/02458782-0549-4131-971c-95ee56ec1af8) |
+
 Here is a list of projects that can easily be integrated with the LocalAI backend. 

+
 ### Projects

 ### AutoGPT
@@ -148,6 +157,26 @@ Allows to run any LocalAI-compatible model as a backend on the servers of https:

 [Check it out here](https://runpod.io/gsc?template=uv9mtqnrd0&ref=984wlcra)

+### Continue
+
+_by [@gruberdev](https://github.com/gruberdev)_
+
+<img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">
+
+Demonstrates how to integrate an open-source copilot alternative that enhances code analysis, completion, and improvements. This approach seamlessly integrates with any LocalAI model, offering a more user-friendly experience.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/continue/)
+
+### Streamlit bot
+
+_by [@majoshi1](https://github.com/majoshi1)_
+
+![Screenshot](streamlit-bot/streamlit-bot.png)
+
+A chat bot made using `Streamlit` & LocalAI.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/streamlit-bot/)
+
 ## Want to contribute?

 Create an issue, and put `Example: <description>` in the title! We will post your examples here.
--- a/examples/autoGPT/.env.example
+++ b/examples/autoGPT/.env.example
@@ -1,5 +1,9 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=sk---anystringhere
 OPENAI_API_BASE=http://api:8080/v1
 # Models to preload at start
-# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
-PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
+# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings,
+# see other options in the model gallery at https://github.com/go-skynet/model-gallery
+PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
--- a/examples/autoGPT/README.md
+++ b/examples/autoGPT/README.md
@@ -10,12 +10,16 @@ git clone https://github.com/go-skynet/LocalAI

 cd LocalAI/examples/autoGPT

+cp -rfv .env.example .env
+
+# Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
+vim .env
+
 docker-compose run --rm auto-gpt
 ```

 Note: The example automatically downloads the `gpt4all` model as it is under a permissive license. The GPT4All model does not seem to be enough to run AutoGPT. WizardLM-7b-uncensored seems to perform better (with `f16: true`).

-See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.

 ## Without docker

--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: backend monitor
+  type: http
+  seq: 4
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
+  body: none
+  auth: none
+}
--- a/monitor/backend-shutdown.bru
+++ b/monitor/backend-shutdown.bru
@@ -0,0 +1,21 @@
+meta {
+  name: backend-shutdown
+  type: http
+  seq: 3
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,5 @@
+{
+  "version": "1",
+  "name": "LocalAI Test Requests",
+  "type": "collection"
+}
--- a/Requests/environments/localhost.bru
+++ b/Requests/environments/localhost.bru
@@ -0,0 +1,6 @@
+vars {
+  HOST: localhost
+  PORT: 8080
+  DEFAULT_MODEL: gpt-3.5-turbo
+  PROTOCOL: http://
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: get models list
+  type: http
+  seq: 2
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,24 @@
+meta {
+  name: -completions
+  type: http
+  seq: 4
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "prompt": "function downloadFile(string url, string outputPath) {",
+      "max_tokens": 256,
+      "temperature": 0.5
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,23 @@
+meta {
+  name: -edits
+  type: http
+  seq: 5
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "What day of the wek is it?",
+      "instruction": "Fix the spelling mistakes"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: -embeddings
+  type: http
+  seq: 6
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,24 @@
+meta {
+  name: chat completion -simple- 1 message-
+  type: http
+  seq: 4
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+       "model": "{{DEFAULT_MODEL}}",
+       "messages": [{"role": "user", "content": "How could one use friction to cook an egg?"}],
+       "max_tokens": 256,
+       "temperature": 0.2
+  }
+}
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,29 @@
+meta {
+  name: chat-completions -long-
+  type: http
+  seq: 5
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+       "model": "{{DEFAULT_MODEL}}",
+       "messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
+           {"role": "user", "content": "How could one use electricity to cook an egg?"},
+           {"role": "assistant",
+                  "content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
+              },
+              {"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
+       "max_tokens": 1024,
+       "temperature": 0.5
+  }
+}
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,25 @@
+meta {
+  name: chat-completions -stream-
+  type: http
+  seq: 6
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+       "model": "{{DEFAULT_MODEL}}",
+       "messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
+       "max_tokens": 256,
+       "temperature": 0.9,
+       "stream": true
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: add model gallery
+  type: http
+  seq: 10
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
+      "name": "test"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,21 @@
+meta {
+  name: delete model gallery
+  type: http
+  seq: 11
+}
+
+delete {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "name": "test"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: list MODELS in galleries
+  type: http
+  seq: 7
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: list model GALLERIES
+  type: http
+  seq: 8
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,21 @@
+meta {
+  name: model gallery apply -gist-
+  type: http
+  seq: 12
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: model gallery apply
+  type: http
+  seq: 9
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
+      "name": "codellama7b"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: -tts
+  type: http
+  seq: 2
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
+  }
+}
--- a/examples/chatbot-ui-manual/README.md
+++ b/examples/chatbot-ui-manual/README.md
@@ -24,10 +24,13 @@ docker-compose up -d --pull always
 # docker-compose up -d --build
 ```

+Then browse to `http://localhost:3000` to view the Web UI.
+
 ## Pointing chatbot-ui to a separately managed LocalAI service

-If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
-```
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose.yaml` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+
+```yaml
 version: '3.6'

 services:
@@ -40,9 +43,8 @@ services:
      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
 ```

-Once you've edited the Dockerfile, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+Once you've edited the `docker-compose.yaml`, you can start it with `docker compose up`, then browse to `http://localhost:3000` to view the Web UI.

 ## Accessing chatbot-ui

 Open http://localhost:3000 for the Web UI.
-
--- a/examples/chatbot-ui-manual/models
+++ b/examples/chatbot-ui-manual/models
@@ -0,0 +1 @@
+../models
--- a/examples/chatbot-ui/README.md
+++ b/examples/chatbot-ui/README.md
@@ -20,10 +20,13 @@ docker-compose up --pull always
 # docker-compose up -d --build
 ```

+Then browse to `http://localhost:3000` to view the Web UI.
+
 ## Pointing chatbot-ui to a separately managed LocalAI service

-If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
-```
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose.yaml` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+
+```yaml
 version: '3.6'

 services:
@@ -36,9 +39,8 @@ services:
      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
 ```

-Once you've edited the Dockerfile, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+Once you've edited the `docker-compose.yaml`, you can start it with `docker compose up`, then browse to `http://localhost:3000` to view the Web UI.

 ## Accessing chatbot-ui

 Open http://localhost:3000 for the Web UI.
-
--- a/examples/continue/README.md
+++ b/examples/continue/README.md
@@ -0,0 +1,53 @@
+# Continue
+
+![logo](https://continue.dev/docs/assets/images/continue-cover-logo-aa135cc83fe8a14af480d1633ed74eb5.png)
+
+This document presents an example of integration with [continuedev/continue](https://github.com/continuedev/continue).
+
+![Screenshot](https://continue.dev/docs/assets/images/continue-screenshot-1f36b99467817f755739d7f4c4c08fe3.png)
+
+For a live demonstration, please click on the link below:
+
+- [How it works (Video demonstration)](https://www.youtube.com/watch?v=3Ocrc-WX4iQ)
+
+## Integration Setup Walkthrough
+
+1. [As outlined in `continue`'s documentation](https://continue.dev/docs/getting-started), install the [Visual Studio Code extension from the marketplace](https://marketplace.visualstudio.com/items?itemName=Continue.continue) and open it.
+2. In this example, LocalAI will download the gpt4all model and set it up as "gpt-3.5-turbo". Refer to the `docker-compose.yml` file for details.
+
+    ```bash
+    # Clone LocalAI
+    git clone https://github.com/go-skynet/LocalAI
+
+    cd LocalAI/examples/continue
+
+    # Start with docker-compose
+    docker-compose up --build -d
+    ```
+
+3. Type `/config` within Continue's VSCode extension, or edit the file located at `~/.continue/config.py` on your system with the following configuration:
+
+    ```py
+    from continuedev.src.continuedev.libs.llm.openai import OpenAI
+
+    config = ContinueConfig(
+       ...
+       models=Models(
+            default=OpenAI(
+               api_key="my-api-key",
+               model="gpt-3.5-turbo",
+               api_base="http://localhost:8080",
+            )
+       ),
+    )
+    ```
+
+This setup enables you to make queries directly to your model running in the Docker container. Note that the `api_key` does not need to be a real key; it is included here only as a placeholder.
+
+If editing the configuration seems confusing, you may copy and paste the provided default `config.py` file over the existing one in `~/.continue/config.py` after initializing the extension in the VSCode IDE.
+
+## Additional Resources
+
+- [Official Continue documentation](https://continue.dev/docs/intro)
+- [Documentation page on using self-hosted models](https://continue.dev/docs/customization#self-hosting-an-open-source-model)
+- [Official extension link](https://marketplace.visualstudio.com/items?itemName=Continue.continue)
--- a/examples/continue/config.py
+++ b/examples/continue/config.py
@@ -0,0 +1,148 @@
+"""
+This is the Continue configuration file.
+
+See https://continue.dev/docs/customization to learn more.
+"""
+
+import subprocess
+
+from continuedev.src.continuedev.core.main import Step
+from continuedev.src.continuedev.core.sdk import ContinueSDK
+from continuedev.src.continuedev.core.models import Models
+from continuedev.src.continuedev.core.config import CustomCommand, SlashCommand, ContinueConfig
+from continuedev.src.continuedev.plugins.context_providers.github import GitHubIssuesContextProvider
+from continuedev.src.continuedev.plugins.context_providers.google import GoogleContextProvider
+from continuedev.src.continuedev.plugins.policies.default import DefaultPolicy
+from continuedev.src.continuedev.libs.llm.openai import OpenAI, OpenAIServerInfo
+from continuedev.src.continuedev.libs.llm.ggml import GGML
+
+from continuedev.src.continuedev.plugins.steps.open_config import OpenConfigStep
+from continuedev.src.continuedev.plugins.steps.clear_history import ClearHistoryStep
+from continuedev.src.continuedev.plugins.steps.feedback import FeedbackStep
+from continuedev.src.continuedev.plugins.steps.comment_code import CommentCodeStep
+from continuedev.src.continuedev.plugins.steps.share_session import ShareSessionStep
+from continuedev.src.continuedev.plugins.steps.main import EditHighlightedCodeStep
+from continuedev.src.continuedev.plugins.context_providers.search import SearchContextProvider
+from continuedev.src.continuedev.plugins.context_providers.diff import DiffContextProvider
+from continuedev.src.continuedev.plugins.context_providers.url import URLContextProvider
+
+class CommitMessageStep(Step):
+    """
+    This is a Step, the building block of Continue.
+    It can be used below as a slash command, so that
+    run will be called when you type '/commit'.
+    """
+    async def run(self, sdk: ContinueSDK):
+
+        # Get the root directory of the workspace
+        dir = sdk.ide.workspace_directory
+
+        # Run git diff in that directory
+        diff = subprocess.check_output(
+            ["git", "diff"], cwd=dir).decode("utf-8")
+
+        # Ask the LLM to write a commit message,
+        # and set it as the description of this step
+        self.description = await sdk.models.default.complete(
+            f"{diff}\n\nWrite a short, specific (less than 50 chars) commit message about the above changes:")
+
+
+config = ContinueConfig(
+
+    # If set to False, we will not collect any usage data
+    # See here to learn what anonymous data we collect: https://continue.dev/docs/telemetry
+    allow_anonymous_telemetry=True,
+
+    models = Models(
+        default = OpenAI(
+            api_key = "my-api-key",
+            model = "gpt-3.5-turbo",
+            openai_server_info = OpenAIServerInfo(
+                api_base = "http://localhost:8080",
+                model = "gpt-3.5-turbo"
+            )
+        )
+    ),
+    # Set a system message with information that the LLM should always keep in mind
+    # E.g. "Please give concise answers. Always respond in Spanish."
+    system_message=None,
+
+    # Set temperature to any value between 0 and 1. Higher values will make the LLM
+    # more creative, while lower values will make it more predictable.
+    temperature=0.5,
+
+    # Custom commands let you map a prompt to a shortened slash command
+    # They are like slash commands, but more easily defined - write just a prompt instead of a Step class
+    # Their output will always be in chat form
+    custom_commands=[
+        # CustomCommand(
+        #     name="test",
+        #     description="Write unit tests for the higlighted code",
+        #     prompt="Write a comprehensive set of unit tests for the selected code. It should setup, run tests that check for correctness including important edge cases, and teardown. Ensure that the tests are complete and sophisticated. Give the tests just as chat output, don't edit any file.",
+        # )
+    ],
+
+    # Slash commands let you run a Step from a slash command
+    slash_commands=[
+        # SlashCommand(
+        #     name="commit",
+        #     description="This is an example slash command. Use /config to edit it and create more",
+        #     step=CommitMessageStep,
+        # )
+        SlashCommand(
+            name="edit",
+            description="Edit code in the current file or the highlighted code",
+            step=EditHighlightedCodeStep,
+        ),
+        SlashCommand(
+            name="config",
+            description="Customize Continue - slash commands, LLMs, system message, etc.",
+            step=OpenConfigStep,
+        ),
+        SlashCommand(
+            name="comment",
+            description="Write comments for the current file or highlighted code",
+            step=CommentCodeStep,
+        ),
+        SlashCommand(
+            name="feedback",
+            description="Send feedback to improve Continue",
+            step=FeedbackStep,
+        ),
+        SlashCommand(
+            name="clear",
+            description="Clear step history",
+            step=ClearHistoryStep,
+        ),
+        SlashCommand(
+            name="share",
+            description="Download and share the session transcript",
+            step=ShareSessionStep,
+        )
+    ],
+
+    # Context providers let you quickly select context by typing '@'
+    # Uncomment the following to
+    # - quickly reference GitHub issues
+    # - show Google search results to the LLM
+    context_providers=[
+        # GitHubIssuesContextProvider(
+        #     repo_name="<your github username or organization>/<your repo name>",
+        #     auth_token="<your github auth token>"
+        # ),
+        # GoogleContextProvider(
+        #     serper_api_key="<your serper.dev api key>"
+        # )
+        SearchContextProvider(),
+        DiffContextProvider(),
+        URLContextProvider(
+            preset_urls = [
+                # Add any common urls you reference here so they appear in autocomplete
+            ]
+        )
+    ],
+
+    # Policies hold the main logic that decides which Step to take next
+    # You can use them to design agents, or deeply customize Continue
+    policy=DefaultPolicy()
+)
--- a/examples/continue/docker-compose.yml
+++ b/examples/continue/docker-compose.yml
@@ -0,0 +1,27 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    # On first start LocalAI downloads the models defined in PRELOAD_MODELS,
+    # so you might need to tweak the healthcheck values below according to your
+    # network connection. With a 1m interval and 20 retries, the service gets
+    # roughly 20m to download all the required files.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
+      interval: 1m
+      timeout: 20m
+      retries: 20
+    build:
+      context: ../../
+      dockerfile: Dockerfile
+    ports:
+      # HOST:CONTAINER mapping, quoted so it is always parsed as a string
+      - '8080:8080'
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+      # You can preload different models here as well.
+      # See: https://github.com/go-skynet/model-gallery
+      - 'PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}]'
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai"]
--- a/examples/continue/img/screen.png
+++ b/examples/continue/img/screen.png
--- a/examples/discord-bot/.env.example
+++ b/examples/discord-bot/.env.example
@@ -1,3 +1,6 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=x
 DISCORD_BOT_TOKEN=x
 DISCORD_CLIENT_ID=x
--- a/examples/discord-bot/models
+++ b/examples/discord-bot/models
@@ -1 +1 @@
-../chatbot-ui/models/
+../models
--- a/examples/functions/.env.example
+++ b/examples/functions/.env.example
@@ -1,7 +1,11 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 OPENAI_API_KEY=sk---anystringhere
 OPENAI_API_BASE=http://api:8080/v1
 # Models to preload at start
-# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings
+# Here we configure openllama-7b-open-instruct as gpt-3.5-turbo;
+# see other options in the model gallery at https://github.com/go-skynet/model-gallery
 PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/openllama-7b-open-instruct.yaml", "name": "gpt-3.5-turbo"}]

 ## Change the default number of threads
--- a/examples/functions/README.md
+++ b/examples/functions/README.md
@@ -10,9 +10,12 @@ git clone https://github.com/go-skynet/LocalAI

 cd LocalAI/examples/functions

+cp -rfv .env.example .env
+
+# Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
+vim .env
+
 docker-compose run --rm functions
 ```

 Note: The example automatically downloads the `openllama` model as it is under a permissive license.
-
-See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.
--- a/examples/insomnia/Insomnia_LocalAI.json
+++ b/examples/insomnia/Insomnia_LocalAI.json
--- a/examples/insomnia/README.md
+++ b/examples/insomnia/README.md
@@ -0,0 +1,17 @@
+# Insomnia
+
+Developer Testing Request Collection for [Insomnia](https://insomnia.rest/), an open-source REST client
+
+## Instructions
+
+* Install Insomnia as normal
+* [Import](https://docs.insomnia.rest/insomnia/import-export-data) `Insomnia_LocalAI.json`
+* Press Control + E to open the environment settings and adjust the following parameters:
+
+| **Parameter Name** | **Default Value** | **Description**                          |
+|--------------------|-------------------|------------------------------------------|
+| HOST               | localhost         | LocalAI base URL                         |
+| PORT               | 8080              | LocalAI port                             |
+| DEFAULT_MODEL      | gpt-3.5-turbo     | Name of the model used on most requests. |
+
+**You may want to duplicate localhost into a "Private" environment to avoid saving private settings back to this file.**
--- a/examples/k8sgpt/README.md
+++ b/examples/k8sgpt/README.md
@@ -38,7 +38,7 @@ helm install local-ai go-skynet/local-ai --create-namespace --namespace local-ai
 # Install k8sgpt
 helm repo add k8sgpt https://charts.k8sgpt.ai/
 helm repo update
-helm install release k8sgpt/k8sgpt-operator -n k8sgpt-operator-system --create-namespace
+helm install release k8sgpt/k8sgpt-operator -n k8sgpt-operator-system --create-namespace --version 0.0.17
 ```

 Apply the k8sgpt-operator configuration:
@@ -55,7 +55,6 @@ spec:
  baseUrl: http://local-ai.local-ai.svc.cluster.local:8080/v1
  noCache: false
  model: gpt-3.5-turbo
-  noCache: false
  version: v0.3.0
  enableAI: true
 EOF
@@ -67,4 +66,7 @@ Apply a broken pod:

 ```
 kubectl apply -f broken-pod.yaml
-```
+```
+
+## ArgoCD Deployment Example
+[Deploy k8sgpt + LocalAI with ArgoCD](https://github.com/tyler-harpool/gitops/tree/main/infra/k8gpt)
--- a/examples/k8sgpt/values.yaml
+++ b/examples/k8sgpt/values.yaml
@@ -2,12 +2,13 @@ replicaCount: 1

 deployment:
  # https://quay.io/repository/go-skynet/local-ai?tab=tags
-  image: quay.io/go-skynet/local-ai:latest
+  image: quay.io/go-skynet/local-ai:v1.23.0
  env:
    threads: 4
    debug: "true"
    context_size: 512
-    preload_models: '[{ "url": "github:go-skynet/model-gallery/wizard.yaml", "name": "gpt-3.5-turbo", "overrides": { "parameters": { "model": "WizardLM-7B-uncensored.ggmlv3.q5_1" }},"files": [ { "uri": "https://huggingface.co//WizardLM-7B-uncensored-GGML/resolve/main/WizardLM-7B-uncensored.ggmlv3.q5_1.bin", "sha256": "d92a509d83a8ea5e08ba4c2dbaf08f29015932dc2accd627ce0665ac72c2bb2b", "filename": "WizardLM-7B-uncensored.ggmlv3.q5_1" }]}]'
+    galleries: '[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
+    preload_models: '[{ "id": "huggingface@thebloke__open-llama-13b-open-instruct-ggml__open-llama-13b-open-instruct.ggmlv3.q3_k_m.bin", "name": "gpt-3.5-turbo", "overrides": { "f16": true, "mmap": true }}]'
  modelsPath: "/models"

 resources:
--- a/examples/langchain-chroma/.env.example
+++ b/examples/langchain-chroma/.env.example
@@ -1,3 +1,6 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
 THREADS=4
 CONTEXT_SIZE=512
 MODELS_PATH=/models
--- a/examples/langchain-chroma/models
+++ b/examples/langchain-chroma/models
@@ -0,0 +1 @@
+../models
--- a/examples/langchain-chroma/models/completion.tmpl
+++ b/examples/langchain-chroma/models/completion.tmpl
@@ -1 +0,0 @@
-{{.Input}}
--- a/examples/langchain-chroma/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain-chroma/models/gpt-3.5-turbo.yaml
@@ -1,16 +0,0 @@
-name: gpt-3.5-turbo
-parameters:
-  model: ggml-gpt4all-j
-  top_k: 80
-  temperature: 0.2
-  top_p: 0.7
-context_size: 1024
-stopwords:
- "HUMAN:"
- "GPT:"
-roles:
-  user: " "
-  system: " "
-template:
-  completion: completion
-  chat: gpt4all
--- a/examples/langchain-chroma/models/gpt4all.tmpl
+++ b/examples/langchain-chroma/models/gpt4all.tmpl
@@ -1,4 +0,0 @@
-The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
-### Prompt:
-{{.Input}}
-### Response:
--- a/Show More
+++ b/Show More