Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 03:02:38 -05:00)
Compare commits
320 Commits
@@ -1,4 +1,3 @@
.git
.idea
models
examples/chatbot-ui/models

.env (26 lines changed)
@@ -7,24 +7,40 @@

## Default models context size
# CONTEXT_SIZE=512
#
## Define galleries.
## models will to install will be visible in `/models/available`
# GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]

## CORS settings
# CORS=true
# CORS_ALLOW_ORIGINS=*

## Default path for models
#
MODELS_PATH=/models

## Enable debug mode
# DEBUG=true

## Specify a build type. Available: cublas, openblas.
## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
## clBLAS: This is an open-source implementation of the BLAS library that uses OpenCL, a framework for writing programs that execute across heterogeneous platforms consisting of CPUs, GPUs, and other processors. clBLAS is designed to take advantage of the parallel computing power of GPUs but can also run on any hardware that supports OpenCL. This includes hardware from different vendors like Nvidia, AMD, and Intel.
# BUILD_TYPE=openblas

## Uncomment and set to false to disable rebuilding from source
# REBUILD=false
## Uncomment and set to true to enable rebuilding from source
# REBUILD=true

## Enable image generation with stablediffusion (requires REBUILD=true)
## Enable go tags, available: stablediffusion, tts
## stablediffusion: image generation with stablediffusion
## tts: enables text-to-speech with go-piper
## (requires REBUILD=true)
#
# GO_TAGS=stablediffusion

## Path where to store generated images
# IMAGE_PATH=/tmp

## Specify a default upload limit in MB (whisper)
# UPLOAD_LIMIT
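These options are read from the environment when the container starts. A minimal sketch of how they might be set for a docker compose deployment (values, and the assumption that the compose file reads .env, are illustrative and not part of this diff):

    # .env (illustrative values)
    MODELS_PATH=/models
    # DEBUG=true
    # BUILD_TYPE=openblas
    # GO_TAGS=stablediffusion tts
    GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]

    # assuming a compose file that reads .env, start the API
    docker compose up -d --pull always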
.gitattributes (vendored, new file, 1 line)
@@ -0,0 +1 @@
*.sh text eol=lf

.github/FUNDING.yml (vendored, new file, 5 lines)
@@ -0,0 +1,5 @@
# These are supported funding model platforms

github: [mudler]
custom:
- https://www.buymeacoffee.com/mudler

.github/workflows/bump_deps.yaml (vendored, 9 lines changed)
@@ -30,6 +30,15 @@ jobs:
- repository: "nomic-ai/gpt4all"
  variable: "GPT4ALL_VERSION"
  branch: "main"
- repository: "mudler/go-ggllm.cpp"
  variable: "GOGGLLM_VERSION"
  branch: "master"
- repository: "mudler/go-stable-diffusion"
  variable: "STABLEDIFFUSION_VERSION"
  branch: "master"
- repository: "mudler/go-piper"
  variable: "PIPER_VERSION"
  branch: "master"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

.github/workflows/image.yml (vendored, 32 lines changed)
@@ -59,6 +59,38 @@ jobs:

runs-on: ubuntu-latest
steps:
- name: Release space from worker
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
df -h
- name: Checkout
uses: actions/checkout@v3

.github/workflows/release.yaml (vendored, 11 lines changed)
@@ -22,6 +22,9 @@ jobs:
uses: actions/checkout@v3
with:
submodules: true
- uses: actions/setup-go@v4
with:
go-version: '>=1.21.0'
- name: Dependencies
run: |
sudo apt-get update
@@ -60,11 +63,9 @@ jobs:
uses: actions/checkout@v3
with:
submodules: true

- name: Dependencies
run: |
brew update
brew install sdl2 ffmpeg
- uses: actions/setup-go@v4
with:
go-version: '>=1.21.0'
- name: Build
id: build
env:

.github/workflows/test.yml (vendored, 51 lines changed)
@@ -16,33 +16,66 @@ concurrency:
jobs:
ubuntu-latest:
runs-on: ubuntu-latest

strategy:
matrix:
go-version: ['1.21.x']
steps:
- name: Clone
uses: actions/checkout@v3
with:
submodules: true
- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go-version }}
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg

sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo pip install -r extra/requirements.txt

sudo mkdir /build && sudo chmod -R 777 /build && cd /build && \
curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v1.11.0.tar.gz" | \
tar -xzvf - && \
mkdir -p "spdlog-1.11.0/build" && \
cd "spdlog-1.11.0/build" && \
cmake .. && \
make -j8 && \
sudo cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
cd /build && \
mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v1.0.0/libpiper_phonemize-amd64.tar.gz" | \
tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
- name: Test
run: |
make test
ESPEAK_DATA="/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data" GO_TAGS="tts stablediffusion" make test

macOS-latest:
runs-on: macOS-latest

strategy:
matrix:
go-version: ['1.21.x']
steps:
- name: Clone
uses: actions/checkout@v3
with:
submodules: true

- name: Dependencies
run: |
brew update
brew install sdl2 ffmpeg
- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go-version }}
# You can test your matrix by printing the current Go version
- name: Display Go version
run: go version
- name: Test
run: |
make test
CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test

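The Linux job above runs the suite twice, once plain and once with the optional backends compiled in; a contributor mirroring that locally (assuming the OpenCV and piper-phonemize dependencies installed in the earlier steps are present) would run roughly:

    make test
    GO_TAGS="tts stablediffusion" make test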
.gitignore (vendored, 19 lines changed)
@@ -1,12 +1,21 @@
# go-llama build artifacts
go-llama
gpt4all
go-llama-stable
/gpt4all
go-stable-diffusion
go-piper
/go-bert
go-ggllm
/piper
__pycache__/
*.a
get-sources

go-ggml-transformers
go-gpt2
go-rwkv
whisper.cpp
bloomz
/bloomz
go-bert

# LocalAI build binary
@@ -14,6 +23,8 @@ LocalAI
local-ai
# prevent above rules from omitting the helm chart
!charts/*
# prevent above rules from omitting the api/localai folder
!api/localai

# Ignore models
models/*
@@ -27,4 +38,6 @@ release/
.idea

# Generated during build
backend-assets/
prepare
/ggml-metal.metal

Dockerfile (135 lines changed)
@@ -1,24 +1,26 @@
ARG GO_VERSION=1.20
ARG GO_VERSION=1.21-bullseye

FROM golang:$GO_VERSION as builder
FROM golang:$GO_VERSION as requirements

ARG BUILD_TYPE=
ARG GO_TAGS=stablediffusion
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=11
ARG CUDA_MINOR_VERSION=7
ARG SPDLOG_VERSION="1.11.0"
ARG PIPER_PHONEMIZE_VERSION='1.0.0'
ARG TARGETARCH
ARG TARGETVARIANT

ENV BUILD_TYPE=${BUILD_TYPE}
ENV GO_TAGS=${GO_TAGS}
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
ENV NVIDIA_VISIBLE_DEVICES=all
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
ENV REBUILD=true

WORKDIR /build
ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py"
ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
ARG GO_TAGS="stablediffusion tts"

RUN apt-get update && \
apt-get install -y ca-certificates cmake curl
apt-get install -y ca-certificates cmake curl patch pip

# Use the variables in subsequent instructions
RUN echo "Target Architecture: $TARGETARCH"
RUN echo "Target Variant: $TARGETVARIANT"

# CuBLAS requirements
RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
@@ -28,10 +30,25 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
dpkg -i cuda-keyring_1.0-1_all.deb && \
rm -f cuda-keyring_1.0-1_all.deb && \
apt-get update && \
apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
; fi
ENV PATH /usr/local/cuda/bin:${PATH}

# Extras requirements
COPY extra/requirements.txt /build/extra/requirements.txt
ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install --upgrade pip
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
RUN if [ "${TARGETARCH}" = "amd64" ]; then \
pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
fi
RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
pip install torch && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
fi
RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt

WORKDIR /build

# OpenBLAS requirements
RUN apt-get install -y libopenblas-dev
@@ -39,60 +56,84 @@ RUN apt-get install -y libopenblas-dev
RUN apt-get install -y libopencv-dev && \
ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

COPY . .
RUN make build

FROM golang:$GO_VERSION
# piper requirements
# Use pre-compiled Piper phonemization library (includes onnxruntime)
#RUN if echo "${GO_TAGS}" | grep -q "tts"; then \
RUN test -n "$TARGETARCH" \
|| (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')

ARG BUILD_TYPE=
ARG GO_TAGS=stablediffusion
ARG CUDA_MAJOR_VERSION=11
ARG CUDA_MINOR_VERSION=7
ARG FFMPEG=
RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz" | \
tar -xzvf - && \
mkdir -p "spdlog-${SPDLOG_VERSION}/build" && \
cd "spdlog-${SPDLOG_VERSION}/build" && \
cmake .. && \
make -j8 && \
cmake --install . --prefix /usr && mkdir -p "lib/Linux-$(uname -m)" && \
cd /build && \
mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH:-$(go env GOARCH)}${TARGETVARIANT}.tar.gz" | \
tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
# \
# ; fi

###################################
###################################

FROM requirements as builder

ARG GO_TAGS="stablediffusion tts"

ENV BUILD_TYPE=${BUILD_TYPE}
ENV GO_TAGS=${GO_TAGS}
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
ENV NVIDIA_VISIBLE_DEVICES=all
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

ENV REBUILD=true

WORKDIR /build

RUN apt-get update && \
apt-get install -y ca-certificates cmake curl
COPY Makefile .
RUN make get-sources
COPY go.mod .
RUN make prepare
COPY . .
COPY .git .

# CuBLAS requirements
RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
apt-get install -y software-properties-common && \
apt-add-repository contrib && \
curl -O https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
rm -f cuda-keyring_1.0-1_all.deb && \
apt-get update && \
apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
; fi
RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build

###################################
###################################

FROM requirements

ARG FFMPEG
ARG BUILD_TYPE
ARG TARGETARCH

ENV BUILD_TYPE=${BUILD_TYPE}
ENV REBUILD=false
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

# Add FFmpeg
RUN if [ "${FFMPEG}" = "true" ]; then \
apt-get install -y ffmpeg \
; fi

ENV PATH /usr/local/cuda/bin:${PATH}

# OpenBLAS requirements
RUN apt-get install -y libopenblas-dev

# Stable Diffusion requirements
RUN apt-get install -y libopencv-dev && \
ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
WORKDIR /build

# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
# so when `entrypoint.sh` runs `make build` again (which it does by default), the build would fail
# see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
# https://github.com/go-skynet/LocalAI/pull/434
COPY . .
RUN make prepare-sources
COPY --from=builder /build/local-ai ./

# To resolve exllama import error
RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
fi
# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1

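To make the build arguments above concrete, an image build might be driven roughly like this (the image tags and the exact argument combinations are illustrative; the project's CI sets them in the workflow files, not here):

    # CPU image with the stablediffusion and tts backends compiled in (illustrative)
    docker buildx build --build-arg GO_TAGS="stablediffusion tts" -t localai:dev .

    # CUDA-enabled variant with FFmpeg included (illustrative)
    docker buildx build --build-arg BUILD_TYPE=cublas \
      --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 \
      --build-arg FFMPEG=true -t localai:dev-cublas .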
Makefile (328 lines changed)
@@ -3,23 +3,54 @@ GOTEST=$(GOCMD) test
GOVET=$(GOCMD) vet
BINARY_NAME=local-ai

GOLLAMA_VERSION?=37ef81d01ae0848575e416e48b41d112ef0d520e
GPT4ALL_REPO?=https://github.com/go-skynet/gpt4all
GPT4ALL_VERSION?=f7498c9
GOGGMLTRANSFORMERS_VERSION?=bd765bb6f3b38a63f915f3725e488aad492eedd4
# llama.cpp versions
GOLLAMA_VERSION?=bf63302a2be787674e6ca4227a8aaeb95a8eb6b1

GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8

# go-ggml-transformers version
GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a

# go-rwkv version
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=1e18b2490e7e32f6b00e16f6a9ec0dd3a3d09266
WHISPER_CPP_VERSION?=57543c169e27312e7546d07ed0d8c6eb806ebc36
BERT_VERSION?=0548994371f7081e45fcf8d472f3941a12f179aa
RWKV_VERSION?=c898cd0f62df8f2a7830e53d1d513bef4f6f792b

# whisper.cpp version
WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273

# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d

# go-piper version
PIPER_VERSION?=56b8a81b4760a6fbee1a82e62f007ae7e8f010a7

# go-bloomz version
BLOOMZ_VERSION?=1834e77b83faafe912ad4092ccf7f77937349e2f

# stablediffusion version
STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632

# Go-ggllm
GOGGLLM_VERSION?=862477d16eefb0805261c19c9b0d053e3b2b684b

export BUILD_TYPE?=
CGO_LDFLAGS?=
CUDA_LIBPATH?=/usr/local/cuda/lib64/
STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
GO_TAGS?=
BUILD_ID?=git
LD_FLAGS=?=

VERSION?=$(shell git describe --always --tags || echo "dev" )
# go tool nm ./local-ai | grep Commit
LD_FLAGS?=
override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION)"
override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"

OPTIONAL_TARGETS?=
ESPEAK_DATA?=

OS := $(shell uname -s)
ARCH := $(shell uname -m)
@@ -29,8 +60,14 @@ WHITE := $(shell tput -Txterm setaf 7)
CYAN := $(shell tput -Txterm setaf 6)
RESET := $(shell tput -Txterm sgr0)

C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-stable-diffusion/:$(shell pwd)/gpt4all/gpt4all-bindings/golang/:$(shell pwd)/go-ggml-transformers:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp:$(shell pwd)/go-bert:$(shell pwd)/bloomz
LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-stable-diffusion/:$(shell pwd)/gpt4all/gpt4all-bindings/golang/:$(shell pwd)/go-ggml-transformers:$(shell pwd)/go-rwkv:$(shell pwd)/whisper.cpp:$(shell pwd)/go-bert:$(shell pwd)/bloomz
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif

# workaround for rwkv.cpp
ifeq ($(UNAME_S),Darwin)
CGO_LDFLAGS += -lcblas -framework Accelerate
endif

ifeq ($(BUILD_TYPE),openblas)
CGO_LDFLAGS+=-lopenblas
@@ -55,10 +92,19 @@ ifeq ($(STATIC),true)
LD_FLAGS=-linkmode external -extldflags -static
endif

ifeq ($(GO_TAGS),stablediffusion)
OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
# OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
endif

ifeq ($(findstring tts,$(GO_TAGS)),tts)
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
OPTIONAL_GRPC+=backend-assets/grpc/piper
endif

GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)

.PHONY: all test build vendor

all: help
@@ -67,28 +113,24 @@ all: help
gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
@find ./gpt4all -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gpt4all_/g' {} +
@find ./gpt4all -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gpt4all_/g' {} +
@find ./gpt4all -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gpt4all_/g' {} +
@find ./gpt4all -type f -name "*.c" -exec sed -i'' -e 's/llama_/llama_gpt4all_/g' {} +
@find ./gpt4all -type f -name "*.cpp" -exec sed -i'' -e 's/llama_/llama_gpt4all_/g' {} +
@find ./gpt4all -type f -name "*.h" -exec sed -i'' -e 's/llama_/llama_gpt4all_/g' {} +
@find ./gpt4all/gpt4all-backend -type f -name "llama_util.h" -execdir mv {} "llama_gpt4all_util.h" \;
@find ./gpt4all -type f -name "*.cmake" -exec sed -i'' -e 's/llama_util/llama_gpt4all_util/g' {} +
@find ./gpt4all -type f -name "*.txt" -exec sed -i'' -e 's/llama_util/llama_gpt4all_util/g' {} +
@find ./gpt4all/gpt4all-bindings/golang -type f -name "*.cpp" -exec sed -i'' -e 's/load_model/load_gpt4all_model/g' {} +
@find ./gpt4all/gpt4all-bindings/golang -type f -name "*.go" -exec sed -i'' -e 's/load_model/load_gpt4all_model/g' {} +
@find ./gpt4all/gpt4all-bindings/golang -type f -name "*.h" -exec sed -i'' -e 's/load_model/load_gpt4all_model/g' {} +

## go-ggllm
go-ggllm:
git clone --recurse-submodules https://github.com/mudler/go-ggllm.cpp go-ggllm
cd go-ggllm && git checkout -b build $(GOGGLLM_VERSION) && git submodule update --init --recursive --depth 1

go-ggllm/libggllm.a: go-ggllm
$(MAKE) -C go-ggllm BUILD_TYPE=$(BUILD_TYPE) libggllm.a

## go-piper
go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
cd go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1

## BERT embeddings
go-bert:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp go-bert
cd go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
@find ./go-bert -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_bert_/g' {} +
@find ./go-bert -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_bert_/g' {} +
@find ./go-bert -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_bert_/g' {} +

## stable diffusion
go-stable-diffusion:
@@ -102,9 +144,6 @@ go-stable-diffusion/libstablediffusion.a:
go-rwkv:
git clone --recurse-submodules $(RWKV_REPO) go-rwkv
cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
@find ./go-rwkv -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_rwkv_/g' {} +
@find ./go-rwkv -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_rwkv_/g' {} +
@find ./go-rwkv -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_rwkv_/g' {} +

go-rwkv/librwkv.a: go-rwkv
cd go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
@@ -112,13 +151,7 @@ go-rwkv/librwkv.a: go-rwkv
## bloomz
bloomz:
git clone --recurse-submodules https://github.com/go-skynet/bloomz.cpp bloomz
@find ./bloomz -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_bloomz_/g' {} +
@find ./bloomz -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_bloomz_/g' {} +
@find ./bloomz -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_bloomz_/g' {} +
@find ./bloomz -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gpt_bloomz_/g' {} +
@find ./bloomz -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gpt_bloomz_/g' {} +
@find ./bloomz -type f -name "*.cpp" -exec sed -i'' -e 's/void replace/void json_bloomz_replace/g' {} +
@find ./bloomz -type f -name "*.cpp" -exec sed -i'' -e 's/::replace/::json_bloomz_replace/g' {} +
cd bloomz && git checkout -b build $(BLOOMZ_VERSION) && git submodule update --init --recursive --depth 1

bloomz/libbloomz.a: bloomz
cd bloomz && make libbloomz.a
@@ -132,34 +165,29 @@ backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true

backend-assets/espeak-ng-data:
mkdir -p backend-assets/espeak-ng-data
ifdef ESPEAK_DATA
@cp -rf $(ESPEAK_DATA)/. backend-assets/espeak-ng-data
else
@echo "ESPEAK_DATA not set, skipping tts. Note that this will break the tts functionality."
@touch backend-assets/espeak-ng-data/keep
endif

gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a

## CEREBRAS GPT
go-ggml-transformers:
git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp go-ggml-transformers
cd go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
@find ./go-ggml-transformers -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-ggml-transformers -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-ggml-transformers -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-ggml-transformers -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_print_usage/gpt2_print_usage/g' {} +
@find ./go-ggml-transformers -type f -name "*.h" -exec sed -i'' -e 's/gpt_print_usage/gpt2_print_usage/g' {} +
@find ./go-ggml-transformers -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_params_parse/gpt2_params_parse/g' {} +
@find ./go-ggml-transformers -type f -name "*.h" -exec sed -i'' -e 's/gpt_params_parse/gpt2_params_parse/g' {} +
@find ./go-ggml-transformers -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_random_prompt/gpt2_random_prompt/g' {} +
@find ./go-ggml-transformers -type f -name "*.h" -exec sed -i'' -e 's/gpt_random_prompt/gpt2_random_prompt/g' {} +
@find ./go-ggml-transformers -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gpt2_/g' {} +

go-ggml-transformers/libtransformers.a: go-ggml-transformers
$(MAKE) -C go-ggml-transformers libtransformers.a
$(MAKE) -C go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a

whisper.cpp:
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
@find ./whisper.cpp -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_whisper_/g' {} +
@find ./whisper.cpp -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_whisper_/g' {} +
@find ./whisper.cpp -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_whisper_/g' {} +

whisper.cpp/libwhisper.a: whisper.cpp
cd whisper.cpp && make libwhisper.a
@@ -168,11 +196,23 @@ go-llama:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

go-llama/libbinding.a: go-llama
go-llama-stable:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

go-llama/libbinding.a: go-llama
$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

go-llama-stable/libbinding.a: go-llama-stable
$(MAKE) -C go-llama-stable BUILD_TYPE=$(BUILD_TYPE) libbinding.a

go-piper/libpiper_binding.a:
$(MAKE) -C go-piper libpiper_binding.a example/main

get-sources: go-llama go-llama-stable go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
touch $@

replace:
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
@@ -180,13 +220,17 @@ replace:
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
$(GOCMD) mod edit -replace github.com/go-skynet/bloomz.cpp=$(shell pwd)/bloomz
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-ggllm.cpp=$(shell pwd)/go-ggllm

prepare-sources: go-llama go-ggml-transformers gpt4all go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion replace
prepare-sources: get-sources replace
$(GOCMD) mod download

## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C go-llama clean
$(MAKE) -C go-llama-stable clean
$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C go-ggml-transformers clean
$(MAKE) -C go-rwkv clean
@@ -194,13 +238,19 @@ rebuild: ## Rebuilds the project
$(MAKE) -C go-stable-diffusion clean
$(MAKE) -C go-bert clean
$(MAKE) -C bloomz clean
$(MAKE) -C go-piper clean
$(MAKE) -C go-ggllm clean
$(MAKE) build

prepare: prepare-sources backend-assets/gpt4all $(OPTIONAL_TARGETS) go-llama/libbinding.a go-bert/libgobert.a go-ggml-transformers/libtransformers.a go-rwkv/librwkv.a whisper.cpp/libwhisper.a bloomz/libbloomz.a ## Prepares for building
prepare: prepare-sources $(OPTIONAL_TARGETS)
touch $@

clean: ## Remove build related file
rm -fr ./go-llama
rm -rf ./gpt4all
$(GOCMD) clean -cache
rm -f prepare
rm -rf ./go-llama
rm -rf ./gpt4all
rm -rf ./go-llama-stable
rm -rf ./go-gpt2
rm -rf ./go-stable-diffusion
rm -rf ./go-ggml-transformers
@@ -209,27 +259,28 @@ clean: ## Remove build related file
rm -rf ./go-bert
rm -rf ./bloomz
rm -rf ./whisper.cpp
rm -rf ./go-piper
rm -rf ./go-ggllm
rm -rf $(BINARY_NAME)
rm -rf release/

## Build:

build: prepare ## Build the project
build: grpcs prepare ## Build the project
$(info ${GREEN}I local-ai build info:${RESET})
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})

CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

dist: build
mkdir -p release
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)

generic-build: ## Build the project using generic
BUILD_TYPE="generic" $(MAKE) build

## Run
run: prepare ## run local-ai
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) run ./
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./

test-models/testmodel:
mkdir test-models
@@ -242,11 +293,45 @@ test-models/testmodel:
wget https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
cp tests/models_fixtures/* test-models

test: prepare test-models/testmodel
cp -r backend-assets api
prepare-test: grpcs
cp -rf backend-assets api
cp tests/models_fixtures/* test-models
C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all" --flake-attempts 5 -v -r ./api ./pkg
C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg

test: prepare test-models/testmodel grpcs
@echo 'Running tests'
export GO_TAGS="tts stablediffusion"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
$(MAKE) test-gpt4all
$(MAKE) test-llama
$(MAKE) test-llama-gguf
$(MAKE) test-tts
$(MAKE) test-stablediffusion

test-gpt4all: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg

test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg

test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg

test-tts: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg

test-stablediffusion: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r ./api ./pkg

test-container:
docker build --target requirements -t local-ai-test-container .
docker run -ti --rm --entrypoint /bin/bash -ti -v $(abspath ./):/build local-ai-test-container

## Help:
help: ## Show this help.
@@ -259,3 +344,104 @@ help: ## Show this help.
if (/^[a-zA-Z_-]+:.*?##.*$$/) {printf "    ${YELLOW}%-20s${GREEN}%s${RESET}\n", $$1, $$2} \
else if (/^## .*$$/) {printf "  ${CYAN}%s${RESET}\n", substr($$1,4)} \
}' $(MAKEFILE_LIST)

protogen: protogen-go protogen-python

protogen-go:
protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative \
pkg/grpc/proto/backend.proto

protogen-python:
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/autogptq/ --grpc_python_out=extra/grpc/autogptq/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto

## GRPC

backend-assets/grpc:
mkdir -p backend-assets/grpc

backend-assets/grpc/falcon: backend-assets/grpc go-ggllm/libggllm.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggllm LIBRARY_PATH=$(shell pwd)/go-ggllm \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon ./cmd/grpc/falcon/

backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
# TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal)
cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
endif

backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/

backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/

backend-assets/grpc/dolly: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./cmd/grpc/dolly/

backend-assets/grpc/gpt2: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./cmd/grpc/gpt2/

backend-assets/grpc/gptj: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./cmd/grpc/gptj/

backend-assets/grpc/gptneox: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./cmd/grpc/gptneox/

backend-assets/grpc/mpt: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./cmd/grpc/mpt/

backend-assets/grpc/replit: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./cmd/grpc/replit/

backend-assets/grpc/falcon-ggml: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./cmd/grpc/falcon-ggml/

backend-assets/grpc/starcoder: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./cmd/grpc/starcoder/

backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/

backend-assets/grpc/bloomz: backend-assets/grpc bloomz/libbloomz.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/bloomz LIBRARY_PATH=$(shell pwd)/bloomz \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bloomz ./cmd/grpc/bloomz/

backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/

backend-assets/grpc/langchain-huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/

backend-assets/grpc/stablediffusion: backend-assets/grpc go-stable-diffusion/libstablediffusion.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/

backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/

backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/

grpcs: prepare $(GRPC_BACKENDS)

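Pulling the targets above together, a from-source build with the optional backends, or a focused test run, would look roughly like this (a sketch; it assumes the native dependencies required by the tagged backends are already installed):

    # fetch backend sources and build local-ai plus the per-backend gRPC servers
    make BUILD_TYPE=openblas GO_TAGS="stablediffusion tts" build

    # run one of the focused suites defined above
    make test-llama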
README.md (272 lines changed)
@@ -1,204 +1,143 @@
<h1 align="center">
<br>
<img height="300" src="https://user-images.githubusercontent.com/2420543/233147843-88697415-6dbf-4368-a862-ab217f9f7342.jpeg"> <br>
<img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
LocalAI
<br>
</h1>

[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml) [](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)
<p align="center">
<a href="https://github.com/go-skynet/LocalAI/fork" target="blank">
<img src="https://img.shields.io/github/forks/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI forks"/>
</a>
<a href="https://github.com/go-skynet/LocalAI/stargazers" target="blank">
<img src="https://img.shields.io/github/stars/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI stars"/>
</a>
<a href="https://github.com/go-skynet/LocalAI/pulls" target="blank">
<img src="https://img.shields.io/github/issues-pr/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI pull-requests"/>
</a>
<a href='https://github.com/go-skynet/LocalAI/releases'>
<img src='https://img.shields.io/github/release/go-skynet/LocalAI?&label=Latest&style=for-the-badge'>
</a>
</p>

[](https://discord.gg/uJAeKSAGDy)
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/)

**LocalAI** is a drop-in replacement REST API that’s compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.

For a list of the supported model families, please see [the model compatibility table](https://localai.io/model-compatibility/index.html#model-compatibility-table).
[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)

**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.

<p align="center"><b>Follow LocalAI </b></p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://twitter.com/LocalAI_API" target="blank">
|
||||
<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
|
||||
</a>
|
||||
<a href="https://discord.gg/uJAeKSAGDy" target="blank">
|
||||
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
|
||||
</a>
|
||||
|
||||
<p align="center"><b>Connect with the Creator </b></p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://twitter.com/mudler_it" target="blank">
|
||||
<img src="https://img.shields.io/twitter/follow/mudler_it?label=Follow: mudler_it&style=social" alt="Follow mudler_it"/>
|
||||
</a>
|
||||
<a href='https://github.com/mudler'>
|
||||
<img alt="Follow on Github" src="https://img.shields.io/badge/Follow-mudler-black?logo=github&link=https%3A%2F%2Fgithub.com%2Fmudler">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center"><b>Share LocalAI Repository</b></p>
|
||||
|
||||
<p align="center">
|
||||
|
||||
<a href="https://twitter.com/intent/tweet?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI&hashtags=LocalAI,AI" target="blank">
|
||||
<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a>
|
||||
<a href="https://t.me/share/url?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Telegram&logo=Telegram&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Telegram"/></a>
|
||||
<a href="https://api.whatsapp.com/send?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%20https://github.com/go-skynet/LocalAI"><img src="https://img.shields.io/twitter/url?label=whatsapp&logo=whatsapp&style=social&url=https://github.com/go-skynet/LocalAI" /></a> <a href="https://www.reddit.com/submit?url=https://github.com/go-skynet/LocalAI&title=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.
|
||||
" target="blank">
|
||||
<img src="https://img.shields.io/twitter/url?label=Reddit&logo=Reddit&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Reddit"/>
|
||||
</a> <a href="mailto:?subject=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%3A%0Ahttps://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Gmail&logo=Gmail&style=social&url=https://github.com/go-skynet/LocalAI"/></a> <a href="https://www.buymeacoffee.com/mudler" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="23" width="100" style="border-radius:1px"></a>
|
||||
|
||||
</p>
|
||||
|
||||
<hr>
|
In a nutshell:

- Local, OpenAI drop-in alternative REST API. You own your data.
- NO GPU required. NO Internet access is required either
  - Optional: GPU acceleration is available for `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
- Supports multiple models: text generation with GPTs, audio transcription, image generation with stable diffusion (experimental)
- 🏃 Once loaded the first time, it keeps models loaded in memory for faster inference
- ⚡ Doesn't shell out, but uses C++ bindings for faster inference and better performance.

LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making AI accessible to anyone. Any contribution, feedback and PR is welcome!

| [ChatGPT OSS alternative](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) | [Image generation](https://localai.io/api-endpoints/index.html#image-generation) |
|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
|  |  |

Note that this started just as a [fun weekend project](https://localai.io/#backstory) to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
## 🔥🔥 [Hot topics / Roadmap](https://localai.io/#-hot-topics--roadmap)

## 🚀 [Features](https://localai.io/features/)

- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕 (see the sketch below)
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
- 🖼️ [Download Models directly from Huggingface](https://localai.io/models/)
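As a rough illustration of the OpenAI functions feature, a request in the OpenAI function-calling format can be sent to the chat completions endpoint. This is a sketch only: the model name and function schema below are placeholders, not part of the project defaults.

```bash
# Sketch of an OpenAI-style function-calling request; model name and schema are illustrative
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "openllama_3b",
  "messages": [{"role": "user", "content": "What is the weather like in San Francisco?"}],
  "functions": [{
    "name": "get_current_weather",
    "description": "Get the current weather",
    "parameters": {
      "type": "object",
      "properties": {
        "location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}
      },
      "required": ["location"]
    }
  }]
}'
```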
See the [Getting started](https://localai.io/basics/getting_started/index.html) and [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) sections to learn how to use LocalAI. For a list of curated models check out the [model gallery](https://localai.io/models/).
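The model gallery can also be driven through the API: the sketch below applies a model from a configured gallery and then polls the returned job until it finishes. The gallery and model names are illustrative only.

```bash
# Apply a model from a configured gallery (gallery and model names are illustrative)
curl http://localhost:8080/models/apply -H "Content-Type: application/json" \
  -d '{"id": "model-gallery@bert-embeddings"}'
# => returns a JSON object containing a "uuid" for the background job

# Poll the job status with the uuid from the previous response
curl http://localhost:8080/models/jobs/<uuid-from-previous-response>
```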
## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

- [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
- [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
- [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
- [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)

## News

- 29-05-2023: LocalAI now has a website, [https://localai.io](https://localai.io)! Check the news in the [dedicated section](https://localai.io/basics/news/index.html)!

For the latest news, also follow on Twitter [@LocalAI_API](https://twitter.com/LocalAI_API) and [@mudler_it](https://twitter.com/mudler_it)

## Contribute and help

To help the project you can:

- Upvote the [Reddit post](https://www.reddit.com/r/selfhosted/comments/12w4p2f/localai_openai_compatible_api_to_run_llm_models/) about LocalAI.
- [Hacker news post](https://news.ycombinator.com/item?id=35726934) - help us out by voting if you like this project.
- If you have technological skills and want to contribute to development, have a look at the open issues. If you are new, you can have a look at the [good-first-issue](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and [help-wanted](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels.
- If you don't have technological skills, you can still help by improving the documentation, adding examples, or sharing your user stories with our community; any help and contribution is welcome!

### 🔗 Resources

- [How to build locally](https://localai.io/basics/build/index.html)
- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
- [Projects integrating LocalAI](https://localai.io/integrations/)

## ❤️ Sponsors

> Do you find LocalAI useful?

Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.

A huge thank you to our generous sponsors who support this project:

|  |
|:-----------------------------------------------:|
| [Spectro Cloud](https://www.spectrocloud.com/) |
| Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lambdalabs! |

## 💻 Usage

Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation. Below you will find generic, quick instructions to get ready and use LocalAI.

The easiest way to run LocalAI is by using `docker-compose` (to build locally, see [building LocalAI](https://localai.io/basics/build/index.html)):
```bash
git clone https://github.com/go-skynet/LocalAI

cd LocalAI

# (optional) Checkout a specific LocalAI tag
# git checkout -b build <TAG>

# copy your models to models/
cp your-model.bin models/

# (optional) Edit the .env file to set things like context size and threads
# vim .env

# start with docker-compose
docker-compose up -d --pull always
# or you can build the images with:
# docker-compose up -d --build

# Now API is accessible at localhost:8080
curl http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}

curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
     "model": "your-model.bin",
     "prompt": "A long time ago in a galaxy far, far away",
     "temperature": 0.7
   }'
```

### 💡 Example: Use GPT4ALL-J model

See also the [documentation](https://localai.io/basics/getting_started/#example-use-gpt4all-j-model-with-docker-compose) for this example.

<details>
```bash
# Clone LocalAI
git clone https://github.com/go-skynet/LocalAI

cd LocalAI

# (optional) Checkout a specific LocalAI tag
# git checkout -b build <TAG>

# Download gpt4all-j to models/
wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j

# Use a template from the examples
cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/

# (optional) Edit the .env file to set things like context size and threads
# vim .env

# start with docker-compose
docker-compose up -d --pull always
# or you can build the images with:
# docker-compose up -d --build

# Now API is accessible at localhost:8080
curl http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"ggml-gpt4all-j","object":"model"}]}

curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
     "model": "ggml-gpt4all-j",
     "messages": [{"role": "user", "content": "How are you?"}],
     "temperature": 0.9
   }'

# {"model":"ggml-gpt4all-j","choices":[{"message":{"role":"assistant","content":"I'm doing well, thanks. How about you?"}}]}
```
</details>

### Build locally

<details>

In order to build the `LocalAI` container image locally you can use `docker`:

```bash
# build the image
docker build -t localai .
docker run localai
```

Or you can build the binary with `make`:

```bash
make build
```

</details>

See the [build section](https://localai.io/basics/build/index.html) in our documentation for detailed instructions.
### Run LocalAI in Kubernetes

LocalAI can be installed inside Kubernetes with helm. See [installation instructions](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes).
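A minimal sketch of what a helm-based install can look like; the chart repository URL, release name, and namespace below are assumptions, so refer to the installation instructions linked above for the authoritative steps.

```bash
# Hypothetical chart location and values -- check the linked installation instructions
helm repo add go-skynet https://go-skynet.github.io/helm-charts/
helm repo update
helm install local-ai go-skynet/local-ai --namespace local-ai --create-namespace
```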

## Supported API endpoints

See the [list of the supported API endpoints](https://localai.io/api-endpoints/index.html) and how to configure image generation and audio transcription.
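For instance, audio transcription follows the OpenAI transcription request shape; a rough sketch (the model and file names are placeholders):

```bash
# Sketch of an audio transcription request; model and file names are placeholders
curl http://localhost:8080/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@/path/to/recording.wav" \
  -F model="whisper-1"
```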

## Frequently asked questions

See [the FAQ](https://localai.io/faq/index.html) section for a list of common questions.

## Projects already using LocalAI to run local models

Feel free to open up a PR to get your project listed!

- [Kairos](https://github.com/kairos-io/kairos)
- [k8sgpt](https://github.com/k8sgpt-ai/k8sgpt#running-local-models)
- [Spark](https://github.com/cedriking/spark)
- [autogpt4all](https://github.com/aorumbayev/autogpt4all)
- [Mods](https://github.com/charmbracelet/mods)
- [Flowise](https://github.com/FlowiseAI/Flowise)
## Short-term roadmap

- [x] Mimic OpenAI API (https://github.com/go-skynet/LocalAI/issues/10)
- [ ] Binary releases (https://github.com/go-skynet/LocalAI/issues/6)
- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351) and [gpt4all](https://github.com/go-skynet/LocalAI/issues/85)
- [x] Multi-model support
- [x] Have a webUI!
- [x] Allow configuration of defaults for models.
- [x] Support for embeddings
- [x] Support for audio transcription with https://github.com/ggerganov/whisper.cpp
- [ ] GPU/CUDA support ( https://github.com/go-skynet/LocalAI/issues/69 )
- [ ] Enable automatic downloading of models from a curated gallery, with only free-licensed models, directly from the webui.

## 🌟 Star history

[](https://star-history.com/#go-skynet/LocalAI&Date)

## 📖 License

LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).

MIT - Author Ettore Di Giacinto

## Author

Ettore Di Giacinto and others

## 🙇 Acknowledgements

LocalAI couldn't have been built without the help of great software already available from the community. Thank you!

@@ -209,9 +148,12 @@ LocalAI couldn't have been built without the help of great software already avai

- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/ggerganov/whisper.cpp
- https://github.com/saharNooby/rwkv.cpp
- https://github.com/rhasspy/piper
- https://github.com/cmp-nct/ggllm.cpp

## 🤗 Contributors

This is a community project, a special thanks to our contributors! 🤗

<a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
</a>
211
api/api.go
@@ -2,6 +2,16 @@ package api
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/localai"
|
||||
"github.com/go-skynet/LocalAI/api/openai"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/api/schema"
|
||||
"github.com/go-skynet/LocalAI/internal"
|
||||
"github.com/go-skynet/LocalAI/pkg/assets"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/gofiber/fiber/v2/middleware/cors"
|
||||
@@ -11,18 +21,77 @@ import (
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func App(opts ...AppOption) (*fiber.App, error) {
|
||||
options := newOptions(opts...)
|
||||
func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader, error) {
|
||||
options := options.NewOptions(opts...)
|
||||
|
||||
zerolog.SetGlobalLevel(zerolog.InfoLevel)
|
||||
if options.debug {
|
||||
if options.Debug {
|
||||
zerolog.SetGlobalLevel(zerolog.DebugLevel)
|
||||
}
|
||||
|
||||
log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
|
||||
log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
|
||||
|
||||
cl := config.NewConfigLoader()
|
||||
if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
|
||||
log.Error().Msgf("error loading config files: %s", err.Error())
|
||||
}
|
||||
|
||||
if options.ConfigFile != "" {
|
||||
if err := cl.LoadConfigFile(options.ConfigFile); err != nil {
|
||||
log.Error().Msgf("error loading config file: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
if options.Debug {
|
||||
for _, v := range cl.ListConfigs() {
|
||||
cfg, _ := cl.GetConfig(v)
|
||||
log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
|
||||
}
|
||||
}
|
||||
|
||||
if options.AssetsDestination != "" {
|
||||
// Extract files from the embedded FS
|
||||
err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
|
||||
log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
|
||||
if err != nil {
|
||||
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
|
||||
}
|
||||
}
|
||||
|
||||
if options.PreloadJSONModels != "" {
|
||||
if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if options.PreloadModelsFromPath != "" {
|
||||
if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// turn off any process that was started by GRPC if the context is canceled
|
||||
go func() {
|
||||
<-options.Context.Done()
|
||||
log.Debug().Msgf("Context canceled, shutting down")
|
||||
options.Loader.StopAllGRPC()
|
||||
}()
|
||||
|
||||
return options, cl, nil
|
||||
}
|
||||
|
||||
func App(opts ...options.AppOption) (*fiber.App, error) {
|
||||
|
||||
options, cl, err := Startup(opts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed basic startup tasks with error %s", err.Error())
|
||||
}
|
||||
|
||||
// Return errors as JSON responses
|
||||
app := fiber.New(fiber.Config{
|
||||
BodyLimit: options.uploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
|
||||
DisableStartupMessage: options.disableMessage,
|
||||
BodyLimit: options.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
|
||||
DisableStartupMessage: options.DisableMessage,
|
||||
// Override default error handler
|
||||
ErrorHandler: func(ctx *fiber.Ctx, err error) error {
|
||||
// Status code defaults to 500
|
||||
@@ -36,101 +105,106 @@ func App(opts ...AppOption) (*fiber.App, error) {
|
||||
|
||||
// Send custom error page
|
||||
return ctx.Status(code).JSON(
|
||||
ErrorResponse{
|
||||
Error: &APIError{Message: err.Error(), Code: code},
|
||||
schema.ErrorResponse{
|
||||
Error: &schema.APIError{Message: err.Error(), Code: code},
|
||||
},
|
||||
)
|
||||
},
|
||||
})
|
||||
|
||||
if options.debug {
|
||||
if options.Debug {
|
||||
app.Use(logger.New(logger.Config{
|
||||
Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
|
||||
}))
|
||||
}
|
||||
|
||||
cm := NewConfigMerger()
|
||||
if err := cm.LoadConfigs(options.loader.ModelPath); err != nil {
|
||||
log.Error().Msgf("error loading config files: %s", err.Error())
|
||||
}
|
||||
|
||||
if options.configFile != "" {
|
||||
if err := cm.LoadConfigFile(options.configFile); err != nil {
|
||||
log.Error().Msgf("error loading config file: %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
if options.debug {
|
||||
for _, v := range cm.ListConfigs() {
|
||||
cfg, _ := cm.GetConfig(v)
|
||||
log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
|
||||
}
|
||||
}
|
||||
|
||||
if options.assetsDestination != "" {
|
||||
if err := PrepareBackendAssets(options.backendAssets, options.assetsDestination); err != nil {
|
||||
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Default middleware config
|
||||
app.Use(recover.New())
|
||||
|
||||
if options.preloadJSONModels != "" {
|
||||
if err := ApplyGalleryFromString(options.loader.ModelPath, options.preloadJSONModels, cm); err != nil {
|
||||
return nil, err
|
||||
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
|
||||
auth := func(c *fiber.Ctx) error {
|
||||
if len(options.ApiKeys) > 0 {
|
||||
authHeader := c.Get("Authorization")
|
||||
if authHeader == "" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
|
||||
}
|
||||
authHeaderParts := strings.Split(authHeader, " ")
|
||||
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
|
||||
}
|
||||
|
||||
apiKey := authHeaderParts[1]
|
||||
validApiKey := false
|
||||
for _, key := range options.ApiKeys {
|
||||
if apiKey == key {
|
||||
validApiKey = true
|
||||
}
|
||||
}
|
||||
if !validApiKey {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
|
||||
}
|
||||
}
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
if options.preloadModelsFromPath != "" {
|
||||
if err := ApplyGalleryFromFile(options.loader.ModelPath, options.preloadModelsFromPath, cm); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if options.cors {
|
||||
if options.corsAllowOrigins == "" {
|
||||
app.Use(cors.New())
|
||||
if options.CORS {
|
||||
var c func(ctx *fiber.Ctx) error
|
||||
if options.CORSAllowOrigins == "" {
|
||||
c = cors.New()
|
||||
} else {
|
||||
app.Use(cors.New(cors.Config{
|
||||
AllowOrigins: options.corsAllowOrigins,
|
||||
}))
|
||||
c = cors.New(cors.Config{AllowOrigins: options.CORSAllowOrigins})
|
||||
}
|
||||
|
||||
app.Use(c)
|
||||
}
|
||||
|
||||
// LocalAI API endpoints
|
||||
applier := newGalleryApplier(options.loader.ModelPath)
|
||||
applier.start(options.context, cm)
|
||||
app.Post("/models/apply", applyModelGallery(options.loader.ModelPath, cm, applier.C))
|
||||
app.Get("/models/jobs/:uuid", getOpStatus(applier))
|
||||
galleryService := localai.NewGalleryService(options.Loader.ModelPath)
|
||||
galleryService.Start(options.Context, cl)
|
||||
|
||||
app.Get("/version", auth, func(c *fiber.Ctx) error {
|
||||
return c.JSON(struct {
|
||||
Version string `json:"version"`
|
||||
}{Version: internal.PrintableVersion()})
|
||||
})
|
||||
|
||||
app.Post("/models/apply", auth, localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cl, galleryService.C, options.Galleries))
|
||||
app.Get("/models/available", auth, localai.ListModelFromGalleryEndpoint(options.Galleries, options.Loader.ModelPath))
|
||||
app.Get("/models/jobs/:uuid", auth, localai.GetOpStatusEndpoint(galleryService))
|
||||
|
||||
// openAI compatible API endpoint
|
||||
|
||||
// chat
|
||||
app.Post("/v1/chat/completions", chatEndpoint(cm, options))
|
||||
app.Post("/chat/completions", chatEndpoint(cm, options))
|
||||
app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, options))
|
||||
app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, options))
|
||||
|
||||
// edit
|
||||
app.Post("/v1/edits", editEndpoint(cm, options))
|
||||
app.Post("/edits", editEndpoint(cm, options))
|
||||
app.Post("/v1/edits", auth, openai.EditEndpoint(cl, options))
|
||||
app.Post("/edits", auth, openai.EditEndpoint(cl, options))
|
||||
|
||||
// completion
|
||||
app.Post("/v1/completions", completionEndpoint(cm, options))
|
||||
app.Post("/completions", completionEndpoint(cm, options))
|
||||
app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, options))
|
||||
app.Post("/completions", auth, openai.CompletionEndpoint(cl, options))
|
||||
app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, options))
|
||||
|
||||
// embeddings
|
||||
app.Post("/v1/embeddings", embeddingsEndpoint(cm, options))
|
||||
app.Post("/embeddings", embeddingsEndpoint(cm, options))
|
||||
app.Post("/v1/engines/:model/embeddings", embeddingsEndpoint(cm, options))
|
||||
app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
|
||||
app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
|
||||
app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
|
||||
|
||||
// audio
|
||||
app.Post("/v1/audio/transcriptions", transcriptEndpoint(cm, options))
|
||||
app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, options))
|
||||
app.Post("/tts", auth, localai.TTSEndpoint(cl, options))
|
||||
|
||||
// images
|
||||
app.Post("/v1/images/generations", imageEndpoint(cm, options))
|
||||
app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, options))
|
||||
|
||||
if options.imageDir != "" {
|
||||
app.Static("/generated-images", options.imageDir)
|
||||
if options.ImageDir != "" {
|
||||
app.Static("/generated-images", options.ImageDir)
|
||||
}
|
||||
|
||||
if options.AudioDir != "" {
|
||||
app.Static("/generated-audio", options.AudioDir)
|
||||
}
|
||||
|
||||
ok := func(c *fiber.Ctx) error {
|
||||
@@ -141,9 +215,14 @@ func App(opts ...AppOption) (*fiber.App, error) {
|
||||
app.Get("/healthz", ok)
|
||||
app.Get("/readyz", ok)
|
||||
|
||||
// Experimental Backend Statistics Module
|
||||
backendMonitor := localai.NewBackendMonitor(cl, options) // Split out for now
|
||||
app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitor))
|
||||
app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitor))
|
||||
|
||||
// models
|
||||
app.Get("/v1/models", listModels(options.loader, cm))
|
||||
app.Get("/models", listModels(options.loader, cm))
|
||||
app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
|
||||
app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
|
||||
|
||||
return app, nil
|
||||
}
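The auth middleware registered above expects a `Bearer` token whenever API keys are configured; without configured keys, requests pass through unauthenticated. A minimal sketch of calling a protected endpoint (the key value is simply whatever was configured for the deployment):

```bash
# Only needed when LocalAI is started with API keys configured;
# the middleware checks the Bearer token against the configured keys.
curl http://localhost:8080/v1/models \
  -H "Authorization: Bearer $LOCALAI_API_KEY"
```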
|
||||
|
||||
507
api/api_test.go
@@ -5,15 +5,19 @@ import (
|
||||
"context"
|
||||
"embed"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
|
||||
. "github.com/go-skynet/LocalAI/api"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
@@ -21,12 +25,14 @@ import (
|
||||
|
||||
openaigo "github.com/otiai10/openaigo"
|
||||
"github.com/sashabaranov/go-openai"
|
||||
"github.com/sashabaranov/go-openai/jsonschema"
|
||||
)
|
||||
|
||||
type modelApplyRequest struct {
|
||||
URL string `json:"url"`
|
||||
Name string `json:"name"`
|
||||
Overrides map[string]string `json:"overrides"`
|
||||
ID string `json:"id"`
|
||||
URL string `json:"url"`
|
||||
Name string `json:"name"`
|
||||
Overrides map[string]interface{} `json:"overrides"`
|
||||
}
|
||||
|
||||
func getModelStatus(url string) (response map[string]interface{}) {
|
||||
@@ -38,7 +44,7 @@ func getModelStatus(url string) (response map[string]interface{}) {
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
fmt.Println("Error reading response body:", err)
|
||||
return
|
||||
@@ -52,6 +58,15 @@ func getModelStatus(url string) (response map[string]interface{}) {
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func getModels(url string) (response []gallery.GalleryModel) {
|
||||
utils.GetURI(url, func(url string, i []byte) error {
|
||||
// Unmarshal YAML data into a struct
|
||||
return json.Unmarshal(i, &response)
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
func postModelApplyRequest(url string, request modelApplyRequest) (response map[string]interface{}) {
|
||||
|
||||
//url := "http://localhost:AI/models/apply"
|
||||
@@ -81,7 +96,7 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
fmt.Println("Error reading response body:", err)
|
||||
return
|
||||
@@ -109,6 +124,11 @@ var _ = Describe("API test", func() {
|
||||
var cancel context.CancelFunc
|
||||
var tmpdir string
|
||||
|
||||
commonOpts := []options.AppOption{
|
||||
options.WithDebug(true),
|
||||
options.WithDisableMessage(true),
|
||||
}
|
||||
|
||||
Context("API with ephemeral models", func() {
|
||||
BeforeEach(func() {
|
||||
var err error
|
||||
@@ -118,7 +138,35 @@ var _ = Describe("API test", func() {
|
||||
modelLoader = model.NewModelLoader(tmpdir)
|
||||
c, cancel = context.WithCancel(context.Background())
|
||||
|
||||
app, err = App(WithContext(c), WithModelLoader(modelLoader), WithBackendAssets(backendAssets), WithBackendAssetsOutput(tmpdir))
|
||||
g := []gallery.GalleryModel{
|
||||
{
|
||||
Name: "bert",
|
||||
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
|
||||
},
|
||||
{
|
||||
Name: "bert2",
|
||||
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
|
||||
Overrides: map[string]interface{}{"foo": "bar"},
|
||||
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml"}},
|
||||
},
|
||||
}
|
||||
out, err := yaml.Marshal(g)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0644)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
galleries := []gallery.Gallery{
|
||||
{
|
||||
Name: "test",
|
||||
URL: "file://" + filepath.Join(tmpdir, "gallery_simple.yaml"),
|
||||
},
|
||||
}
|
||||
|
||||
app, err = App(
|
||||
append(commonOpts,
|
||||
options.WithContext(c),
|
||||
options.WithGalleries(galleries),
|
||||
options.WithModelLoader(modelLoader), options.WithBackendAssets(backendAssets), options.WithBackendAssetsOutput(tmpdir))...)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
|
||||
@@ -143,11 +191,58 @@ var _ = Describe("API test", func() {
|
||||
})
|
||||
|
||||
Context("Applying models", func() {
|
||||
It("applies models from a gallery", func() {
|
||||
|
||||
models := getModels("http://127.0.0.1:9090/models/available")
|
||||
Expect(len(models)).To(Equal(2), fmt.Sprint(models))
|
||||
Expect(models[0].Installed).To(BeFalse(), fmt.Sprint(models))
|
||||
Expect(models[1].Installed).To(BeFalse(), fmt.Sprint(models))
|
||||
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
ID: "test@bert2",
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
|
||||
uuid := response["uuid"].(string)
|
||||
resp := map[string]interface{}{}
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
fmt.Println(response)
|
||||
resp = response
|
||||
return response["processed"].(bool)
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
Expect(resp["message"]).ToNot(ContainSubstring("error"))
|
||||
|
||||
dat, err := os.ReadFile(filepath.Join(tmpdir, "bert2.yaml"))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
_, err = os.ReadFile(filepath.Join(tmpdir, "foo.yaml"))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
content := map[string]interface{}{}
|
||||
err = yaml.Unmarshal(dat, &content)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(content["backend"]).To(Equal("bert-embeddings"))
|
||||
Expect(content["foo"]).To(Equal("bar"))
|
||||
|
||||
models = getModels("http://127.0.0.1:9090/models/available")
|
||||
Expect(len(models)).To(Equal(2), fmt.Sprint(models))
|
||||
Expect(models[0].Name).To(Or(Equal("bert"), Equal("bert2")))
|
||||
Expect(models[1].Name).To(Or(Equal("bert"), Equal("bert2")))
|
||||
for _, m := range models {
|
||||
if m.Name == "bert2" {
|
||||
Expect(m.Installed).To(BeTrue())
|
||||
} else {
|
||||
Expect(m.Installed).To(BeFalse())
|
||||
}
|
||||
}
|
||||
})
|
||||
It("overrides models", func() {
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
|
||||
Name: "bert",
|
||||
Overrides: map[string]string{
|
||||
Overrides: map[string]interface{}{
|
||||
"backend": "llama",
|
||||
},
|
||||
})
|
||||
@@ -158,9 +253,8 @@ var _ = Describe("API test", func() {
|
||||
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
fmt.Println(response)
|
||||
return response["processed"].(bool)
|
||||
}, "360s").Should(Equal(true))
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
|
||||
dat, err := os.ReadFile(filepath.Join(tmpdir, "bert.yaml"))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
@@ -174,7 +268,7 @@ var _ = Describe("API test", func() {
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
|
||||
Name: "bert",
|
||||
Overrides: map[string]string{},
|
||||
Overrides: map[string]interface{}{},
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
@@ -183,9 +277,8 @@ var _ = Describe("API test", func() {
|
||||
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
fmt.Println(response)
|
||||
return response["processed"].(bool)
|
||||
}, "360s").Should(Equal(true))
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
|
||||
dat, err := os.ReadFile(filepath.Join(tmpdir, "bert.yaml"))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
@@ -195,15 +288,15 @@ var _ = Describe("API test", func() {
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(content["backend"]).To(Equal("bert-embeddings"))
|
||||
})
|
||||
It("runs gpt4all", Label("gpt4all"), func() {
|
||||
|
||||
It("runs openllama", Label("llama"), func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
URL: "github:go-skynet/model-gallery/gpt4all-j.yaml",
|
||||
Name: "gpt4all-j",
|
||||
Overrides: map[string]string{},
|
||||
URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
|
||||
Name: "openllama_3b",
|
||||
Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
@@ -212,15 +305,270 @@ var _ = Describe("API test", func() {
|
||||
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
fmt.Println(response)
|
||||
return response["processed"].(bool)
|
||||
}, "360s").Should(Equal(true))
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
|
||||
By("testing completion")
|
||||
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices)).To(Equal(1))
|
||||
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
|
||||
|
||||
By("testing functions")
|
||||
resp2, err := client.CreateChatCompletion(
|
||||
context.TODO(),
|
||||
openai.ChatCompletionRequest{
|
||||
Model: "openllama_3b",
|
||||
Messages: []openai.ChatCompletionMessage{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "What is the weather like in San Francisco (celsius)?",
|
||||
},
|
||||
},
|
||||
Functions: []openai.FunctionDefinition{
|
||||
openai.FunctionDefinition{
|
||||
Name: "get_current_weather",
|
||||
Description: "Get the current weather",
|
||||
Parameters: jsonschema.Definition{
|
||||
Type: jsonschema.Object,
|
||||
Properties: map[string]jsonschema.Definition{
|
||||
"location": {
|
||||
Type: jsonschema.String,
|
||||
Description: "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"unit": {
|
||||
Type: jsonschema.String,
|
||||
Enum: []string{"celcius", "fahrenheit"},
|
||||
},
|
||||
},
|
||||
Required: []string{"location"},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp2.Choices)).To(Equal(1))
|
||||
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
|
||||
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
|
||||
|
||||
var res map[string]string
|
||||
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(res["location"]).To(Equal("San Francisco, California, United States"), fmt.Sprint(res))
|
||||
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
|
||||
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
|
||||
})
|
||||
|
||||
It("runs openllama gguf", Label("llama-gguf"), func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
URL: "github:go-skynet/model-gallery/openllama-3b-gguf.yaml",
|
||||
Name: "openllama_3b_gguf",
|
||||
Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
|
||||
uuid := response["uuid"].(string)
|
||||
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
return response["processed"].(bool)
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
|
||||
By("testing completion")
|
||||
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b_gguf", Prompt: "Count up to five: one, two, three, four, "})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices)).To(Equal(1))
|
||||
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
|
||||
|
||||
By("testing functions")
|
||||
resp2, err := client.CreateChatCompletion(
|
||||
context.TODO(),
|
||||
openai.ChatCompletionRequest{
|
||||
Model: "openllama_3b_gguf",
|
||||
Messages: []openai.ChatCompletionMessage{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "What is the weather like in San Francisco (celsius)?",
|
||||
},
|
||||
},
|
||||
Functions: []openai.FunctionDefinition{
|
||||
openai.FunctionDefinition{
|
||||
Name: "get_current_weather",
|
||||
Description: "Get the current weather",
|
||||
Parameters: jsonschema.Definition{
|
||||
Type: jsonschema.Object,
|
||||
Properties: map[string]jsonschema.Definition{
|
||||
"location": {
|
||||
Type: jsonschema.String,
|
||||
Description: "The city and state, e.g. San Francisco, CA",
|
||||
},
|
||||
"unit": {
|
||||
Type: jsonschema.String,
|
||||
Enum: []string{"celcius", "fahrenheit"},
|
||||
},
|
||||
},
|
||||
Required: []string{"location"},
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp2.Choices)).To(Equal(1))
|
||||
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
|
||||
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
|
||||
|
||||
var res map[string]string
|
||||
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(res["location"]).To(Equal("San Francisco, California"), fmt.Sprint(res))
|
||||
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
|
||||
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
|
||||
})
|
||||
|
||||
It("runs gpt4all", Label("gpt4all"), func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
URL: "github:go-skynet/model-gallery/gpt4all-j.yaml",
|
||||
Name: "gpt4all-j",
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
|
||||
uuid := response["uuid"].(string)
|
||||
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
return response["processed"].(bool)
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
|
||||
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-j", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "How are you?"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices)).To(Equal(1))
|
||||
Expect(resp.Choices[0].Message.Content).To(ContainSubstring("well"))
|
||||
})
|
||||
|
||||
})
|
||||
})
|
||||
|
||||
Context("Model gallery", func() {
|
||||
BeforeEach(func() {
|
||||
var err error
|
||||
tmpdir, err = os.MkdirTemp("", "")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
modelLoader = model.NewModelLoader(tmpdir)
|
||||
c, cancel = context.WithCancel(context.Background())
|
||||
|
||||
galleries := []gallery.Gallery{
|
||||
{
|
||||
Name: "model-gallery",
|
||||
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/index.yaml",
|
||||
},
|
||||
}
|
||||
|
||||
app, err = App(
|
||||
append(commonOpts,
|
||||
options.WithContext(c),
|
||||
options.WithAudioDir(tmpdir),
|
||||
options.WithImageDir(tmpdir),
|
||||
options.WithGalleries(galleries),
|
||||
options.WithModelLoader(modelLoader),
|
||||
options.WithBackendAssets(backendAssets),
|
||||
options.WithBackendAssetsOutput(tmpdir))...,
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
|
||||
defaultConfig := openai.DefaultConfig("")
|
||||
defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
|
||||
|
||||
client2 = openaigo.NewClient("")
|
||||
client2.BaseURL = defaultConfig.BaseURL
|
||||
|
||||
// Wait for API to be ready
|
||||
client = openai.NewClientWithConfig(defaultConfig)
|
||||
Eventually(func() error {
|
||||
_, err := client.ListModels(context.TODO())
|
||||
return err
|
||||
}, "2m").ShouldNot(HaveOccurred())
|
||||
})
|
||||
|
||||
AfterEach(func() {
|
||||
cancel()
|
||||
app.Shutdown()
|
||||
os.RemoveAll(tmpdir)
|
||||
})
|
||||
It("installs and is capable to run tts", Label("tts"), func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
ID: "model-gallery@voice-en-us-kathleen-low",
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
|
||||
uuid := response["uuid"].(string)
|
||||
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
fmt.Println(response)
|
||||
return response["processed"].(bool)
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
|
||||
// An HTTP Post to the /tts endpoint should return a wav audio file
|
||||
resp, err := http.Post("http://127.0.0.1:9090/tts", "application/json", bytes.NewBuffer([]byte(`{"input": "Hello world", "model": "en-us-kathleen-low.onnx"}`)))
|
||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
|
||||
dat, err := io.ReadAll(resp.Body)
|
||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
|
||||
|
||||
Expect(resp.StatusCode).To(Equal(200), fmt.Sprint(string(dat)))
|
||||
Expect(resp.Header.Get("Content-Type")).To(Equal("audio/x-wav"))
|
||||
})
|
||||
It("installs and is capable to generate images", Label("stablediffusion"), func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
|
||||
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
|
||||
ID: "model-gallery@stablediffusion",
|
||||
Overrides: map[string]interface{}{
|
||||
"parameters": map[string]interface{}{"model": "stablediffusion_assets"},
|
||||
},
|
||||
})
|
||||
|
||||
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
|
||||
|
||||
uuid := response["uuid"].(string)
|
||||
|
||||
Eventually(func() bool {
|
||||
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
|
||||
fmt.Println(response)
|
||||
return response["processed"].(bool)
|
||||
}, "360s", "10s").Should(Equal(true))
|
||||
|
||||
resp, err := http.Post(
|
||||
"http://127.0.0.1:9090/v1/images/generations",
|
||||
"application/json",
|
||||
bytes.NewBuffer([]byte(`{
|
||||
"prompt": "floating hair, portrait, ((loli)), ((one girl)), cute face, hidden hands, asymmetrical bangs, beautiful detailed eyes, eye shadow, hair ornament, ribbons, bowties, buttons, pleated skirt, (((masterpiece))), ((best quality)), colorful|((part of the head)), ((((mutated hands and fingers)))), deformed, blurry, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, poorly drawn hands, missing limb, blurry, floating limbs, disconnected limbs, malformed hands, blur, out of focus, long neck, long body, Octane renderer, lowres, bad anatomy, bad hands, text",
|
||||
"mode": 2, "seed":9000,
|
||||
"size": "256x256", "n":2}`)))
|
||||
// The response should contain an URL
|
||||
Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
|
||||
dat, err := io.ReadAll(resp.Body)
|
||||
Expect(err).ToNot(HaveOccurred(), string(dat))
|
||||
Expect(string(dat)).To(ContainSubstring("http://127.0.0.1:9090/"), string(dat))
|
||||
Expect(string(dat)).To(ContainSubstring(".png"), string(dat))
|
||||
|
||||
})
|
||||
})
|
||||
|
||||
@@ -230,7 +578,12 @@ var _ = Describe("API test", func() {
|
||||
c, cancel = context.WithCancel(context.Background())
|
||||
|
||||
var err error
|
||||
app, err = App(WithContext(c), WithModelLoader(modelLoader))
|
||||
app, err = App(
|
||||
append(commonOpts,
|
||||
options.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
|
||||
options.WithContext(c),
|
||||
options.WithModelLoader(modelLoader),
|
||||
)...)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
|
||||
@@ -254,7 +607,7 @@ var _ = Describe("API test", func() {
|
||||
It("returns the models list", func() {
|
||||
models, err := client.ListModels(context.TODO())
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(models.Models)).To(Equal(10))
|
||||
Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
|
||||
})
|
||||
It("can generate completions", func() {
|
||||
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: "abcdedfghikl"})
|
||||
@@ -285,9 +638,10 @@ var _ = Describe("API test", func() {
|
||||
})
|
||||
|
||||
It("returns errors", func() {
|
||||
backends := len(model.AutoLoadBackends) + 1 // +1 for huggingface
|
||||
_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 11 errors occurred:"))
|
||||
Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("error, status code: 500, message: could not load model - all backends returned error: %d errors occurred:", backends)))
|
||||
})
|
||||
It("transcribes audio", func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
@@ -331,15 +685,98 @@ var _ = Describe("API test", func() {
|
||||
Expect(resp2.Data[0].Embedding).To(Equal(sunEmbedding))
|
||||
})
|
||||
|
||||
Context("External gRPC calls", func() {
|
||||
It("calculate embeddings with huggingface", func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
resp, err := client.CreateEmbeddings(
|
||||
context.Background(),
|
||||
openai.EmbeddingRequest{
|
||||
Model: openai.AdaCodeSearchCode,
|
||||
Input: []string{"sun", "cat"},
|
||||
},
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
|
||||
Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))
|
||||
|
||||
sunEmbedding := resp.Data[0].Embedding
|
||||
resp2, err := client.CreateEmbeddings(
|
||||
context.Background(),
|
||||
openai.EmbeddingRequest{
|
||||
Model: openai.AdaCodeSearchCode,
|
||||
Input: []string{"sun"},
|
||||
},
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(resp2.Data[0].Embedding).To(Equal(sunEmbedding))
|
||||
Expect(resp2.Data[0].Embedding).ToNot(Equal(resp.Data[1].Embedding))
|
||||
})
|
||||
})
|
||||
|
||||
Context("backends", func() {
|
||||
It("runs rwkv", func() {
|
||||
It("runs rwkv completion", func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices) > 0).To(BeTrue())
|
||||
Expect(resp.Choices[0].Text).To(Equal(" five."))
|
||||
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
|
||||
|
||||
stream, err := client.CreateCompletionStream(context.TODO(), openai.CompletionRequest{
|
||||
Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,", Stream: true,
|
||||
})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
defer stream.Close()
|
||||
|
||||
tokens := 0
|
||||
text := ""
|
||||
for {
|
||||
response, err := stream.Recv()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
text += response.Choices[0].Text
|
||||
tokens++
|
||||
}
|
||||
Expect(text).ToNot(BeEmpty())
|
||||
Expect(text).To(ContainSubstring("five"))
|
||||
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
|
||||
})
|
||||
It("runs rwkv chat completion", func() {
|
||||
if runtime.GOOS != "linux" {
|
||||
Skip("test supported only on linux")
|
||||
}
|
||||
resp, err := client.CreateChatCompletion(context.TODO(),
|
||||
openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices) > 0).To(BeTrue())
|
||||
Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
|
||||
|
||||
stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
defer stream.Close()
|
||||
|
||||
tokens := 0
|
||||
text := ""
|
||||
for {
|
||||
response, err := stream.Recv()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
text += response.Choices[0].Delta.Content
|
||||
tokens++
|
||||
}
|
||||
Expect(text).ToNot(BeEmpty())
|
||||
Expect(text).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
|
||||
|
||||
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -350,7 +787,12 @@ var _ = Describe("API test", func() {
|
||||
c, cancel = context.WithCancel(context.Background())
|
||||
|
||||
var err error
|
||||
app, err = App(WithContext(c), WithModelLoader(modelLoader), WithConfigFile(os.Getenv("CONFIG_FILE")))
|
||||
app, err = App(
|
||||
append(commonOpts,
|
||||
options.WithContext(c),
|
||||
options.WithModelLoader(modelLoader),
|
||||
options.WithConfigFile(os.Getenv("CONFIG_FILE")))...,
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
|
||||
@@ -369,19 +811,14 @@ var _ = Describe("API test", func() {
|
||||
cancel()
|
||||
app.Shutdown()
|
||||
})
|
||||
It("can generate chat completions from config file", func() {
|
||||
models, err := client.ListModels(context.TODO())
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(models.Models)).To(Equal(12))
|
||||
})
|
||||
It("can generate chat completions from config file", func() {
|
||||
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
|
||||
It("can generate chat completions from config file (list1)", func() {
|
||||
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices)).To(Equal(1))
|
||||
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
|
||||
})
|
||||
It("can generate chat completions from config file", func() {
|
||||
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
|
||||
It("can generate chat completions from config file (list2)", func() {
|
||||
resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices)).To(Equal(1))
|
||||
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
|
||||
|
||||
92
api/backend/embeddings.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.Config, o *options.Option) (func() ([]float32, error), error) {
|
||||
if !c.Embeddings {
|
||||
return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
|
||||
}
|
||||
|
||||
modelFile := c.Model
|
||||
|
||||
grpcOpts := gRPCModelOpts(c)
|
||||
|
||||
var inferenceModel interface{}
|
||||
var err error
|
||||
|
||||
opts := modelOpts(c, o, []model.Option{
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
model.WithThreads(uint32(c.Threads)),
|
||||
model.WithAssetDir(o.AssetsDestination),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(o.Context),
|
||||
})
|
||||
|
||||
if c.Backend == "" {
|
||||
inferenceModel, err = loader.GreedyLoader(opts...)
|
||||
} else {
|
||||
opts = append(opts, model.WithBackendString(c.Backend))
|
||||
inferenceModel, err = loader.BackendLoader(opts...)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var fn func() ([]float32, error)
|
||||
switch model := inferenceModel.(type) {
|
||||
case *grpc.Client:
|
||||
fn = func() ([]float32, error) {
|
||||
predictOptions := gRPCPredictOpts(c, loader.ModelPath)
|
||||
if len(tokens) > 0 {
|
||||
embeds := []int32{}
|
||||
|
||||
for _, t := range tokens {
|
||||
embeds = append(embeds, int32(t))
|
||||
}
|
||||
predictOptions.EmbeddingTokens = embeds
|
||||
|
||||
res, err := model.Embeddings(o.Context, predictOptions)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return res.Embeddings, nil
|
||||
}
|
||||
predictOptions.Embeddings = s
|
||||
|
||||
res, err := model.Embeddings(o.Context, predictOptions)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return res.Embeddings, nil
|
||||
}
|
||||
default:
|
||||
fn = func() ([]float32, error) {
|
||||
return nil, fmt.Errorf("embeddings not supported by the backend")
|
||||
}
|
||||
}
|
||||
|
||||
return func() ([]float32, error) {
|
||||
embeds, err := fn()
|
||||
if err != nil {
|
||||
return embeds, err
|
||||
}
|
||||
// Remove trailing 0s
|
||||
for i := len(embeds) - 1; i >= 0; i-- {
|
||||
if embeds[i] == 0.0 {
|
||||
embeds = embeds[:i]
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
return embeds, nil
|
||||
}, nil
|
||||
}
|
||||
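ModelEmbedding does not run the backend immediately: it returns a closure, and the actual gRPC embedding call only happens when that closure is invoked. A minimal caller sketch follows; the loader, cfg and opts values (and the package imports/aliases) are assumed to be set up by the API layer as in the surrounding files, and the input string is only illustrative.

```go
// Sketch only: loader, cfg and opts are assumed to be built by the API layer.
func embedText(loader *model.ModelLoader, cfg config.Config, opts *options.Option) ([]float32, error) {
	embedFn, err := backend.ModelEmbedding("hello world", nil, loader, cfg, opts)
	if err != nil {
		return nil, err
	}
	// The backend is only queried here; trailing zeros are already stripped.
	return embedFn()
}
```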
57
api/backend/image.go
Normal file
@@ -0,0 +1,57 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
|
||||
|
||||
opts := modelOpts(c, o, []model.Option{
|
||||
model.WithBackendString(c.Backend),
|
||||
model.WithAssetDir(o.AssetsDestination),
|
||||
model.WithThreads(uint32(c.Threads)),
|
||||
model.WithContext(o.Context),
|
||||
model.WithModel(c.Model),
|
||||
model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
|
||||
CUDA: c.Diffusers.CUDA,
|
||||
SchedulerType: c.Diffusers.SchedulerType,
|
||||
PipelineType: c.Diffusers.PipelineType,
|
||||
CFGScale: c.Diffusers.CFGScale,
|
||||
IMG2IMG: c.Diffusers.IMG2IMG,
|
||||
CLIPModel: c.Diffusers.ClipModel,
|
||||
CLIPSubfolder: c.Diffusers.ClipSubFolder,
|
||||
CLIPSkip: int32(c.Diffusers.ClipSkip),
|
||||
}),
|
||||
})
|
||||
|
||||
inferenceModel, err := loader.BackendLoader(
|
||||
opts...,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fn := func() error {
|
||||
_, err := inferenceModel.GenerateImage(
|
||||
o.Context,
|
||||
&proto.GenerateImageRequest{
|
||||
Height: int32(height),
|
||||
Width: int32(width),
|
||||
Mode: int32(mode),
|
||||
Step: int32(step),
|
||||
Seed: int32(seed),
|
||||
CLIPSkip: int32(c.Diffusers.ClipSkip),
|
||||
PositivePrompt: positive_prompt,
|
||||
NegativePrompt: negative_prompt,
|
||||
Dst: dst,
|
||||
Src: src,
|
||||
EnableParameters: c.Diffusers.EnableParameters,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
return fn, nil
|
||||
}
|
||||
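ImageGeneration follows the same deferred pattern: the model is loaded up front and the image is produced only when the returned closure runs. A hedged sketch is below; dst, the prompts and the 512x512/mode/step/seed values are illustrative, while loader, cfg and opts again come from the API layer.

```go
// Sketch only: generate a single image into dst.
func generateImage(loader *model.ModelLoader, cfg config.Config, opts *options.Option, dst string) error {
	fn, err := backend.ImageGeneration(512, 512, 0, cfg.Step, cfg.Seed,
		"a photo of a cat", "", "", dst, loader, cfg, opts)
	if err != nil {
		return err
	}
	return fn() // the diffusers backend renders the image at this point
}
```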
148
api/backend/llm.go
Normal file
@@ -0,0 +1,148 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
"github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
)
|
||||
|
||||
type LLMResponse struct {
|
||||
Response string // should this be []byte?
|
||||
Usage TokenUsage
|
||||
}
|
||||
|
||||
type TokenUsage struct {
|
||||
Prompt int
|
||||
Completion int
|
||||
}
|
||||
|
||||
func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||||
modelFile := c.Model
|
||||
|
||||
grpcOpts := gRPCModelOpts(c)
|
||||
|
||||
var inferenceModel *grpc.Client
|
||||
var err error
|
||||
|
||||
opts := modelOpts(c, o, []model.Option{
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
model.WithThreads(uint32(c.Threads)), // some models use this to allocate threads during startup
|
||||
model.WithAssetDir(o.AssetsDestination),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(o.Context),
|
||||
})
|
||||
|
||||
if c.Backend != "" {
|
||||
opts = append(opts, model.WithBackendString(c.Backend))
|
||||
}
|
||||
|
||||
// Check if the modelFile exists; if it doesn't, try to load it from the gallery
|
||||
if o.AutoloadGalleries { // experimental
|
||||
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
|
||||
utils.ResetDownloadTimers()
|
||||
// if we failed to load the model, we try to download it
|
||||
err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if c.Backend == "" {
|
||||
inferenceModel, err = loader.GreedyLoader(opts...)
|
||||
} else {
|
||||
inferenceModel, err = loader.BackendLoader(opts...)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// in gRPC, the backend is supposed to answer with a single token if streaming is not supported
|
||||
fn := func() (LLMResponse, error) {
|
||||
opts := gRPCPredictOpts(c, loader.ModelPath)
|
||||
opts.Prompt = s
|
||||
|
||||
tokenUsage := TokenUsage{}
|
||||
|
||||
// check the per-model feature flag for usage, since tokenCallback may have a cost.
|
||||
// Defaults to off as for now it is still experimental
|
||||
if c.FeatureFlag.Enabled("usage") {
|
||||
userTokenCallback := tokenCallback
|
||||
if userTokenCallback == nil {
|
||||
userTokenCallback = func(token string, usage TokenUsage) bool {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
|
||||
if pErr == nil && promptInfo.Length > 0 {
|
||||
tokenUsage.Prompt = int(promptInfo.Length)
|
||||
}
|
||||
|
||||
tokenCallback = func(token string, usage TokenUsage) bool {
|
||||
tokenUsage.Completion++
|
||||
return userTokenCallback(token, tokenUsage)
|
||||
}
|
||||
}
|
||||
|
||||
if tokenCallback != nil {
|
||||
ss := ""
|
||||
err := inferenceModel.PredictStream(ctx, opts, func(s []byte) {
|
||||
tokenCallback(string(s), tokenUsage)
|
||||
ss += string(s)
|
||||
})
|
||||
return LLMResponse{
|
||||
Response: ss,
|
||||
Usage: tokenUsage,
|
||||
}, err
|
||||
} else {
|
||||
// TODO: Is the chicken bit the only way to get here? is that acceptable?
|
||||
reply, err := inferenceModel.Predict(ctx, opts)
|
||||
if err != nil {
|
||||
return LLMResponse{}, err
|
||||
}
|
||||
return LLMResponse{
|
||||
Response: string(reply.Message),
|
||||
Usage: tokenUsage,
|
||||
}, err
|
||||
}
|
||||
}
|
||||
|
||||
return fn, nil
|
||||
}
|
||||
|
||||
var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
|
||||
var mu sync.Mutex = sync.Mutex{}
|
||||
|
||||
func Finetune(config config.Config, input, prediction string) string {
|
||||
if config.Echo {
|
||||
prediction = input + prediction
|
||||
}
|
||||
|
||||
for _, c := range config.Cutstrings {
|
||||
mu.Lock()
|
||||
reg, ok := cutstrings[c]
|
||||
if !ok {
|
||||
cutstrings[c] = regexp.MustCompile(c)
|
||||
reg = cutstrings[c]
|
||||
}
|
||||
mu.Unlock()
|
||||
prediction = reg.ReplaceAllString(prediction, "")
|
||||
}
|
||||
|
||||
for _, c := range config.TrimSpace {
|
||||
prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
|
||||
}
|
||||
return prediction
|
||||
|
||||
}
|
||||
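ModelInference also hands back a closure; when a token callback is supplied the prediction is streamed, otherwise a single Predict call is made. The sketch below streams tokens and then applies Finetune to honour the model's cutstrings/trimspace rules; loader, cfg, opts and the prompt are assumptions made for illustration.

```go
// Sketch only: stream a completion and post-process it.
func streamCompletion(ctx context.Context, loader *model.ModelLoader, cfg config.Config, opts *options.Option) (string, error) {
	prompt := "Once upon a time"
	predict, err := backend.ModelInference(ctx, prompt, loader, cfg, opts,
		func(token string, usage backend.TokenUsage) bool {
			fmt.Print(token) // called for every streamed token
			return true
		})
	if err != nil {
		return "", err
	}
	resp, err := predict()
	if err != nil {
		return "", err
	}
	// Apply the per-model cutstrings/trimspace post-processing.
	return backend.Finetune(cfg, prompt, resp.Response), nil
}
```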
112
api/backend/options.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
)
|
||||
|
||||
func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.Option {
|
||||
if o.SingleBackend {
|
||||
opts = append(opts, model.WithSingleActiveBackend())
|
||||
}
|
||||
|
||||
if c.GRPC.Attempts != 0 {
|
||||
opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
|
||||
}
|
||||
|
||||
if c.GRPC.AttemptsSleepTime != 0 {
|
||||
opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
|
||||
}
|
||||
|
||||
for k, v := range o.ExternalGRPCBackends {
|
||||
opts = append(opts, model.WithExternalBackend(k, v))
|
||||
}
|
||||
|
||||
return opts
|
||||
}
|
||||
|
||||
func gRPCModelOpts(c config.Config) *pb.ModelOptions {
|
||||
b := 512
|
||||
if c.Batch != 0 {
|
||||
b = c.Batch
|
||||
}
|
||||
|
||||
return &pb.ModelOptions{
|
||||
ContextSize: int32(c.ContextSize),
|
||||
Seed: int32(c.Seed),
|
||||
NBatch: int32(b),
|
||||
NoMulMatQ: c.NoMulMatQ,
|
||||
LoraAdapter: c.LoraAdapter,
|
||||
LoraBase: c.LoraBase,
|
||||
NGQA: c.NGQA,
|
||||
RMSNormEps: c.RMSNormEps,
|
||||
F16Memory: c.F16,
|
||||
MLock: c.MMlock,
|
||||
RopeFreqBase: c.RopeFreqBase,
|
||||
RopeFreqScale: c.RopeFreqScale,
|
||||
NUMA: c.NUMA,
|
||||
Embeddings: c.Embeddings,
|
||||
LowVRAM: c.LowVRAM,
|
||||
NGPULayers: int32(c.NGPULayers),
|
||||
MMap: c.MMap,
|
||||
MainGPU: c.MainGPU,
|
||||
Threads: int32(c.Threads),
|
||||
TensorSplit: c.TensorSplit,
|
||||
// AutoGPTQ
|
||||
ModelBaseName: c.AutoGPTQ.ModelBaseName,
|
||||
Device: c.AutoGPTQ.Device,
|
||||
UseTriton: c.AutoGPTQ.Triton,
|
||||
UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
|
||||
// RWKV
|
||||
Tokenizer: c.Tokenizer,
|
||||
}
|
||||
}
|
||||
|
||||
func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
|
||||
promptCachePath := ""
|
||||
if c.PromptCachePath != "" {
|
||||
p := filepath.Join(modelPath, c.PromptCachePath)
|
||||
os.MkdirAll(filepath.Dir(p), 0755)
|
||||
promptCachePath = p
|
||||
}
|
||||
return &pb.PredictOptions{
|
||||
Temperature: float32(c.Temperature),
|
||||
TopP: float32(c.TopP),
|
||||
TopK: int32(c.TopK),
|
||||
Tokens: int32(c.Maxtokens),
|
||||
Threads: int32(c.Threads),
|
||||
PromptCacheAll: c.PromptCacheAll,
|
||||
PromptCacheRO: c.PromptCacheRO,
|
||||
PromptCachePath: promptCachePath,
|
||||
F16KV: c.F16,
|
||||
DebugMode: c.Debug,
|
||||
Grammar: c.Grammar,
|
||||
NegativePromptScale: c.NegativePromptScale,
|
||||
RopeFreqBase: c.RopeFreqBase,
|
||||
RopeFreqScale: c.RopeFreqScale,
|
||||
NegativePrompt: c.NegativePrompt,
|
||||
Mirostat: int32(c.LLMConfig.Mirostat),
|
||||
MirostatETA: float32(c.LLMConfig.MirostatETA),
|
||||
MirostatTAU: float32(c.LLMConfig.MirostatTAU),
|
||||
Debug: c.Debug,
|
||||
StopPrompts: c.StopWords,
|
||||
Repeat: int32(c.RepeatPenalty),
|
||||
NKeep: int32(c.Keep),
|
||||
Batch: int32(c.Batch),
|
||||
IgnoreEOS: c.IgnoreEOS,
|
||||
Seed: int32(c.Seed),
|
||||
FrequencyPenalty: float32(c.FrequencyPenalty),
|
||||
MLock: c.MMlock,
|
||||
MMap: c.MMap,
|
||||
MainGPU: c.MainGPU,
|
||||
TensorSplit: c.TensorSplit,
|
||||
TailFreeSamplingZ: float32(c.TFZ),
|
||||
TypicalP: float32(c.TypicalP),
|
||||
}
|
||||
}
|
||||
39
api/backend/transcript.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/schema"
|
||||
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*schema.Result, error) {
|
||||
|
||||
opts := modelOpts(c, o, []model.Option{
|
||||
model.WithBackendString(model.WhisperBackend),
|
||||
model.WithModel(c.Model),
|
||||
model.WithContext(o.Context),
|
||||
model.WithThreads(uint32(c.Threads)),
|
||||
model.WithAssetDir(o.AssetsDestination),
|
||||
})
|
||||
|
||||
whisperModel, err := o.Loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if whisperModel == nil {
|
||||
return nil, fmt.Errorf("could not load whisper model")
|
||||
}
|
||||
|
||||
return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
|
||||
Dst: audio,
|
||||
Language: language,
|
||||
Threads: uint32(c.Threads),
|
||||
})
|
||||
}
|
||||
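ModelTranscription is synchronous: it forces the whisper backend, loads the model and returns the transcription result directly. A sketch (the audio path and language are illustrative; the loader argument is required by the signature even though the function resolves the backend through o.Loader):

```go
// Sketch only: transcribe a local wav file.
func transcribe(loader *model.ModelLoader, cfg config.Config, opts *options.Option) error {
	res, err := backend.ModelTranscription("/tmp/audio.wav", "en", loader, cfg, opts)
	if err != nil {
		return err
	}
	fmt.Printf("%+v\n", res) // schema.Result carrying the transcription
	return nil
}
```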
75
api/backend/tts.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
api_config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
)
|
||||
|
||||
func generateUniqueFileName(dir, baseName, ext string) string {
|
||||
counter := 1
|
||||
fileName := baseName + ext
|
||||
|
||||
for {
|
||||
filePath := filepath.Join(dir, fileName)
|
||||
_, err := os.Stat(filePath)
|
||||
if os.IsNotExist(err) {
|
||||
return fileName
|
||||
}
|
||||
|
||||
counter++
|
||||
fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
|
||||
}
|
||||
}
|
||||
|
||||
func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option) (string, *proto.Result, error) {
|
||||
bb := backend
|
||||
if bb == "" {
|
||||
bb = model.PiperBackend
|
||||
}
|
||||
opts := modelOpts(api_config.Config{}, o, []model.Option{
|
||||
model.WithBackendString(bb),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(o.Context),
|
||||
model.WithAssetDir(o.AssetsDestination),
|
||||
})
|
||||
piperModel, err := o.Loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
if piperModel == nil {
|
||||
return "", nil, fmt.Errorf("could not load piper model")
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(o.AudioDir, 0755); err != nil {
|
||||
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
|
||||
}
|
||||
|
||||
fileName := generateUniqueFileName(o.AudioDir, "piper", ".wav")
|
||||
filePath := filepath.Join(o.AudioDir, fileName)
|
||||
|
||||
// If the model file is not empty, we pass it joined with the model path
|
||||
modelPath := ""
|
||||
if modelFile != "" {
|
||||
modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
|
||||
if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
}
|
||||
|
||||
res, err := piperModel.TTS(context.Background(), &proto.TTSRequest{
|
||||
Text: text,
|
||||
Model: modelPath,
|
||||
Dst: filePath,
|
||||
})
|
||||
|
||||
return filePath, res, err
|
||||
}
|
||||
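ModelTTS writes a uniquely named wav file under o.AudioDir and returns its path. In the sketch below the voice model file name is purely hypothetical, and passing an empty backend string falls back to the piper backend, as in the code above.

```go
// Sketch only: synthesize speech and return the generated file path.
func synthesize(loader *model.ModelLoader, opts *options.Option) (string, error) {
	filePath, _, err := backend.ModelTTS("", "Hello from LocalAI", "voice-en-us.onnx", loader, opts)
	if err != nil {
		return "", err
	}
	return filePath, nil // e.g. <audio_dir>/piper.wav, piper_2.wav, ...
}
```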
@@ -1,27 +0,0 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/go-skynet/LocalAI/pkg/assets"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func PrepareBackendAssets(backendAssets embed.FS, dst string) error {
|
||||
|
||||
// Extract files from the embedded FS
|
||||
err := assets.ExtractFiles(backendAssets, dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Set GPT4ALL libs where we extracted the files
|
||||
// https://github.com/nomic-ai/gpt4all/commit/27e80e1d10985490c9fd4214e4bf458cfcf70896
|
||||
gpt4alldir := filepath.Join(dst, "backend-assets", "gpt4all")
|
||||
os.Setenv("GPT4ALL_IMPLEMENTATIONS_PATH", gpt4alldir)
|
||||
log.Debug().Msgf("GPT4ALL_IMPLEMENTATIONS_PATH: %s", gpt4alldir)
|
||||
|
||||
return nil
|
||||
}
|
||||
340
api/config.go
@@ -1,340 +0,0 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
type Config struct {
|
||||
OpenAIRequest `yaml:"parameters"`
|
||||
Name string `yaml:"name"`
|
||||
StopWords []string `yaml:"stopwords"`
|
||||
Cutstrings []string `yaml:"cutstrings"`
|
||||
TrimSpace []string `yaml:"trimspace"`
|
||||
ContextSize int `yaml:"context_size"`
|
||||
F16 bool `yaml:"f16"`
|
||||
Threads int `yaml:"threads"`
|
||||
Debug bool `yaml:"debug"`
|
||||
Roles map[string]string `yaml:"roles"`
|
||||
Embeddings bool `yaml:"embeddings"`
|
||||
Backend string `yaml:"backend"`
|
||||
TemplateConfig TemplateConfig `yaml:"template"`
|
||||
MirostatETA float64 `yaml:"mirostat_eta"`
|
||||
MirostatTAU float64 `yaml:"mirostat_tau"`
|
||||
Mirostat int `yaml:"mirostat"`
|
||||
NGPULayers int `yaml:"gpu_layers"`
|
||||
ImageGenerationAssets string `yaml:"asset_dir"`
|
||||
|
||||
PromptCachePath string `yaml:"prompt_cache_path"`
|
||||
PromptCacheAll bool `yaml:"prompt_cache_all"`
|
||||
|
||||
PromptStrings, InputStrings []string
|
||||
InputToken [][]int
|
||||
}
|
||||
|
||||
type TemplateConfig struct {
|
||||
Completion string `yaml:"completion"`
|
||||
Chat string `yaml:"chat"`
|
||||
Edit string `yaml:"edit"`
|
||||
}
|
||||
|
||||
type ConfigMerger struct {
|
||||
configs map[string]Config
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
func NewConfigMerger() *ConfigMerger {
|
||||
return &ConfigMerger{
|
||||
configs: make(map[string]Config),
|
||||
}
|
||||
}
|
||||
func ReadConfigFile(file string) ([]*Config, error) {
|
||||
c := &[]*Config{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
if err := yaml.Unmarshal(f, c); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
|
||||
}
|
||||
|
||||
return *c, nil
|
||||
}
|
||||
|
||||
func ReadConfig(file string) (*Config, error) {
|
||||
c := &Config{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
if err := yaml.Unmarshal(f, c); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
|
||||
}
|
||||
|
||||
return c, nil
|
||||
}
|
||||
|
||||
func (cm ConfigMerger) LoadConfigFile(file string) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
c, err := ReadConfigFile(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot load config file: %w", err)
|
||||
}
|
||||
|
||||
for _, cc := range c {
|
||||
cm.configs[cc.Name] = *cc
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cm ConfigMerger) LoadConfig(file string) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
c, err := ReadConfig(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
|
||||
cm.configs[c.Name] = *c
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cm ConfigMerger) GetConfig(m string) (Config, bool) {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
v, exists := cm.configs[m]
|
||||
return v, exists
|
||||
}
|
||||
|
||||
func (cm ConfigMerger) ListConfigs() []string {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
var res []string
|
||||
for k := range cm.configs {
|
||||
res = append(res, k)
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func (cm ConfigMerger) LoadConfigs(path string) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
entries, err := os.ReadDir(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files := make([]fs.FileInfo, 0, len(entries))
|
||||
for _, entry := range entries {
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files = append(files, info)
|
||||
}
|
||||
for _, file := range files {
|
||||
// Skip templates, .keep files and anything else that is not a YAML config
|
||||
if !strings.Contains(file.Name(), ".yaml") {
|
||||
continue
|
||||
}
|
||||
c, err := ReadConfig(filepath.Join(path, file.Name()))
|
||||
if err == nil {
|
||||
cm.configs[c.Name] = *c
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func updateConfig(config *Config, input *OpenAIRequest) {
|
||||
if input.Echo {
|
||||
config.Echo = input.Echo
|
||||
}
|
||||
if input.TopK != 0 {
|
||||
config.TopK = input.TopK
|
||||
}
|
||||
if input.TopP != 0 {
|
||||
config.TopP = input.TopP
|
||||
}
|
||||
|
||||
if input.Temperature != 0 {
|
||||
config.Temperature = input.Temperature
|
||||
}
|
||||
|
||||
if input.Maxtokens != 0 {
|
||||
config.Maxtokens = input.Maxtokens
|
||||
}
|
||||
|
||||
switch stop := input.Stop.(type) {
|
||||
case string:
|
||||
if stop != "" {
|
||||
config.StopWords = append(config.StopWords, stop)
|
||||
}
|
||||
case []interface{}:
|
||||
for _, pp := range stop {
|
||||
if s, ok := pp.(string); ok {
|
||||
config.StopWords = append(config.StopWords, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if input.RepeatPenalty != 0 {
|
||||
config.RepeatPenalty = input.RepeatPenalty
|
||||
}
|
||||
|
||||
if input.Keep != 0 {
|
||||
config.Keep = input.Keep
|
||||
}
|
||||
|
||||
if input.Batch != 0 {
|
||||
config.Batch = input.Batch
|
||||
}
|
||||
|
||||
if input.F16 {
|
||||
config.F16 = input.F16
|
||||
}
|
||||
|
||||
if input.IgnoreEOS {
|
||||
config.IgnoreEOS = input.IgnoreEOS
|
||||
}
|
||||
|
||||
if input.Seed != 0 {
|
||||
config.Seed = input.Seed
|
||||
}
|
||||
|
||||
if input.Mirostat != 0 {
|
||||
config.Mirostat = input.Mirostat
|
||||
}
|
||||
|
||||
if input.MirostatETA != 0 {
|
||||
config.MirostatETA = input.MirostatETA
|
||||
}
|
||||
|
||||
if input.MirostatTAU != 0 {
|
||||
config.MirostatTAU = input.MirostatTAU
|
||||
}
|
||||
|
||||
switch inputs := input.Input.(type) {
|
||||
case string:
|
||||
if inputs != "" {
|
||||
config.InputStrings = append(config.InputStrings, inputs)
|
||||
}
|
||||
case []interface{}:
|
||||
for _, pp := range inputs {
|
||||
switch i := pp.(type) {
|
||||
case string:
|
||||
config.InputStrings = append(config.InputStrings, i)
|
||||
case []interface{}:
|
||||
tokens := []int{}
|
||||
for _, ii := range i {
|
||||
tokens = append(tokens, int(ii.(float64)))
|
||||
}
|
||||
config.InputToken = append(config.InputToken, tokens)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch p := input.Prompt.(type) {
|
||||
case string:
|
||||
config.PromptStrings = append(config.PromptStrings, p)
|
||||
case []interface{}:
|
||||
for _, pp := range p {
|
||||
if s, ok := pp.(string); ok {
|
||||
config.PromptStrings = append(config.PromptStrings, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
func readInput(c *fiber.Ctx, loader *model.ModelLoader, randomModel bool) (string, *OpenAIRequest, error) {
|
||||
input := new(OpenAIRequest)
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
modelFile := input.Model
|
||||
|
||||
if c.Params("model") != "" {
|
||||
modelFile = c.Params("model")
|
||||
}
|
||||
|
||||
received, _ := json.Marshal(input)
|
||||
|
||||
log.Debug().Msgf("Request received: %s", string(received))
|
||||
|
||||
// Set model from bearer token, if available
|
||||
bearer := strings.TrimLeft(c.Get("authorization"), "Bearer ")
|
||||
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
|
||||
|
||||
// If no model was specified, take the first available
|
||||
if modelFile == "" && !bearerExists && randomModel {
|
||||
models, _ := loader.ListModels()
|
||||
if len(models) > 0 {
|
||||
modelFile = models[0]
|
||||
log.Debug().Msgf("No model specified, using: %s", modelFile)
|
||||
} else {
|
||||
log.Debug().Msgf("No model specified, returning error")
|
||||
return "", nil, fmt.Errorf("no model specified")
|
||||
}
|
||||
}
|
||||
|
||||
// If a model is found in the bearer token, it takes precedence
|
||||
if bearerExists {
|
||||
log.Debug().Msgf("Using model from bearer token: %s", bearer)
|
||||
modelFile = bearer
|
||||
}
|
||||
return modelFile, input, nil
|
||||
}
|
||||
|
||||
func readConfig(modelFile string, input *OpenAIRequest, cm *ConfigMerger, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*Config, *OpenAIRequest, error) {
|
||||
// Load a config file if present after the model name
|
||||
modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
|
||||
if _, err := os.Stat(modelConfig); err == nil {
|
||||
if err := cm.LoadConfig(modelConfig); err != nil {
|
||||
return nil, nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
var config *Config
|
||||
cfg, exists := cm.GetConfig(modelFile)
|
||||
if !exists {
|
||||
config = &Config{
|
||||
OpenAIRequest: defaultRequest(modelFile),
|
||||
ContextSize: ctx,
|
||||
Threads: threads,
|
||||
F16: f16,
|
||||
Debug: debug,
|
||||
}
|
||||
} else {
|
||||
config = &cfg
|
||||
}
|
||||
|
||||
// Set the parameters for the language model prediction
|
||||
updateConfig(config, input)
|
||||
|
||||
// Don't allow 0 as setting
|
||||
if config.Threads == 0 {
|
||||
if threads != 0 {
|
||||
config.Threads = threads
|
||||
} else {
|
||||
config.Threads = 4
|
||||
}
|
||||
}
|
||||
|
||||
// Enforce debug flag if passed from CLI
|
||||
if debug {
|
||||
config.Debug = true
|
||||
}
|
||||
|
||||
return config, input, nil
|
||||
}
|
||||
272
api/config/config.go
Normal file
@@ -0,0 +1,272 @@
|
||||
package api_config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
type Config struct {
|
||||
PredictionOptions `yaml:"parameters"`
|
||||
Name string `yaml:"name"`
|
||||
|
||||
F16 bool `yaml:"f16"`
|
||||
Threads int `yaml:"threads"`
|
||||
Debug bool `yaml:"debug"`
|
||||
Roles map[string]string `yaml:"roles"`
|
||||
Embeddings bool `yaml:"embeddings"`
|
||||
Backend string `yaml:"backend"`
|
||||
TemplateConfig TemplateConfig `yaml:"template"`
|
||||
|
||||
PromptStrings, InputStrings []string `yaml:"-"`
|
||||
InputToken [][]int `yaml:"-"`
|
||||
functionCallString, functionCallNameString string `yaml:"-"`
|
||||
|
||||
FunctionsConfig Functions `yaml:"function"`
|
||||
|
||||
FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
|
||||
// LLM configs (GPT4ALL, Llama.cpp, ...)
|
||||
LLMConfig `yaml:",inline"`
|
||||
|
||||
// AutoGPTQ specifics
|
||||
AutoGPTQ AutoGPTQ `yaml:"autogptq"`
|
||||
|
||||
// Diffusers
|
||||
Diffusers Diffusers `yaml:"diffusers"`
|
||||
|
||||
Step int `yaml:"step"`
|
||||
|
||||
// GRPC Options
|
||||
GRPC GRPC `yaml:"grpc"`
|
||||
}
|
||||
|
||||
type FeatureFlag map[string]*bool
|
||||
|
||||
func (ff FeatureFlag) Enabled(s string) bool {
|
||||
v, exist := ff[s]
|
||||
return exist && v != nil && *v
|
||||
}
|
||||
|
||||
type GRPC struct {
|
||||
Attempts int `yaml:"attempts"`
|
||||
AttemptsSleepTime int `yaml:"attempts_sleep_time"`
|
||||
}
|
||||
|
||||
type Diffusers struct {
|
||||
PipelineType string `yaml:"pipeline_type"`
|
||||
SchedulerType string `yaml:"scheduler_type"`
|
||||
CUDA bool `yaml:"cuda"`
|
||||
EnableParameters string `yaml:"enable_parameters"` // Comma-separated list of parameters to enable
|
||||
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
|
||||
IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser
|
||||
ClipSkip int `yaml:"clip_skip"` // Number of CLIP layers to skip
|
||||
ClipModel string `yaml:"clip_model"` // Clip model to use
|
||||
ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model
|
||||
}
|
||||
|
||||
type LLMConfig struct {
|
||||
SystemPrompt string `yaml:"system_prompt"`
|
||||
TensorSplit string `yaml:"tensor_split"`
|
||||
MainGPU string `yaml:"main_gpu"`
|
||||
RMSNormEps float32 `yaml:"rms_norm_eps"`
|
||||
NGQA int32 `yaml:"ngqa"`
|
||||
PromptCachePath string `yaml:"prompt_cache_path"`
|
||||
PromptCacheAll bool `yaml:"prompt_cache_all"`
|
||||
PromptCacheRO bool `yaml:"prompt_cache_ro"`
|
||||
MirostatETA float64 `yaml:"mirostat_eta"`
|
||||
MirostatTAU float64 `yaml:"mirostat_tau"`
|
||||
Mirostat int `yaml:"mirostat"`
|
||||
NGPULayers int `yaml:"gpu_layers"`
|
||||
MMap bool `yaml:"mmap"`
|
||||
MMlock bool `yaml:"mmlock"`
|
||||
LowVRAM bool `yaml:"low_vram"`
|
||||
Grammar string `yaml:"grammar"`
|
||||
StopWords []string `yaml:"stopwords"`
|
||||
Cutstrings []string `yaml:"cutstrings"`
|
||||
TrimSpace []string `yaml:"trimspace"`
|
||||
ContextSize int `yaml:"context_size"`
|
||||
NUMA bool `yaml:"numa"`
|
||||
LoraAdapter string `yaml:"lora_adapter"`
|
||||
LoraBase string `yaml:"lora_base"`
|
||||
NoMulMatQ bool `yaml:"no_mulmatq"`
|
||||
}
|
||||
|
||||
type AutoGPTQ struct {
|
||||
ModelBaseName string `yaml:"model_base_name"`
|
||||
Device string `yaml:"device"`
|
||||
Triton bool `yaml:"triton"`
|
||||
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
|
||||
}
|
||||
|
||||
type Functions struct {
|
||||
DisableNoAction bool `yaml:"disable_no_action"`
|
||||
NoActionFunctionName string `yaml:"no_action_function_name"`
|
||||
NoActionDescriptionName string `yaml:"no_action_description_name"`
|
||||
}
|
||||
|
||||
type TemplateConfig struct {
|
||||
Chat string `yaml:"chat"`
|
||||
ChatMessage string `yaml:"chat_message"`
|
||||
Completion string `yaml:"completion"`
|
||||
Edit string `yaml:"edit"`
|
||||
Functions string `yaml:"function"`
|
||||
}
|
||||
|
||||
type ConfigLoader struct {
|
||||
configs map[string]Config
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
func (c *Config) SetFunctionCallString(s string) {
|
||||
c.functionCallString = s
|
||||
}
|
||||
|
||||
func (c *Config) SetFunctionCallNameString(s string) {
|
||||
c.functionCallNameString = s
|
||||
}
|
||||
|
||||
func (c *Config) ShouldUseFunctions() bool {
|
||||
return ((c.functionCallString != "none" || c.functionCallString == "") || c.ShouldCallSpecificFunction())
|
||||
}
|
||||
|
||||
func (c *Config) ShouldCallSpecificFunction() bool {
|
||||
return len(c.functionCallNameString) > 0
|
||||
}
|
||||
|
||||
func (c *Config) FunctionToCall() string {
|
||||
return c.functionCallNameString
|
||||
}
|
||||
|
||||
func defaultPredictOptions(modelFile string) PredictionOptions {
|
||||
return PredictionOptions{
|
||||
TopP: 0.7,
|
||||
TopK: 80,
|
||||
Maxtokens: 512,
|
||||
Temperature: 0.9,
|
||||
Model: modelFile,
|
||||
}
|
||||
}
|
||||
|
||||
func DefaultConfig(modelFile string) *Config {
|
||||
return &Config{
|
||||
PredictionOptions: defaultPredictOptions(modelFile),
|
||||
}
|
||||
}
|
||||
|
||||
func NewConfigLoader() *ConfigLoader {
|
||||
return &ConfigLoader{
|
||||
configs: make(map[string]Config),
|
||||
}
|
||||
}
|
||||
func ReadConfigFile(file string) ([]*Config, error) {
|
||||
c := &[]*Config{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
if err := yaml.Unmarshal(f, c); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
|
||||
}
|
||||
|
||||
return *c, nil
|
||||
}
|
||||
|
||||
func ReadConfig(file string) (*Config, error) {
|
||||
c := &Config{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
if err := yaml.Unmarshal(f, c); err != nil {
|
||||
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
|
||||
}
|
||||
|
||||
return c, nil
|
||||
}
|
||||
|
||||
func (cm *ConfigLoader) LoadConfigFile(file string) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
c, err := ReadConfigFile(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot load config file: %w", err)
|
||||
}
|
||||
|
||||
for _, cc := range c {
|
||||
cm.configs[cc.Name] = *cc
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cm *ConfigLoader) LoadConfig(file string) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
c, err := ReadConfig(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
|
||||
cm.configs[c.Name] = *c
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cm *ConfigLoader) GetConfig(m string) (Config, bool) {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
v, exists := cm.configs[m]
|
||||
return v, exists
|
||||
}
|
||||
|
||||
func (cm *ConfigLoader) GetAllConfigs() []Config {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
var res []Config
|
||||
for _, v := range cm.configs {
|
||||
res = append(res, v)
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func (cm *ConfigLoader) ListConfigs() []string {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
var res []string
|
||||
for k := range cm.configs {
|
||||
res = append(res, k)
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func (cm *ConfigLoader) LoadConfigs(path string) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
entries, err := os.ReadDir(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files := make([]fs.FileInfo, 0, len(entries))
|
||||
for _, entry := range entries {
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files = append(files, info)
|
||||
}
|
||||
for _, file := range files {
|
||||
// Skip templates, .keep files and anything else that is not a YAML config
|
||||
if !strings.Contains(file.Name(), ".yaml") {
|
||||
continue
|
||||
}
|
||||
c, err := ReadConfig(filepath.Join(path, file.Name()))
|
||||
if err == nil {
|
||||
cm.configs[c.Name] = *c
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
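The ConfigLoader is the concurrency-safe registry the rest of the API reads model settings from. A short usage sketch, mirroring what the test below does (the "gpt4all" name is taken from that test, and the config import alias matches the other files in this diff):

```go
// Sketch only: load every *.yaml model config from a directory and look one up.
func loadModelConfigs(modelsPath string) error {
	cl := config.NewConfigLoader()
	if err := cl.LoadConfigs(modelsPath); err != nil {
		return err
	}
	if cfg, ok := cl.GetConfig("gpt4all"); ok {
		fmt.Println("gpt4all threads:", cfg.Threads)
	}
	fmt.Println("known configs:", cl.ListConfigs())
	return nil
}
```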
@@ -1,8 +1,10 @@
|
||||
package api
|
||||
package api_config_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
. "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/pkg/model"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
@@ -26,29 +28,29 @@ var _ = Describe("Test cases for config related functions", func() {
|
||||
})
|
||||
|
||||
It("Test LoadConfigs", func() {
|
||||
cm := NewConfigMerger()
|
||||
options := newOptions()
|
||||
cm := NewConfigLoader()
|
||||
opts := options.NewOptions()
|
||||
modelLoader := model.NewModelLoader(os.Getenv("MODELS_PATH"))
|
||||
WithModelLoader(modelLoader)(options)
|
||||
options.WithModelLoader(modelLoader)(opts)
|
||||
|
||||
err := cm.LoadConfigs(options.loader.ModelPath)
|
||||
err := cm.LoadConfigs(opts.Loader.ModelPath)
|
||||
Expect(err).To(BeNil())
|
||||
Expect(cm.configs).ToNot(BeNil())
|
||||
Expect(cm.ListConfigs()).ToNot(BeNil())
|
||||
|
||||
// config should include the gpt4all model's config
|
||||
Expect(cm.configs).To(HaveKey("gpt4all"))
|
||||
Expect(cm.ListConfigs()).To(ContainElements("gpt4all"))
|
||||
|
||||
// config should include the gpt4all-2 model's config
|
||||
Expect(cm.configs).To(HaveKey("gpt4all-2"))
|
||||
Expect(cm.ListConfigs()).To(ContainElements("gpt4all-2"))
|
||||
|
||||
// config should include the text-embedding-ada-002 model's config
|
||||
Expect(cm.configs).To(HaveKey("text-embedding-ada-002"))
|
||||
Expect(cm.ListConfigs()).To(ContainElements("text-embedding-ada-002"))
|
||||
|
||||
// config should include the rwkv_test model's config
|
||||
Expect(cm.configs).To(HaveKey("rwkv_test"))
|
||||
Expect(cm.ListConfigs()).To(ContainElements("rwkv_test"))
|
||||
|
||||
// config should include the whisper-1 model's config
|
||||
Expect(cm.configs).To(HaveKey("whisper-1"))
|
||||
Expect(cm.ListConfigs()).To(ContainElements("whisper-1"))
|
||||
})
|
||||
})
|
||||
})
|
||||
50
api/config/prediction.go
Normal file
@@ -0,0 +1,50 @@
|
||||
package api_config
|
||||
|
||||
type PredictionOptions struct {
|
||||
|
||||
// Also part of the OpenAI official spec
|
||||
Model string `json:"model" yaml:"model"`
|
||||
|
||||
// Also part of the OpenAI official spec
|
||||
Language string `json:"language"`
|
||||
|
||||
// Also part of the OpenAI official spec. use it for returning multiple results
|
||||
N int `json:"n"`
|
||||
|
||||
// Common options between all the API calls, part of the OpenAI spec
|
||||
TopP float64 `json:"top_p" yaml:"top_p"`
|
||||
TopK int `json:"top_k" yaml:"top_k"`
|
||||
Temperature float64 `json:"temperature" yaml:"temperature"`
|
||||
Maxtokens int `json:"max_tokens" yaml:"max_tokens"`
|
||||
Echo bool `json:"echo"`
|
||||
|
||||
// Custom parameters - not present in the OpenAI API
|
||||
Batch int `json:"batch" yaml:"batch"`
|
||||
F16 bool `json:"f16" yaml:"f16"`
|
||||
IgnoreEOS bool `json:"ignore_eos" yaml:"ignore_eos"`
|
||||
RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
|
||||
Keep int `json:"n_keep" yaml:"n_keep"`
|
||||
|
||||
MirostatETA float64 `json:"mirostat_eta" yaml:"mirostat_eta"`
|
||||
MirostatTAU float64 `json:"mirostat_tau" yaml:"mirostat_tau"`
|
||||
Mirostat int `json:"mirostat" yaml:"mirostat"`
|
||||
|
||||
FrequencyPenalty float64 `json:"frequency_penalty" yaml:"frequency_penalty"`
|
||||
TFZ float64 `json:"tfz" yaml:"tfz"`
|
||||
|
||||
TypicalP float64 `json:"typical_p" yaml:"typical_p"`
|
||||
Seed int `json:"seed" yaml:"seed"`
|
||||
|
||||
NegativePrompt string `json:"negative_prompt" yaml:"negative_prompt"`
|
||||
RopeFreqBase float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
|
||||
RopeFreqScale float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
|
||||
NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
|
||||
// AutoGPTQ
|
||||
UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
|
||||
|
||||
// Diffusers
|
||||
ClipSkip int `json:"clip_skip" yaml:"clip_skip"`
|
||||
|
||||
// RWKV (?)
|
||||
Tokenizer string `json:"tokenizer" yaml:"tokenizer"`
|
||||
}
|
||||
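Because every field carries a json tag, an OpenAI-style request body deserializes straight into PredictionOptions. A hedged sketch with an illustrative request body:

```go
// Sketch only: map a request body onto PredictionOptions via encoding/json.
func parsePrediction() (config.PredictionOptions, error) {
	body := []byte(`{"model":"gpt4all","temperature":0.2,"top_k":40,"max_tokens":128}`)
	var p config.PredictionOptions
	err := json.Unmarshal(body, &p)
	return p, err
}
```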
233
api/gallery.go
@@ -1,233 +0,0 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/google/uuid"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
type galleryOp struct {
|
||||
req ApplyGalleryModelRequest
|
||||
id string
|
||||
}
|
||||
|
||||
type galleryOpStatus struct {
|
||||
Error error `json:"error"`
|
||||
Processed bool `json:"processed"`
|
||||
Message string `json:"message"`
|
||||
}
|
||||
|
||||
type galleryApplier struct {
|
||||
modelPath string
|
||||
sync.Mutex
|
||||
C chan galleryOp
|
||||
statuses map[string]*galleryOpStatus
|
||||
}
|
||||
|
||||
func newGalleryApplier(modelPath string) *galleryApplier {
|
||||
return &galleryApplier{
|
||||
modelPath: modelPath,
|
||||
C: make(chan galleryOp),
|
||||
statuses: make(map[string]*galleryOpStatus),
|
||||
}
|
||||
}
|
||||
|
||||
func applyGallery(modelPath string, req ApplyGalleryModelRequest, cm *ConfigMerger) error {
|
||||
url, err := req.DecodeURL()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Send a GET request to the URL
|
||||
response, err := http.Get(url)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer response.Body.Close()
|
||||
|
||||
// Read the response body
|
||||
body, err := ioutil.ReadAll(response.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Unmarshal YAML data into a Config struct
|
||||
var config gallery.Config
|
||||
err = yaml.Unmarshal(body, &config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
config.Files = append(config.Files, req.AdditionalFiles...)
|
||||
|
||||
if err := gallery.Apply(modelPath, req.Name, &config, req.Overrides); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Reload models
|
||||
return cm.LoadConfigs(modelPath)
|
||||
}
|
||||
|
||||
func (g *galleryApplier) updatestatus(s string, op *galleryOpStatus) {
|
||||
g.Lock()
|
||||
defer g.Unlock()
|
||||
g.statuses[s] = op
|
||||
}
|
||||
|
||||
func (g *galleryApplier) getstatus(s string) *galleryOpStatus {
|
||||
g.Lock()
|
||||
defer g.Unlock()
|
||||
|
||||
return g.statuses[s]
|
||||
}
|
||||
|
||||
func (g *galleryApplier) start(c context.Context, cm *ConfigMerger) {
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-c.Done():
|
||||
return
|
||||
case op := <-g.C:
|
||||
g.updatestatus(op.id, &galleryOpStatus{Message: "processing"})
|
||||
|
||||
updateError := func(e error) {
|
||||
g.updatestatus(op.id, &galleryOpStatus{Error: e, Processed: true})
|
||||
}
|
||||
|
||||
if err := applyGallery(g.modelPath, op.req, cm); err != nil {
|
||||
updateError(err)
|
||||
continue
|
||||
}
|
||||
|
||||
g.updatestatus(op.id, &galleryOpStatus{Processed: true, Message: "completed"})
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func ApplyGalleryFromFile(modelPath, s string, cm *ConfigMerger) error {
|
||||
dat, err := os.ReadFile(s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var requests []ApplyGalleryModelRequest
|
||||
err = json.Unmarshal(dat, &requests)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, r := range requests {
|
||||
if err := applyGallery(modelPath, r, cm); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
func ApplyGalleryFromString(modelPath, s string, cm *ConfigMerger) error {
|
||||
var requests []ApplyGalleryModelRequest
|
||||
err := json.Unmarshal([]byte(s), &requests)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, r := range requests {
|
||||
if err := applyGallery(modelPath, r, cm); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// endpoints
|
||||
|
||||
type ApplyGalleryModelRequest struct {
|
||||
URL string `json:"url"`
|
||||
Name string `json:"name"`
|
||||
Overrides map[string]interface{} `json:"overrides"`
|
||||
AdditionalFiles []gallery.File `json:"files"`
|
||||
}
|
||||
|
||||
const (
|
||||
githubURI = "github:"
|
||||
)
|
||||
|
||||
func (request ApplyGalleryModelRequest) DecodeURL() (string, error) {
|
||||
input := request.URL
|
||||
var rawURL string
|
||||
|
||||
if strings.HasPrefix(input, githubURI) {
|
||||
parts := strings.Split(input, ":")
|
||||
repoParts := strings.Split(parts[1], "@")
|
||||
branch := "main"
|
||||
|
||||
if len(repoParts) > 1 {
|
||||
branch = repoParts[1]
|
||||
}
|
||||
|
||||
repoPath := strings.Split(repoParts[0], "/")
|
||||
org := repoPath[0]
|
||||
project := repoPath[1]
|
||||
projectPath := strings.Join(repoPath[2:], "/")
|
||||
|
||||
rawURL = fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath)
|
||||
} else if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") {
|
||||
// Handle regular URLs
|
||||
u, err := url.Parse(input)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("invalid URL: %w", err)
|
||||
}
|
||||
rawURL = u.String()
|
||||
} else {
|
||||
return "", fmt.Errorf("invalid URL format")
|
||||
}
|
||||
|
||||
return rawURL, nil
|
||||
}
|
||||
|
||||
func getOpStatus(g *galleryApplier) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
status := g.getstatus(c.Params("uuid"))
|
||||
if status == nil {
|
||||
return fmt.Errorf("could not find any status for ID")
|
||||
}
|
||||
|
||||
return c.JSON(status)
|
||||
}
|
||||
}
|
||||
|
||||
func applyModelGallery(modelPath string, cm *ConfigMerger, g chan galleryOp) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
input := new(ApplyGalleryModelRequest)
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
uuid, err := uuid.NewUUID()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
g <- galleryOp{
|
||||
req: *input,
|
||||
id: uuid.String(),
|
||||
}
|
||||
return c.JSON(struct {
|
||||
ID string `json:"uuid"`
|
||||
StatusURL string `json:"status"`
|
||||
}{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
|
||||
}
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
package api_test
|
||||
|
||||
import (
|
||||
. "github.com/go-skynet/LocalAI/api"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("Gallery API tests", func() {
|
||||
Context("requests", func() {
|
||||
It("parses github with a branch", func() {
|
||||
req := ApplyGalleryModelRequest{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main"}
|
||||
str, err := req.DecodeURL()
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(str).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
|
||||
})
|
||||
It("parses github without a branch", func() {
|
||||
req := ApplyGalleryModelRequest{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml"}
|
||||
str, err := req.DecodeURL()
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(str).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
|
||||
})
|
||||
It("parses URLS", func() {
|
||||
req := ApplyGalleryModelRequest{URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"}
|
||||
str, err := req.DecodeURL()
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(str).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
|
||||
})
|
||||
})
|
||||
})
|
||||
163
api/localai/backend_monitor.go
Normal file
@@ -0,0 +1,163 @@
|
||||
package localai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
|
||||
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
gopsutil "github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
type BackendMonitorRequest struct {
|
||||
Model string `json:"model" yaml:"model"`
|
||||
}
|
||||
|
||||
type BackendMonitorResponse struct {
|
||||
MemoryInfo *gopsutil.MemoryInfoStat
|
||||
MemoryPercent float32
|
||||
CPUPercent float64
|
||||
}
|
||||
|
||||
type BackendMonitor struct {
|
||||
configLoader *config.ConfigLoader
|
||||
options *options.Option // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
|
||||
}
|
||||
|
||||
func NewBackendMonitor(configLoader *config.ConfigLoader, options *options.Option) BackendMonitor {
|
||||
return BackendMonitor{
|
||||
configLoader: configLoader,
|
||||
options: options,
|
||||
}
|
||||
}
|
||||
|
||||
func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*BackendMonitorResponse, error) {
|
||||
config, exists := bm.configLoader.GetConfig(model)
|
||||
var backend string
|
||||
if exists {
|
||||
backend = config.Model
|
||||
} else {
|
||||
// Last ditch effort: use it raw, see if a backend happens to match.
|
||||
backend = model
|
||||
}
|
||||
|
||||
if !strings.HasSuffix(backend, ".bin") {
|
||||
backend = fmt.Sprintf("%s.bin", backend)
|
||||
}
|
||||
|
||||
pid, err := bm.options.Loader.GetGRPCPID(backend)
|
||||
|
||||
if err != nil {
|
||||
log.Error().Msgf("model %s : failed to find pid %+v", model, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Name is slightly frightening but this does _not_ create a new process, rather it looks up an existing process by PID.
|
||||
backendProcess, err := gopsutil.NewProcess(int32(pid))
|
||||
|
||||
if err != nil {
|
||||
log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
memInfo, err := backendProcess.MemoryInfo()
|
||||
|
||||
if err != nil {
|
||||
log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
memPercent, err := backendProcess.MemoryPercent()
|
||||
if err != nil {
|
||||
log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
cpuPercent, err := backendProcess.CPUPercent()
|
||||
if err != nil {
|
||||
log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &BackendMonitorResponse{
|
||||
MemoryInfo: memInfo,
|
||||
MemoryPercent: memPercent,
|
||||
CPUPercent: cpuPercent,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (bm BackendMonitor) getModelLoaderIDFromCtx(c *fiber.Ctx) (string, error) {
|
||||
input := new(BackendMonitorRequest)
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
config, exists := bm.configLoader.GetConfig(input.Model)
|
||||
var backendId string
|
||||
if exists {
|
||||
backendId = config.Model
|
||||
} else {
|
||||
// Last ditch effort: use it raw, see if a backend happens to match.
|
||||
backendId = input.Model
|
||||
}
|
||||
|
||||
if !strings.HasSuffix(backendId, ".bin") {
|
||||
backendId = fmt.Sprintf("%s.bin", backendId)
|
||||
}
|
||||
|
||||
return backendId, nil
|
||||
}
|
||||
|
||||
func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
backendId, err := bm.getModelLoaderIDFromCtx(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
client := bm.options.Loader.CheckIsLoaded(backendId)
|
||||
|
||||
if client == nil {
|
||||
return fmt.Errorf("backend %s is not currently loaded", backendId)
|
||||
}
|
||||
|
||||
status, rpcErr := client.Status(context.TODO())
|
||||
if rpcErr != nil {
|
||||
log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
|
||||
val, slbErr := bm.SampleLocalBackendProcess(backendId)
|
||||
if slbErr != nil {
|
||||
return fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
|
||||
}
|
||||
return c.JSON(proto.StatusResponse{
|
||||
State: proto.StatusResponse_ERROR,
|
||||
Memory: &proto.MemoryUsageData{
|
||||
Total: val.MemoryInfo.VMS,
|
||||
Breakdown: map[string]uint64{
|
||||
"gopsutil-RSS": val.MemoryInfo.RSS,
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
return c.JSON(status)
|
||||
}
|
||||
}
|
||||
|
||||
func BackendShutdownEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
backendId, err := bm.getModelLoaderIDFromCtx(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return bm.options.Loader.ShutdownModel(backendId)
|
||||
}
|
||||
}
|
||||
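Both monitor functions return plain fiber handlers, so wiring them up is a one-liner each. A sketch follows; the route paths are illustrative rather than taken from this diff, and cl and opts are the ConfigLoader and Option values built at startup.

```go
// Sketch only: expose the backend monitor and shutdown endpoints.
func registerMonitor(app *fiber.App, cl *config.ConfigLoader, opts *options.Option) {
	bm := localai.NewBackendMonitor(cl, opts)
	app.Get("/backend/monitor", localai.BackendMonitorEndpoint(bm))
	app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(bm))
}
```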
241
api/localai/gallery.go
Normal file
@@ -0,0 +1,241 @@
|
||||
package localai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
json "github.com/json-iterator/go"
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/google/uuid"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
type galleryOp struct {
|
||||
req gallery.GalleryModel
|
||||
id string
|
||||
galleries []gallery.Gallery
|
||||
galleryName string
|
||||
}
|
||||
|
||||
type galleryOpStatus struct {
|
||||
Error error `json:"error"`
|
||||
Processed bool `json:"processed"`
|
||||
Message string `json:"message"`
|
||||
Progress float64 `json:"progress"`
|
||||
TotalFileSize string `json:"file_size"`
|
||||
DownloadedFileSize string `json:"downloaded_size"`
|
||||
}
|
||||
|
||||
type galleryApplier struct {
|
||||
modelPath string
|
||||
sync.Mutex
|
||||
C chan galleryOp
|
||||
statuses map[string]*galleryOpStatus
|
||||
}
|
||||
|
||||
func NewGalleryService(modelPath string) *galleryApplier {
|
||||
return &galleryApplier{
|
||||
modelPath: modelPath,
|
||||
C: make(chan galleryOp),
|
||||
statuses: make(map[string]*galleryOpStatus),
|
||||
}
|
||||
}
|
||||
|
||||
// prepareModel fetches the gallery config referenced by req.URL and installs the model into modelPath
|
||||
func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {
|
||||
|
||||
config, err := gallery.GetGalleryConfigFromURL(req.URL)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
config.Files = append(config.Files, req.AdditionalFiles...)
|
||||
|
||||
return gallery.InstallModel(modelPath, req.Name, &config, req.Overrides, downloadStatus)
|
||||
}
|
||||
|
||||
func (g *galleryApplier) updateStatus(s string, op *galleryOpStatus) {
|
||||
g.Lock()
|
||||
defer g.Unlock()
|
||||
g.statuses[s] = op
|
||||
}
|
||||
|
||||
func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
|
||||
g.Lock()
|
||||
defer g.Unlock()
|
||||
|
||||
return g.statuses[s]
|
||||
}
|
||||
|
||||
func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-c.Done():
|
||||
return
|
||||
case op := <-g.C:
|
||||
utils.ResetDownloadTimers()
|
||||
|
||||
g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: 0})
|
||||
|
||||
// updates the status with an error
|
||||
updateError := func(e error) {
|
||||
g.updateStatus(op.id, &galleryOpStatus{Error: e, Processed: true, Message: "error: " + e.Error()})
|
||||
}
|
||||
|
||||
// progressCallback reports download progress and mirrors it into the op status
|
||||
progressCallback := func(fileName string, current string, total string, percentage float64) {
|
||||
g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
|
||||
utils.DisplayDownloadFunction(fileName, current, total, percentage)
|
||||
}
|
||||
|
||||
var err error
|
||||
// if the request contains a gallery name, we apply the gallery from the gallery list
|
||||
if op.galleryName != "" {
|
||||
if strings.Contains(op.galleryName, "@") {
|
||||
err = gallery.InstallModelFromGallery(op.galleries, op.galleryName, g.modelPath, op.req, progressCallback)
|
||||
} else {
|
||||
err = gallery.InstallModelFromGalleryByName(op.galleries, op.galleryName, g.modelPath, op.req, progressCallback)
|
||||
}
|
||||
} else {
|
||||
err = prepareModel(g.modelPath, op.req, cm, progressCallback)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
updateError(err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Reload models
|
||||
err = cm.LoadConfigs(g.modelPath)
|
||||
if err != nil {
|
||||
updateError(err)
|
||||
continue
|
||||
}
|
||||
|
||||
g.updateStatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
type galleryModel struct {
|
||||
gallery.GalleryModel `yaml:",inline"` // https://github.com/go-yaml/yaml/issues/63
|
||||
ID string `json:"id"`
|
||||
}
|
||||
|
||||
func processRequests(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery, requests []galleryModel) error {
|
||||
var err error
|
||||
for _, r := range requests {
|
||||
utils.ResetDownloadTimers()
|
||||
if r.ID == "" {
|
||||
err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
|
||||
} else {
|
||||
if strings.Contains(r.ID, "@") {
|
||||
err = gallery.InstallModelFromGallery(
|
||||
galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
|
||||
} else {
|
||||
err = gallery.InstallModelFromGalleryByName(
|
||||
galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
|
||||
}
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
|
||||
dat, err := os.ReadFile(s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var requests []galleryModel
|
||||
|
||||
if err := yaml.Unmarshal(dat, &requests); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return processRequests(modelPath, s, cm, galleries, requests)
|
||||
}
|
||||
|
||||
func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
|
||||
var requests []galleryModel
|
||||
err := json.Unmarshal([]byte(s), &requests)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return processRequests(modelPath, s, cm, galleries, requests)
|
||||
}
|
||||
|
||||
/// Endpoints
|
||||
|
||||
func GetOpStatusEndpoint(g *galleryApplier) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
status := g.getStatus(c.Params("uuid"))
|
||||
if status == nil {
|
||||
return fmt.Errorf("could not find any status for ID")
|
||||
}
|
||||
|
||||
return c.JSON(status)
|
||||
}
|
||||
}
|
||||
|
||||
type GalleryModel struct {
|
||||
ID string `json:"id"`
|
||||
gallery.GalleryModel
|
||||
}
|
||||
|
||||
func ApplyModelGalleryEndpoint(modelPath string, cm *config.ConfigLoader, g chan galleryOp, galleries []gallery.Gallery) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
input := new(GalleryModel)
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
uuid, err := uuid.NewUUID()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
g <- galleryOp{
|
||||
req: input.GalleryModel,
|
||||
id: uuid.String(),
|
||||
galleryName: input.ID,
|
||||
galleries: galleries,
|
||||
}
|
||||
return c.JSON(struct {
|
||||
ID string `json:"uuid"`
|
||||
StatusURL string `json:"status"`
|
||||
}{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
|
||||
}
|
||||
}
|
||||
|
||||
func ListModelFromGalleryEndpoint(galleries []gallery.Gallery, basePath string) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
log.Debug().Msgf("Listing models from galleries: %+v", galleries)
|
||||
|
||||
models, err := gallery.AvailableGalleryModels(galleries, basePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
log.Debug().Msgf("Models found from galleries: %+v", models)
|
||||
for _, m := range models {
|
||||
log.Debug().Msgf("Model found from galleries: %+v", m)
|
||||
}
|
||||
dat, err := json.Marshal(models)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.Send(dat)
|
||||
}
|
||||
}
|
||||
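For reference, a minimal client sketch for the endpoints above. It assumes the handlers are mounted at /models/apply and /models/jobs/{uuid} (the latter matches the StatusURL built in ApplyModelGalleryEndpoint); the gallery id is an invented example, and ApplyGalleryFromFile accepts the same entries as a YAML list instead of a JSON body.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"time"
)

func main() {
	// Request installation of a gallery model. The "gallery@model" id format
	// mirrors the strings.Contains(op.galleryName, "@") branch above; the
	// concrete id used here is made up.
	payload := bytes.NewBufferString(`{"id": "model-gallery@bert-embeddings"}`)
	resp, err := http.Post("http://localhost:8080/models/apply", "application/json", payload)
	if err != nil {
		panic(err)
	}
	var job struct {
		ID        string `json:"uuid"`
		StatusURL string `json:"status"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&job); err != nil {
		panic(err)
	}
	resp.Body.Close()

	// Poll the job status endpoint (GetOpStatusEndpoint). The status JSON is
	// printed raw, since galleryOpStatus field names are not part of this hunk.
	for i := 0; i < 30; i++ {
		st, err := http.Get(job.StatusURL)
		if err != nil {
			panic(err)
		}
		body, _ := io.ReadAll(st.Body)
		st.Body.Close()
		fmt.Println(string(body))
		time.Sleep(time.Second)
	}
}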
api/localai/localai.go (new file, 32 lines)
@@ -0,0 +1,32 @@
package localai

import (
	"github.com/go-skynet/LocalAI/api/backend"
	config "github.com/go-skynet/LocalAI/api/config"

	"github.com/go-skynet/LocalAI/api/options"
	"github.com/gofiber/fiber/v2"
)

type TTSRequest struct {
	Model   string `json:"model" yaml:"model"`
	Input   string `json:"input" yaml:"input"`
	Backend string `json:"backend" yaml:"backend"`
}

func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {

		input := new(TTSRequest)
		// Get input data from the request body
		if err := c.BodyParser(input); err != nil {
			return err
		}

		filePath, _, err := backend.ModelTTS(input.Backend, input.Input, input.Model, o.Loader, o)
		if err != nil {
			return err
		}
		return c.Download(filePath)
	}
}
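A client-side sketch for the endpoint above (assumptions: the handler is mounted at /tts, and the voice model name is illustrative; neither the route registration nor a concrete model appears in this hunk). The handler replies with c.Download(filePath), so the response body is the generated audio file.

package main

import (
	"bytes"
	"io"
	"net/http"
	"os"
)

func main() {
	req := bytes.NewBufferString(`{"model": "en-us-voice", "input": "Hello from LocalAI"}`)
	resp, err := http.Post("http://localhost:8080/tts", "application/json", req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Save the returned audio bytes; the extension is a guess, the actual
	// format depends on the configured TTS backend.
	out, err := os.Create("output.wav")
	if err != nil {
		panic(err)
	}
	defer out.Close()
	if _, err := io.Copy(out, resp.Body); err != nil {
		panic(err)
	}
}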
api/openai.go (deleted, 750 lines)
@@ -1,750 +0,0 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
whisperutil "github.com/go-skynet/LocalAI/pkg/whisper"
|
||||
llama "github.com/go-skynet/go-llama.cpp"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/valyala/fasthttp"
|
||||
)
|
||||
|
||||
// APIError provides error information returned by the OpenAI API.
|
||||
type APIError struct {
|
||||
Code any `json:"code,omitempty"`
|
||||
Message string `json:"message"`
|
||||
Param *string `json:"param,omitempty"`
|
||||
Type string `json:"type"`
|
||||
}
|
||||
|
||||
type ErrorResponse struct {
|
||||
Error *APIError `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type OpenAIUsage struct {
|
||||
PromptTokens int `json:"prompt_tokens"`
|
||||
CompletionTokens int `json:"completion_tokens"`
|
||||
TotalTokens int `json:"total_tokens"`
|
||||
}
|
||||
|
||||
type Item struct {
|
||||
Embedding []float32 `json:"embedding"`
|
||||
Index int `json:"index"`
|
||||
Object string `json:"object,omitempty"`
|
||||
|
||||
// Images
|
||||
URL string `json:"url,omitempty"`
|
||||
B64JSON string `json:"b64_json,omitempty"`
|
||||
}
|
||||
|
||||
type OpenAIResponse struct {
|
||||
Created int `json:"created,omitempty"`
|
||||
Object string `json:"object,omitempty"`
|
||||
ID string `json:"id,omitempty"`
|
||||
Model string `json:"model,omitempty"`
|
||||
Choices []Choice `json:"choices,omitempty"`
|
||||
Data []Item `json:"data,omitempty"`
|
||||
|
||||
Usage OpenAIUsage `json:"usage"`
|
||||
}
|
||||
|
||||
type Choice struct {
|
||||
Index int `json:"index,omitempty"`
|
||||
FinishReason string `json:"finish_reason,omitempty"`
|
||||
Message *Message `json:"message,omitempty"`
|
||||
Delta *Message `json:"delta,omitempty"`
|
||||
Text string `json:"text,omitempty"`
|
||||
}
|
||||
|
||||
type Message struct {
|
||||
Role string `json:"role,omitempty" yaml:"role"`
|
||||
Content string `json:"content,omitempty" yaml:"content"`
|
||||
}
|
||||
|
||||
type OpenAIModel struct {
|
||||
ID string `json:"id"`
|
||||
Object string `json:"object"`
|
||||
}
|
||||
|
||||
type OpenAIRequest struct {
|
||||
Model string `json:"model" yaml:"model"`
|
||||
|
||||
// whisper
|
||||
File string `json:"file" validate:"required"`
|
||||
Language string `json:"language"`
|
||||
//whisper/image
|
||||
ResponseFormat string `json:"response_format"`
|
||||
// image
|
||||
Size string `json:"size"`
|
||||
// Prompt is read only by completion/image API calls
|
||||
Prompt interface{} `json:"prompt" yaml:"prompt"`
|
||||
|
||||
// Edit endpoint
|
||||
Instruction string `json:"instruction" yaml:"instruction"`
|
||||
Input interface{} `json:"input" yaml:"input"`
|
||||
|
||||
Stop interface{} `json:"stop" yaml:"stop"`
|
||||
|
||||
// Messages is read only by chat/completion API calls
|
||||
Messages []Message `json:"messages" yaml:"messages"`
|
||||
|
||||
Stream bool `json:"stream"`
|
||||
Echo bool `json:"echo"`
|
||||
// Common options between all the API calls
|
||||
TopP float64 `json:"top_p" yaml:"top_p"`
|
||||
TopK int `json:"top_k" yaml:"top_k"`
|
||||
Temperature float64 `json:"temperature" yaml:"temperature"`
|
||||
Maxtokens int `json:"max_tokens" yaml:"max_tokens"`
|
||||
|
||||
N int `json:"n"`
|
||||
|
||||
// Custom parameters - not present in the OpenAI API
|
||||
Batch int `json:"batch" yaml:"batch"`
|
||||
F16 bool `json:"f16" yaml:"f16"`
|
||||
IgnoreEOS bool `json:"ignore_eos" yaml:"ignore_eos"`
|
||||
RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
|
||||
Keep int `json:"n_keep" yaml:"n_keep"`
|
||||
|
||||
MirostatETA float64 `json:"mirostat_eta" yaml:"mirostat_eta"`
|
||||
MirostatTAU float64 `json:"mirostat_tau" yaml:"mirostat_tau"`
|
||||
Mirostat int `json:"mirostat" yaml:"mirostat"`
|
||||
|
||||
Seed int `json:"seed" yaml:"seed"`
|
||||
|
||||
// Image (not supported by OpenAI)
|
||||
Mode int `json:"mode"`
|
||||
Step int `json:"step"`
|
||||
}
|
||||
|
||||
func defaultRequest(modelFile string) OpenAIRequest {
|
||||
return OpenAIRequest{
|
||||
TopP: 0.7,
|
||||
TopK: 80,
|
||||
Maxtokens: 512,
|
||||
Temperature: 0.9,
|
||||
Model: modelFile,
|
||||
}
|
||||
}
|
||||
|
||||
// https://platform.openai.com/docs/api-reference/completions
|
||||
func completionEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
|
||||
process := func(s string, req *OpenAIRequest, config *Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
|
||||
ComputeChoices(s, req, config, loader, func(s string, c *[]Choice) {}, func(s string) bool {
|
||||
resp := OpenAIResponse{
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []Choice{{Text: s}},
|
||||
Object: "text_completion",
|
||||
}
|
||||
log.Debug().Msgf("Sending goroutine: %s", s)
|
||||
|
||||
responses <- resp
|
||||
return true
|
||||
})
|
||||
close(responses)
|
||||
}
|
||||
|
||||
return func(c *fiber.Ctx) error {
|
||||
model, input, err := readInput(c, o.loader, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("`input`: %+v", input)
|
||||
|
||||
config, input, err := readConfig(model, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
if input.Stream {
|
||||
log.Debug().Msgf("Stream request received")
|
||||
c.Context().SetContentType("text/event-stream")
|
||||
//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
|
||||
//c.Set("Content-Type", "text/event-stream")
|
||||
c.Set("Cache-Control", "no-cache")
|
||||
c.Set("Connection", "keep-alive")
|
||||
c.Set("Transfer-Encoding", "chunked")
|
||||
}
|
||||
|
||||
templateFile := config.Model
|
||||
|
||||
if config.TemplateConfig.Completion != "" {
|
||||
templateFile = config.TemplateConfig.Completion
|
||||
}
|
||||
|
||||
if input.Stream {
|
||||
if (len(config.PromptStrings) > 1) {
|
||||
return errors.New("cannot handle more than 1 `PromptStrings` when `Stream`ing")
|
||||
}
|
||||
|
||||
predInput := config.PromptStrings[0]
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := o.loader.TemplatePrefix(templateFile, struct {
|
||||
Input string
|
||||
}{Input: predInput})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||||
}
|
||||
|
||||
responses := make(chan OpenAIResponse)
|
||||
|
||||
go process(predInput, input, config, o.loader, responses)
|
||||
|
||||
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
|
||||
|
||||
for ev := range responses {
|
||||
var buf bytes.Buffer
|
||||
enc := json.NewEncoder(&buf)
|
||||
enc.Encode(ev)
|
||||
|
||||
log.Debug().Msgf("Sending chunk: %s", buf.String())
|
||||
fmt.Fprintf(w, "data: %v\n", buf.String())
|
||||
w.Flush()
|
||||
}
|
||||
|
||||
resp := &OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []Choice{{FinishReason: "stop"}},
|
||||
}
|
||||
respData, _ := json.Marshal(resp)
|
||||
|
||||
w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
|
||||
w.WriteString("data: [DONE]\n\n")
|
||||
w.Flush()
|
||||
}))
|
||||
return nil
|
||||
}
|
||||
|
||||
var result []Choice
|
||||
for _, i := range config.PromptStrings {
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := o.loader.TemplatePrefix(templateFile, struct {
|
||||
Input string
|
||||
}{Input: i})
|
||||
if err == nil {
|
||||
i = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||||
}
|
||||
|
||||
r, err := ComputeChoices(i, input, config, o.loader, func(s string, c *[]Choice) {
|
||||
*c = append(*c, Choice{Text: s})
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
result = append(result, r...)
|
||||
}
|
||||
|
||||
resp := &OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "text_completion",
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
|
||||
// https://platform.openai.com/docs/api-reference/embeddings
|
||||
func embeddingsEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
model, input, err := readInput(c, o.loader, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := readConfig(model, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
items := []Item{}
|
||||
|
||||
for i, s := range config.InputToken {
|
||||
// get the model function to call for the result
|
||||
embedFn, err := ModelEmbedding("", s, o.loader, *config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
embeddings, err := embedFn()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
|
||||
}
|
||||
|
||||
for i, s := range config.InputStrings {
|
||||
// get the model function to call for the result
|
||||
embedFn, err := ModelEmbedding(s, []int{}, o.loader, *config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
embeddings, err := embedFn()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
|
||||
}
|
||||
|
||||
resp := &OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Data: items,
|
||||
Object: "list",
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
|
||||
func chatEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
|
||||
|
||||
process := func(s string, req *OpenAIRequest, config *Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
|
||||
initialMessage := OpenAIResponse{
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []Choice{{Delta: &Message{Role: "assistant"}}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
responses <- initialMessage
|
||||
|
||||
ComputeChoices(s, req, config, loader, func(s string, c *[]Choice) {}, func(s string) bool {
|
||||
resp := OpenAIResponse{
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []Choice{{Delta: &Message{Content: s}}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
log.Debug().Msgf("Sending goroutine: %s", s)
|
||||
|
||||
responses <- resp
|
||||
return true
|
||||
})
|
||||
close(responses)
|
||||
}
|
||||
return func(c *fiber.Ctx) error {
|
||||
model, input, err := readInput(c, o.loader, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := readConfig(model, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
var predInput string
|
||||
|
||||
mess := []string{}
|
||||
for _, i := range input.Messages {
|
||||
var content string
|
||||
r := config.Roles[i.Role]
|
||||
if r != "" {
|
||||
content = fmt.Sprint(r, " ", i.Content)
|
||||
} else {
|
||||
content = i.Content
|
||||
}
|
||||
|
||||
mess = append(mess, content)
|
||||
}
|
||||
|
||||
predInput = strings.Join(mess, "\n")
|
||||
|
||||
if input.Stream {
|
||||
log.Debug().Msgf("Stream request received")
|
||||
c.Context().SetContentType("text/event-stream")
|
||||
//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
|
||||
// c.Set("Content-Type", "text/event-stream")
|
||||
c.Set("Cache-Control", "no-cache")
|
||||
c.Set("Connection", "keep-alive")
|
||||
c.Set("Transfer-Encoding", "chunked")
|
||||
}
|
||||
|
||||
templateFile := config.Model
|
||||
|
||||
if config.TemplateConfig.Chat != "" {
|
||||
templateFile = config.TemplateConfig.Chat
|
||||
}
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := o.loader.TemplatePrefix(templateFile, struct {
|
||||
Input string
|
||||
}{Input: predInput})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||||
}
|
||||
|
||||
if input.Stream {
|
||||
responses := make(chan OpenAIResponse)
|
||||
|
||||
go process(predInput, input, config, o.loader, responses)
|
||||
|
||||
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
|
||||
|
||||
for ev := range responses {
|
||||
var buf bytes.Buffer
|
||||
enc := json.NewEncoder(&buf)
|
||||
enc.Encode(ev)
|
||||
|
||||
log.Debug().Msgf("Sending chunk: %s", buf.String())
|
||||
fmt.Fprintf(w, "data: %v\n", buf.String())
|
||||
w.Flush()
|
||||
}
|
||||
|
||||
resp := &OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []Choice{{FinishReason: "stop"}},
|
||||
}
|
||||
respData, _ := json.Marshal(resp)
|
||||
|
||||
w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
|
||||
w.WriteString("data: [DONE]\n\n")
|
||||
w.Flush()
|
||||
}))
|
||||
return nil
|
||||
}
|
||||
|
||||
result, err := ComputeChoices(predInput, input, config, o.loader, func(s string, c *[]Choice) {
|
||||
*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: s}})
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
resp := &OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "chat.completion",
|
||||
}
|
||||
respData, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", respData)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
|
||||
func editEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
model, input, err := readInput(c, o.loader, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := readConfig(model, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
templateFile := config.Model
|
||||
|
||||
if config.TemplateConfig.Edit != "" {
|
||||
templateFile = config.TemplateConfig.Edit
|
||||
}
|
||||
|
||||
var result []Choice
|
||||
for _, i := range config.InputStrings {
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := o.loader.TemplatePrefix(templateFile, struct {
|
||||
Input string
|
||||
Instruction string
|
||||
}{Input: i})
|
||||
if err == nil {
|
||||
i = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||||
}
|
||||
|
||||
r, err := ComputeChoices(i, input, config, o.loader, func(s string, c *[]Choice) {
|
||||
*c = append(*c, Choice{Text: s})
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
result = append(result, r...)
|
||||
}
|
||||
|
||||
resp := &OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "edit",
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
|
||||
// https://platform.openai.com/docs/api-reference/images/create
|
||||
|
||||
/*
|
||||
*
|
||||
|
||||
curl http://localhost:8080/v1/images/generations \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"prompt": "A cute baby sea otter",
|
||||
"n": 1,
|
||||
"size": "512x512"
|
||||
}'
|
||||
|
||||
*
|
||||
*/
|
||||
func imageEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
m, input, err := readInput(c, o.loader, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
if m == "" {
|
||||
m = model.StableDiffusionBackend
|
||||
}
|
||||
log.Debug().Msgf("Loading model: %+v", m)
|
||||
|
||||
config, input, err := readConfig(m, input, cm, o.loader, o.debug, 0, 0, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
// XXX: Only stablediffusion is supported for now
|
||||
if config.Backend == "" {
|
||||
config.Backend = model.StableDiffusionBackend
|
||||
}
|
||||
|
||||
sizeParts := strings.Split(input.Size, "x")
|
||||
if len(sizeParts) != 2 {
|
||||
return fmt.Errorf("Invalid value for 'size'")
|
||||
}
|
||||
width, err := strconv.Atoi(sizeParts[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("Invalid value for 'size'")
|
||||
}
|
||||
height, err := strconv.Atoi(sizeParts[1])
|
||||
if err != nil {
|
||||
return fmt.Errorf("Invalid value for 'size'")
|
||||
}
|
||||
|
||||
b64JSON := false
|
||||
if input.ResponseFormat == "b64_json" {
|
||||
b64JSON = true
|
||||
}
|
||||
|
||||
var result []Item
|
||||
for _, i := range config.PromptStrings {
|
||||
n := input.N
|
||||
if input.N == 0 {
|
||||
n = 1
|
||||
}
|
||||
for j := 0; j < n; j++ {
|
||||
prompts := strings.Split(i, "|")
|
||||
positive_prompt := prompts[0]
|
||||
negative_prompt := ""
|
||||
if len(prompts) > 1 {
|
||||
negative_prompt = prompts[1]
|
||||
}
|
||||
|
||||
mode := 0
|
||||
step := 15
|
||||
|
||||
if input.Mode != 0 {
|
||||
mode = input.Mode
|
||||
}
|
||||
|
||||
if input.Step != 0 {
|
||||
step = input.Step
|
||||
}
|
||||
|
||||
tempDir := ""
|
||||
if !b64JSON {
|
||||
tempDir = o.imageDir
|
||||
}
|
||||
// Create a temporary file
|
||||
outputFile, err := ioutil.TempFile(tempDir, "b64")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
outputFile.Close()
|
||||
output := outputFile.Name() + ".png"
|
||||
// Rename the temporary file
|
||||
err = os.Rename(outputFile.Name(), output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
baseURL := c.BaseURL()
|
||||
|
||||
fn, err := ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, output, o.loader, *config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := fn(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
item := &Item{}
|
||||
|
||||
if b64JSON {
|
||||
defer os.RemoveAll(output)
|
||||
data, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
item.B64JSON = base64.StdEncoding.EncodeToString(data)
|
||||
} else {
|
||||
base := filepath.Base(output)
|
||||
item.URL = baseURL + "/generated-images/" + base
|
||||
}
|
||||
|
||||
result = append(result, *item)
|
||||
}
|
||||
}
|
||||
|
||||
resp := &OpenAIResponse{
|
||||
Data: result,
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
|
||||
// https://platform.openai.com/docs/api-reference/audio/create
|
||||
func transcriptEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
m, input, err := readInput(c, o.loader, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := readConfig(m, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
// retrieve the file data from the request
|
||||
file, err := c.FormFile("file")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
f, err := file.Open()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
dir, err := os.MkdirTemp("", "whisper")
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer os.RemoveAll(dir)
|
||||
|
||||
dst := filepath.Join(dir, path.Base(file.Filename))
|
||||
dstFile, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if _, err := io.Copy(dstFile, f); err != nil {
|
||||
log.Debug().Msgf("Audio file copying error %+v - %+v - err %+v", file.Filename, dst, err)
|
||||
return err
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Audio file copied to: %+v", dst)
|
||||
|
||||
whisperModel, err := o.loader.BackendLoader(model.WhisperBackend, config.Model, []llama.ModelOption{}, uint32(config.Threads))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if whisperModel == nil {
|
||||
return fmt.Errorf("could not load whisper model")
|
||||
}
|
||||
|
||||
w, ok := whisperModel.(whisper.Model)
|
||||
if !ok {
|
||||
return fmt.Errorf("loader returned non-whisper object")
|
||||
}
|
||||
|
||||
tr, err := whisperutil.Transcript(w, dst, input.Language, uint(config.Threads))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Trascribed: %+v", tr)
|
||||
// TODO: handle different outputs here
|
||||
return c.Status(http.StatusOK).JSON(fiber.Map{"text": tr})
|
||||
}
|
||||
}
|
||||
|
||||
func listModels(loader *model.ModelLoader, cm *ConfigMerger) func(ctx *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
models, err := loader.ListModels()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var mm map[string]interface{} = map[string]interface{}{}
|
||||
|
||||
dataModels := []OpenAIModel{}
|
||||
for _, m := range models {
|
||||
mm[m] = nil
|
||||
dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
|
||||
}
|
||||
|
||||
for _, k := range cm.ListConfigs() {
|
||||
if _, exists := mm[k]; !exists {
|
||||
dataModels = append(dataModels, OpenAIModel{ID: k, Object: "model"})
|
||||
}
|
||||
}
|
||||
|
||||
return c.JSON(struct {
|
||||
Object string `json:"object"`
|
||||
Data []OpenAIModel `json:"data"`
|
||||
}{
|
||||
Object: "list",
|
||||
Data: dataModels,
|
||||
})
|
||||
}
|
||||
}
|
||||
api/openai/chat.go (new file, 373 lines)
@@ -0,0 +1,373 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/go-skynet/LocalAI/api/backend"
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/api/schema"
|
||||
"github.com/go-skynet/LocalAI/pkg/grammar"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/pkg/utils"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/valyala/fasthttp"
|
||||
)
|
||||
|
||||
func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
||||
emptyMessage := ""
|
||||
|
||||
process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||||
initialMessage := schema.OpenAIResponse{
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
|
||||
Object: "chat.completion.chunk",
|
||||
}
|
||||
responses <- initialMessage
|
||||
|
||||
ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||
resp := schema.OpenAIResponse{
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: usage.Prompt,
|
||||
CompletionTokens: usage.Completion,
|
||||
TotalTokens: usage.Prompt + usage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
responses <- resp
|
||||
return true
|
||||
})
|
||||
close(responses)
|
||||
}
|
||||
return func(c *fiber.Ctx) error {
|
||||
processFunctions := false
|
||||
funcs := grammar.Functions{}
|
||||
modelFile, input, err := readInput(c, o, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
log.Debug().Msgf("Configuration read: %+v", config)
|
||||
|
||||
// Allow the user to set custom actions via config file
|
||||
// to be "embedded" in each model
|
||||
noActionName := "answer"
|
||||
noActionDescription := "use this action to answer without performing any action"
|
||||
|
||||
if config.FunctionsConfig.NoActionFunctionName != "" {
|
||||
noActionName = config.FunctionsConfig.NoActionFunctionName
|
||||
}
|
||||
if config.FunctionsConfig.NoActionDescriptionName != "" {
|
||||
noActionDescription = config.FunctionsConfig.NoActionDescriptionName
|
||||
}
|
||||
|
||||
// process functions if we have any defined or if we have a function call string
|
||||
if len(input.Functions) > 0 && config.ShouldUseFunctions() {
|
||||
log.Debug().Msgf("Response needs to process functions")
|
||||
|
||||
processFunctions = true
|
||||
|
||||
noActionGrammar := grammar.Function{
|
||||
Name: noActionName,
|
||||
Description: noActionDescription,
|
||||
Parameters: map[string]interface{}{
|
||||
"properties": map[string]interface{}{
|
||||
"message": map[string]interface{}{
|
||||
"type": "string",
|
||||
"description": "The message to reply the user with",
|
||||
}},
|
||||
},
|
||||
}
|
||||
|
||||
// Append the no action function
|
||||
funcs = append(funcs, input.Functions...)
|
||||
if !config.FunctionsConfig.DisableNoAction {
|
||||
funcs = append(funcs, noActionGrammar)
|
||||
}
|
||||
|
||||
// Force picking one of the functions by the request
|
||||
if config.FunctionToCall() != "" {
|
||||
funcs = funcs.Select(config.FunctionToCall())
|
||||
}
|
||||
|
||||
// Update input grammar
|
||||
jsStruct := funcs.ToJSONStructure()
|
||||
config.Grammar = jsStruct.Grammar("")
|
||||
} else if input.JSONFunctionGrammarObject != nil {
|
||||
config.Grammar = input.JSONFunctionGrammarObject.Grammar("")
|
||||
}
|
||||
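// Illustrative request that would take the functions branch above. The field
// name "functions", the model name and the schema are assumptions in the
// style of the OpenAI API; they are not taken from this diff:
//
//   {
//     "model": "some-model",
//     "messages": [{"role": "user", "content": "What's the weather in Rome?"}],
//     "functions": [{
//       "name": "get_weather",
//       "description": "Look up the current weather for a city",
//       "parameters": {"type": "object", "properties": {"city": {"type": "string"}}}
//     }]
//   }
//
// When functions are present (and allowed by the config), the grammar built
// above constrains generation to a JSON object that names either one of the
// supplied functions or the no-action "answer" function.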
|
||||
// functions are not supported in stream mode (yet?)
|
||||
toStream := input.Stream && !processFunctions
|
||||
|
||||
log.Debug().Msgf("Parameters: %+v", config)
|
||||
|
||||
var predInput string
|
||||
|
||||
suppressConfigSystemPrompt := false
|
||||
mess := []string{}
|
||||
for messageIndex, i := range input.Messages {
|
||||
var content string
|
||||
role := i.Role
|
||||
|
||||
// if this is a function call, we might want to customize the role so we can better show that the assistant performed a JSON action
// if an "assistant_function_call" role is defined, we use it, otherwise we use the role passed in the request
|
||||
if i.FunctionCall != nil && i.Role == "assistant" {
|
||||
roleFn := "assistant_function_call"
|
||||
r := config.Roles[roleFn]
|
||||
if r != "" {
|
||||
role = roleFn
|
||||
}
|
||||
}
|
||||
r := config.Roles[role]
|
||||
contentExists := i.Content != nil && *i.Content != ""
|
||||
// First attempt to populate content via a chat message specific template
|
||||
if config.TemplateConfig.ChatMessage != "" {
|
||||
chatMessageData := model.ChatMessageTemplateData{
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
Role: r,
|
||||
RoleName: role,
|
||||
Content: *i.Content,
|
||||
MessageIndex: messageIndex,
|
||||
}
|
||||
templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
|
||||
if err != nil {
|
||||
log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, config.TemplateConfig.ChatMessage, err)
|
||||
} else {
|
||||
if templatedChatMessage == "" {
|
||||
log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
|
||||
continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
|
||||
}
|
||||
log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
|
||||
content = templatedChatMessage
|
||||
}
|
||||
}
|
||||
// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
|
||||
if content == "" {
|
||||
if r != "" {
|
||||
if contentExists {
|
||||
content = fmt.Sprint(r, " ", *i.Content)
|
||||
}
|
||||
if i.FunctionCall != nil {
|
||||
j, err := json.Marshal(i.FunctionCall)
|
||||
if err == nil {
|
||||
if contentExists {
|
||||
content += "\n" + fmt.Sprint(r, " ", string(j))
|
||||
} else {
|
||||
content = fmt.Sprint(r, " ", string(j))
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if contentExists {
|
||||
content = fmt.Sprint(*i.Content)
|
||||
}
|
||||
if i.FunctionCall != nil {
|
||||
j, err := json.Marshal(i.FunctionCall)
|
||||
if err == nil {
|
||||
if contentExists {
|
||||
content += "\n" + string(j)
|
||||
} else {
|
||||
content = string(j)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Special Handling: System. We care if it was printed at all, not the r branch, so check separately
|
||||
if contentExists && role == "system" {
|
||||
suppressConfigSystemPrompt = true
|
||||
}
|
||||
}
|
||||
|
||||
mess = append(mess, content)
|
||||
}
|
||||
|
||||
predInput = strings.Join(mess, "\n")
|
||||
log.Debug().Msgf("Prompt (before templating): %s", predInput)
|
||||
|
||||
if toStream {
|
||||
log.Debug().Msgf("Stream request received")
|
||||
c.Context().SetContentType("text/event-stream")
|
||||
//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
|
||||
// c.Set("Content-Type", "text/event-stream")
|
||||
c.Set("Cache-Control", "no-cache")
|
||||
c.Set("Connection", "keep-alive")
|
||||
c.Set("Transfer-Encoding", "chunked")
|
||||
}
|
||||
|
||||
templateFile := config.Model
|
||||
|
||||
if config.TemplateConfig.Chat != "" && !processFunctions {
|
||||
templateFile = config.TemplateConfig.Chat
|
||||
}
|
||||
|
||||
if config.TemplateConfig.Functions != "" && processFunctions {
|
||||
templateFile = config.TemplateConfig.Functions
|
||||
}
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
SuppressSystemPrompt: suppressConfigSystemPrompt,
|
||||
Input: predInput,
|
||||
Functions: funcs,
|
||||
})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||||
} else {
|
||||
log.Debug().Msgf("Template failed loading: %s", err.Error())
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Prompt (after templating): %s", predInput)
|
||||
if processFunctions {
|
||||
log.Debug().Msgf("Grammar: %+v", config.Grammar)
|
||||
}
|
||||
|
||||
if toStream {
|
||||
responses := make(chan schema.OpenAIResponse)
|
||||
|
||||
go process(predInput, input, config, o.Loader, responses)
|
||||
|
||||
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
|
||||
|
||||
usage := &schema.OpenAIUsage{}
|
||||
|
||||
for ev := range responses {
|
||||
usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it
|
||||
var buf bytes.Buffer
|
||||
enc := json.NewEncoder(&buf)
|
||||
enc.Encode(ev)
|
||||
log.Debug().Msgf("Sending chunk: %s", buf.String())
|
||||
_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
|
||||
if err != nil {
|
||||
log.Debug().Msgf("Sending chunk failed: %v", err)
|
||||
input.Cancel()
|
||||
break
|
||||
}
|
||||
w.Flush()
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{
|
||||
{
|
||||
FinishReason: "stop",
|
||||
Index: 0,
|
||||
Delta: &schema.Message{Content: &emptyMessage},
|
||||
}},
|
||||
Object: "chat.completion.chunk",
|
||||
Usage: *usage,
|
||||
}
|
||||
respData, _ := json.Marshal(resp)
|
||||
|
||||
w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
|
||||
w.WriteString("data: [DONE]\n\n")
|
||||
w.Flush()
|
||||
}))
|
||||
return nil
|
||||
}
|
||||
|
||||
result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) {
|
||||
if processFunctions {
|
||||
// As we have to change the result before processing, we can't stream the answer (yet?)
|
||||
ss := map[string]interface{}{}
|
||||
// This prevents newlines from breaking JSON parsing for clients
|
||||
s = utils.EscapeNewLines(s)
|
||||
json.Unmarshal([]byte(s), &ss)
|
||||
log.Debug().Msgf("Function return: %s %+v", s, ss)
|
||||
|
||||
// The grammar defines the function name as "function", while OpenAI returns "name"
|
||||
func_name := ss["function"]
|
||||
// Similarly, while here arguments is a map[string]interface{}, OpenAI actually wants a stringified object
|
||||
args := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
|
||||
d, _ := json.Marshal(args)
|
||||
|
||||
ss["arguments"] = string(d)
|
||||
ss["name"] = func_name
|
||||
|
||||
// if do nothing, reply with a message
|
||||
if func_name == noActionName {
|
||||
log.Debug().Msgf("nothing to do, computing a reply")
|
||||
|
||||
// If there is a message that the LLM already sends as part of the JSON reply, use it
|
||||
arguments := map[string]interface{}{}
|
||||
json.Unmarshal([]byte(d), &arguments)
|
||||
m, exists := arguments["message"]
|
||||
if exists {
|
||||
switch message := m.(type) {
|
||||
case string:
|
||||
if message != "" {
|
||||
log.Debug().Msgf("Reply received from LLM: %s", message)
|
||||
message = backend.Finetune(*config, predInput, message)
|
||||
log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
|
||||
|
||||
*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}})
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
|
||||
// Otherwise ask the LLM to understand the JSON output and the context, and return a message
|
||||
// Note: This costs (in terms of CPU) another computation
|
||||
config.Grammar = ""
|
||||
predFunc, err := backend.ModelInference(input.Context, predInput, o.Loader, *config, o, nil)
|
||||
if err != nil {
|
||||
log.Error().Msgf("inference error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
prediction, err := predFunc()
|
||||
if err != nil {
|
||||
log.Error().Msgf("inference error: %s", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response)
|
||||
*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}})
|
||||
} else {
|
||||
// otherwise reply with the function call
|
||||
*c = append(*c, schema.Choice{
|
||||
FinishReason: "function_call",
|
||||
Message: &schema.Message{Role: "assistant", FunctionCall: ss},
|
||||
})
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "chat.completion",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: tokenUsage.Prompt,
|
||||
CompletionTokens: tokenUsage.Completion,
|
||||
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
|
||||
},
|
||||
}
|
||||
respData, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", respData)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
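A sketch of how a client might consume the streaming branch implemented above. Assumptions: ChatEndpoint is mounted at /v1/chat/completions (OpenAI-style; the route registration is not in this hunk) and the model name is illustrative. The handler emits "data: {json}" lines and finishes with "data: [DONE]", so the client reads the body line by line.

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	body := bytes.NewBufferString(`{"model": "some-model", "stream": true, "messages": [{"role": "user", "content": "Hello!"}]}`)
	resp, err := http.Post("http://localhost:8080/v1/chat/completions", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "data: ") {
			continue // skip blank separator lines
		}
		payload := strings.TrimPrefix(line, "data: ")
		if payload == "[DONE]" {
			break
		}
		fmt.Println(payload) // each payload is one chat.completion.chunk JSON object
	}
}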
api/openai/completion.go (new file, 175 lines)
@@ -0,0 +1,175 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"github.com/go-skynet/LocalAI/api/backend"
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/api/schema"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
"github.com/valyala/fasthttp"
|
||||
)
|
||||
|
||||
// https://platform.openai.com/docs/api-reference/completions
|
||||
func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
||||
process := func(s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
|
||||
ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||
resp := schema.OpenAIResponse{
|
||||
Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{
|
||||
{
|
||||
Index: 0,
|
||||
Text: s,
|
||||
},
|
||||
},
|
||||
Object: "text_completion",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: usage.Prompt,
|
||||
CompletionTokens: usage.Completion,
|
||||
TotalTokens: usage.Prompt + usage.Completion,
|
||||
},
|
||||
}
|
||||
log.Debug().Msgf("Sending goroutine: %s", s)
|
||||
|
||||
responses <- resp
|
||||
return true
|
||||
})
|
||||
close(responses)
|
||||
}
|
||||
|
||||
return func(c *fiber.Ctx) error {
|
||||
modelFile, input, err := readInput(c, o, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("`input`: %+v", input)
|
||||
|
||||
config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
if input.Stream {
|
||||
log.Debug().Msgf("Stream request received")
|
||||
c.Context().SetContentType("text/event-stream")
|
||||
//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
|
||||
//c.Set("Content-Type", "text/event-stream")
|
||||
c.Set("Cache-Control", "no-cache")
|
||||
c.Set("Connection", "keep-alive")
|
||||
c.Set("Transfer-Encoding", "chunked")
|
||||
}
|
||||
|
||||
templateFile := config.Model
|
||||
|
||||
if config.TemplateConfig.Completion != "" {
|
||||
templateFile = config.TemplateConfig.Completion
|
||||
}
|
||||
|
||||
if input.Stream {
|
||||
if len(config.PromptStrings) > 1 {
|
||||
return errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
|
||||
}
|
||||
|
||||
predInput := config.PromptStrings[0]
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
Input: predInput,
|
||||
})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", predInput)
|
||||
}
|
||||
|
||||
responses := make(chan schema.OpenAIResponse)
|
||||
|
||||
go process(predInput, input, config, o.Loader, responses)
|
||||
|
||||
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
|
||||
|
||||
for ev := range responses {
|
||||
var buf bytes.Buffer
|
||||
enc := json.NewEncoder(&buf)
|
||||
enc.Encode(ev)
|
||||
|
||||
log.Debug().Msgf("Sending chunk: %s", buf.String())
|
||||
fmt.Fprintf(w, "data: %v\n", buf.String())
|
||||
w.Flush()
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: []schema.Choice{
|
||||
{
|
||||
Index: 0,
|
||||
FinishReason: "stop",
|
||||
},
|
||||
},
|
||||
Object: "text_completion",
|
||||
}
|
||||
respData, _ := json.Marshal(resp)
|
||||
|
||||
w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
|
||||
w.WriteString("data: [DONE]\n\n")
|
||||
w.Flush()
|
||||
}))
|
||||
return nil
|
||||
}
|
||||
|
||||
var result []schema.Choice
|
||||
|
||||
totalTokenUsage := backend.TokenUsage{}
|
||||
|
||||
for k, i := range config.PromptStrings {
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
Input: i,
|
||||
})
|
||||
if err == nil {
|
||||
i = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||||
}
|
||||
|
||||
r, tokenUsage, err := ComputeChoices(
|
||||
input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
|
||||
*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
totalTokenUsage.Prompt += tokenUsage.Prompt
|
||||
totalTokenUsage.Completion += tokenUsage.Completion
|
||||
|
||||
result = append(result, r...)
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "text_completion",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: totalTokenUsage.Prompt,
|
||||
CompletionTokens: totalTokenUsage.Completion,
|
||||
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
api/openai/edit.go (new file, 82 lines)
@@ -0,0 +1,82 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"github.com/go-skynet/LocalAI/api/backend"
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/api/schema"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
modelFile, input, err := readInput(c, o, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
templateFile := config.Model
|
||||
|
||||
if config.TemplateConfig.Edit != "" {
|
||||
templateFile = config.TemplateConfig.Edit
|
||||
}
|
||||
|
||||
var result []schema.Choice
|
||||
totalTokenUsage := backend.TokenUsage{}
|
||||
|
||||
for _, i := range config.InputStrings {
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
|
||||
Input: i,
|
||||
Instruction: input.Instruction,
|
||||
SystemPrompt: config.SystemPrompt,
|
||||
})
|
||||
if err == nil {
|
||||
i = templatedInput
|
||||
log.Debug().Msgf("Template found, input modified to: %s", i)
|
||||
}
|
||||
|
||||
r, tokenUsage, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
|
||||
*c = append(*c, schema.Choice{Text: s})
|
||||
}, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
totalTokenUsage.Prompt += tokenUsage.Prompt
|
||||
totalTokenUsage.Completion += tokenUsage.Completion
|
||||
|
||||
result = append(result, r...)
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Choices: result,
|
||||
Object: "edit",
|
||||
Usage: schema.OpenAIUsage{
|
||||
PromptTokens: totalTokenUsage.Prompt,
|
||||
CompletionTokens: totalTokenUsage.Completion,
|
||||
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
|
||||
},
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
api/openai/embeddings.go (new file, 72 lines)
@@ -0,0 +1,72 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"github.com/go-skynet/LocalAI/api/backend"
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/schema"
|
||||
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// https://platform.openai.com/docs/api-reference/embeddings
|
||||
func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
model, input, err := readInput(c, o, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
config, input, err := readConfig(model, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
items := []schema.Item{}
|
||||
|
||||
for i, s := range config.InputToken {
|
||||
// get the model function to call for the result
|
||||
embedFn, err := backend.ModelEmbedding("", s, o.Loader, *config, o)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
embeddings, err := embedFn()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
|
||||
}
|
||||
|
||||
for i, s := range config.InputStrings {
|
||||
// get the model function to call for the result
|
||||
embedFn, err := backend.ModelEmbedding(s, []int{}, o.Loader, *config, o)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
embeddings, err := embedFn()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
items = append(items, schema.Item{Embedding: embeddings, Index: i, Object: "embedding"})
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
|
||||
Data: items,
|
||||
Object: "list",
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
||||
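A decoding sketch for the response shape produced above (an OpenAIResponse with Object "list" and Item entries). The /v1/embeddings path and the model name are assumptions; the JSON field names ("data", "embedding", "index") come from the schema used by the handler.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body := bytes.NewBufferString(`{"model": "bert-embeddings", "input": "A sentence to embed"}`)
	resp, err := http.Post("http://localhost:8080/v1/embeddings", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out struct {
		Data []struct {
			Embedding []float32 `json:"embedding"`
			Index     int       `json:"index"`
		} `json:"data"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	if len(out.Data) > 0 {
		fmt.Printf("got a %d-dimensional embedding\n", len(out.Data[0].Embedding))
	}
}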
api/openai/image.go (new file, 187 lines)
@@ -0,0 +1,187 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"github.com/go-skynet/LocalAI/api/schema"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/go-skynet/LocalAI/api/backend"
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
"github.com/go-skynet/LocalAI/api/options"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// https://platform.openai.com/docs/api-reference/images/create
|
||||
|
||||
/*
|
||||
*
|
||||
|
||||
curl http://localhost:8080/v1/images/generations \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"prompt": "A cute baby sea otter",
|
||||
"n": 1,
|
||||
"size": "512x512"
|
||||
}'
|
||||
|
||||
*
|
||||
*/
|
||||
func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
m, input, err := readInput(c, o, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
if m == "" {
|
||||
m = model.StableDiffusionBackend
|
||||
}
|
||||
log.Debug().Msgf("Loading model: %+v", m)
|
||||
|
||||
config, input, err := readConfig(m, input, cm, o.Loader, o.Debug, 0, 0, false)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
}
|
||||
|
||||
src := ""
|
||||
if input.File != "" {
|
||||
// base64-decode the file and write it somewhere
// that we will clean up afterwards
|
||||
decoded, err := base64.StdEncoding.DecodeString(input.File)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(o.ImageDir, "b64")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write the base64 result
|
||||
writer := bufio.NewWriter(outputFile)
|
||||
_, err = writer.Write(decoded)
|
||||
if err != nil {
|
||||
outputFile.Close()
|
||||
return err
|
||||
}
|
||||
outputFile.Close()
|
||||
src = outputFile.Name()
|
||||
defer os.RemoveAll(src)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
// XXX: Only stablediffusion is supported for now
|
||||
if config.Backend == "" {
|
||||
config.Backend = model.StableDiffusionBackend
|
||||
}
|
||||
|
||||
sizeParts := strings.Split(input.Size, "x")
|
||||
if len(sizeParts) != 2 {
|
||||
return fmt.Errorf("Invalid value for 'size'")
|
||||
}
|
||||
width, err := strconv.Atoi(sizeParts[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("Invalid value for 'size'")
|
||||
}
|
||||
height, err := strconv.Atoi(sizeParts[1])
|
||||
if err != nil {
|
||||
return fmt.Errorf("Invalid value for 'size'")
|
||||
}
|
||||
|
||||
b64JSON := false
|
||||
if input.ResponseFormat == "b64_json" {
|
||||
b64JSON = true
|
||||
}
|
||||
// src and clip_skip
|
||||
var result []schema.Item
|
||||
for _, i := range config.PromptStrings {
|
||||
n := input.N
|
||||
if input.N == 0 {
|
||||
n = 1
|
||||
}
|
||||
for j := 0; j < n; j++ {
|
||||
prompts := strings.Split(i, "|")
|
||||
positive_prompt := prompts[0]
|
||||
negative_prompt := ""
|
||||
if len(prompts) > 1 {
|
||||
negative_prompt = prompts[1]
|
||||
}
|
||||
|
||||
mode := 0
|
||||
step := config.Step
|
||||
if step == 0 {
|
||||
step = 15
|
||||
}
|
||||
|
||||
if input.Mode != 0 {
|
||||
mode = input.Mode
|
||||
}
|
||||
|
||||
if input.Step != 0 {
|
||||
step = input.Step
|
||||
}
|
||||
|
||||
tempDir := ""
|
||||
if !b64JSON {
|
||||
tempDir = o.ImageDir
|
||||
}
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(tempDir, "b64")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
outputFile.Close()
|
||||
output := outputFile.Name() + ".png"
|
||||
// Rename the temporary file
|
||||
err = os.Rename(outputFile.Name(), output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
baseURL := c.BaseURL()
|
||||
|
||||
fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, src, output, o.Loader, *config, o)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := fn(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
item := &schema.Item{}
|
||||
|
||||
if b64JSON {
|
||||
defer os.RemoveAll(output)
|
||||
data, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
item.B64JSON = base64.StdEncoding.EncodeToString(data)
|
||||
} else {
|
||||
base := filepath.Base(output)
|
||||
item.URL = baseURL + "/generated-images/" + base
|
||||
}
|
||||
|
||||
result = append(result, *item)
|
||||
}
|
||||
}
|
||||
|
||||
resp := &schema.OpenAIResponse{
|
||||
Data: result,
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
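As a usage illustration for the image endpoint above, here is a minimal Go client sketch. It assumes a LocalAI instance on localhost:8080 exposing the OpenAI-style /v1/images/generations route; the model name and prompts are placeholders, and the "positive|negative" prompt separator and widthxheight size format follow the parsing shown above.

package main

// Sketch only: posts an image-generation request mirroring the fields read by the
// endpoint above (prompt, size, response_format). Host, port, route and model name are assumptions.
import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body, _ := json.Marshal(map[string]interface{}{
		"model":           "stablediffusion",                             // assumed model name
		"prompt":          "a cat sitting on a roof|blurry, low quality", // positive|negative
		"size":            "512x512",                                     // parsed as widthxheight above
		"response_format": "b64_json",                                    // omit to get a URL under /generated-images/ instead
	})
	resp, err := http.Post("http://localhost:8080/v1/images/generations", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	var out map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out["data"])
}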
api/openai/inference.go (new file, 50 lines)
@@ -0,0 +1,50 @@
package openai

import (
	"github.com/go-skynet/LocalAI/api/backend"
	config "github.com/go-skynet/LocalAI/api/config"
	"github.com/go-skynet/LocalAI/api/options"
	"github.com/go-skynet/LocalAI/api/schema"
	model "github.com/go-skynet/LocalAI/pkg/model"
)

func ComputeChoices(
	req *schema.OpenAIRequest,
	predInput string,
	config *config.Config,
	o *options.Option,
	loader *model.ModelLoader,
	cb func(string, *[]schema.Choice),
	tokenCallback func(string, backend.TokenUsage) bool) ([]schema.Choice, backend.TokenUsage, error) {
	n := req.N // number of completions to return
	result := []schema.Choice{}

	if n == 0 {
		n = 1
	}

	// get the model function to call for the result
	predFunc, err := backend.ModelInference(req.Context, predInput, loader, *config, o, tokenCallback)
	if err != nil {
		return result, backend.TokenUsage{}, err
	}

	tokenUsage := backend.TokenUsage{}

	for i := 0; i < n; i++ {
		prediction, err := predFunc()
		if err != nil {
			return result, backend.TokenUsage{}, err
		}

		tokenUsage.Prompt += prediction.Usage.Prompt
		tokenUsage.Completion += prediction.Usage.Completion

		finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
		cb(finetunedResponse, &result)

		//result = append(result, Choice{Text: prediction})

	}
	return result, tokenUsage, err
}
api/openai/list.go (new file, 69 lines)
@@ -0,0 +1,69 @@
package openai

import (
	"regexp"

	config "github.com/go-skynet/LocalAI/api/config"
	"github.com/go-skynet/LocalAI/api/schema"
	model "github.com/go-skynet/LocalAI/pkg/model"
	"github.com/gofiber/fiber/v2"
)

func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func(ctx *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {
		models, err := loader.ListModels()
		if err != nil {
			return err
		}
		var mm map[string]interface{} = map[string]interface{}{}

		dataModels := []schema.OpenAIModel{}

		var filterFn func(name string) bool
		filter := c.Query("filter")

		// If filter is not specified, do not filter the list by model name
		if filter == "" {
			filterFn = func(_ string) bool { return true }
		} else {
			// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
			rxp, err := regexp.Compile(filter)
			if err != nil {
				return err
			}
			filterFn = func(name string) bool {
				return rxp.MatchString(name)
			}
		}

		// By default, exclude any loose files that are already referenced by a configuration file.
		excludeConfigured := c.QueryBool("excludeConfigured", true)

		// Start with the known configurations
		for _, c := range cm.GetAllConfigs() {
			if excludeConfigured {
				mm[c.Model] = nil
			}

			if filterFn(c.Name) {
				dataModels = append(dataModels, schema.OpenAIModel{ID: c.Name, Object: "model"})
			}
		}

		// Then iterate through the loose files:
		for _, m := range models {
			// And only adds them if they shouldn't be skipped.
			if _, exists := mm[m]; !exists && filterFn(m) {
				dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
			}
		}

		return c.JSON(struct {
			Object string               `json:"object"`
			Data   []schema.OpenAIModel `json:"data"`
		}{
			Object: "list",
			Data:   dataModels,
		})
	}
}
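A small sketch of querying the listing endpoint above, including the filter regex and excludeConfigured query parameters it reads; the /v1/models path and the localhost:8080 address are assumptions.

package main

// Sketch only: lists models, filtering names with the "filter" regex parameter
// handled by ListModelsEndpoint above. Path and address are assumptions.
import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// excludeConfigured=false also returns loose model files already covered by a config
	resp, err := http.Get("http://localhost:8080/v1/models?filter=gpt.*&excludeConfigured=false")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // {"object":"list","data":[{"id":"...","object":"model"}, ...]}
}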
api/openai/request.go (new file, 273 lines)
@@ -0,0 +1,273 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
options "github.com/go-skynet/LocalAI/api/options"
|
||||
"github.com/go-skynet/LocalAI/api/schema"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *schema.OpenAIRequest, error) {
|
||||
loader := o.Loader
|
||||
input := new(schema.OpenAIRequest)
|
||||
ctx, cancel := context.WithCancel(o.Context)
|
||||
input.Context = ctx
|
||||
input.Cancel = cancel
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
modelFile := input.Model
|
||||
|
||||
if c.Params("model") != "" {
|
||||
modelFile = c.Params("model")
|
||||
}
|
||||
|
||||
received, _ := json.Marshal(input)
|
||||
|
||||
log.Debug().Msgf("Request received: %s", string(received))
|
||||
|
||||
// Set model from bearer token, if available
|
||||
bearer := strings.TrimLeft(c.Get("authorization"), "Bearer ")
|
||||
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
|
||||
|
||||
// If no model was specified, take the first available
|
||||
if modelFile == "" && !bearerExists && randomModel {
|
||||
models, _ := loader.ListModels()
|
||||
if len(models) > 0 {
|
||||
modelFile = models[0]
|
||||
log.Debug().Msgf("No model specified, using: %s", modelFile)
|
||||
} else {
|
||||
log.Debug().Msgf("No model specified, returning error")
|
||||
return "", nil, fmt.Errorf("no model specified")
|
||||
}
|
||||
}
|
||||
|
||||
// If a model is found in bearer token takes precedence
|
||||
if bearerExists {
|
||||
log.Debug().Msgf("Using model from bearer token: %s", bearer)
|
||||
modelFile = bearer
|
||||
}
|
||||
return modelFile, input, nil
|
||||
}
|
||||
|
||||
func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
|
||||
if input.Echo {
|
||||
config.Echo = input.Echo
|
||||
}
|
||||
if input.TopK != 0 {
|
||||
config.TopK = input.TopK
|
||||
}
|
||||
if input.TopP != 0 {
|
||||
config.TopP = input.TopP
|
||||
}
|
||||
|
||||
if input.Backend != "" {
|
||||
config.Backend = input.Backend
|
||||
}
|
||||
|
||||
if input.ClipSkip != 0 {
|
||||
config.Diffusers.ClipSkip = input.ClipSkip
|
||||
}
|
||||
|
||||
if input.ModelBaseName != "" {
|
||||
config.AutoGPTQ.ModelBaseName = input.ModelBaseName
|
||||
}
|
||||
|
||||
if input.NegativePromptScale != 0 {
|
||||
config.NegativePromptScale = input.NegativePromptScale
|
||||
}
|
||||
|
||||
if input.UseFastTokenizer {
|
||||
config.UseFastTokenizer = input.UseFastTokenizer
|
||||
}
|
||||
|
||||
if input.NegativePrompt != "" {
|
||||
config.NegativePrompt = input.NegativePrompt
|
||||
}
|
||||
|
||||
if input.RopeFreqBase != 0 {
|
||||
config.RopeFreqBase = input.RopeFreqBase
|
||||
}
|
||||
|
||||
if input.RopeFreqScale != 0 {
|
||||
config.RopeFreqScale = input.RopeFreqScale
|
||||
}
|
||||
|
||||
if input.Grammar != "" {
|
||||
config.Grammar = input.Grammar
|
||||
}
|
||||
|
||||
if input.Temperature != 0 {
|
||||
config.Temperature = input.Temperature
|
||||
}
|
||||
|
||||
if input.Maxtokens != 0 {
|
||||
config.Maxtokens = input.Maxtokens
|
||||
}
|
||||
|
||||
switch stop := input.Stop.(type) {
|
||||
case string:
|
||||
if stop != "" {
|
||||
config.StopWords = append(config.StopWords, stop)
|
||||
}
|
||||
case []interface{}:
|
||||
for _, pp := range stop {
|
||||
if s, ok := pp.(string); ok {
|
||||
config.StopWords = append(config.StopWords, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if input.RepeatPenalty != 0 {
|
||||
config.RepeatPenalty = input.RepeatPenalty
|
||||
}
|
||||
|
||||
if input.Keep != 0 {
|
||||
config.Keep = input.Keep
|
||||
}
|
||||
|
||||
if input.Batch != 0 {
|
||||
config.Batch = input.Batch
|
||||
}
|
||||
|
||||
if input.F16 {
|
||||
config.F16 = input.F16
|
||||
}
|
||||
|
||||
if input.IgnoreEOS {
|
||||
config.IgnoreEOS = input.IgnoreEOS
|
||||
}
|
||||
|
||||
if input.Seed != 0 {
|
||||
config.Seed = input.Seed
|
||||
}
|
||||
|
||||
if input.Mirostat != 0 {
|
||||
config.LLMConfig.Mirostat = input.Mirostat
|
||||
}
|
||||
|
||||
if input.MirostatETA != 0 {
|
||||
config.LLMConfig.MirostatETA = input.MirostatETA
|
||||
}
|
||||
|
||||
if input.MirostatTAU != 0 {
|
||||
config.LLMConfig.MirostatTAU = input.MirostatTAU
|
||||
}
|
||||
|
||||
if input.TypicalP != 0 {
|
||||
config.TypicalP = input.TypicalP
|
||||
}
|
||||
|
||||
switch inputs := input.Input.(type) {
|
||||
case string:
|
||||
if inputs != "" {
|
||||
config.InputStrings = append(config.InputStrings, inputs)
|
||||
}
|
||||
case []interface{}:
|
||||
for _, pp := range inputs {
|
||||
switch i := pp.(type) {
|
||||
case string:
|
||||
config.InputStrings = append(config.InputStrings, i)
|
||||
case []interface{}:
|
||||
tokens := []int{}
|
||||
for _, ii := range i {
|
||||
tokens = append(tokens, int(ii.(float64)))
|
||||
}
|
||||
config.InputToken = append(config.InputToken, tokens)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Can be either a string or an object
|
||||
switch fnc := input.FunctionCall.(type) {
|
||||
case string:
|
||||
if fnc != "" {
|
||||
config.SetFunctionCallString(fnc)
|
||||
}
|
||||
case map[string]interface{}:
|
||||
var name string
|
||||
n, exists := fnc["name"]
|
||||
if exists {
|
||||
nn, e := n.(string)
|
||||
if e {
|
||||
name = nn
|
||||
}
|
||||
}
|
||||
config.SetFunctionCallNameString(name)
|
||||
}
|
||||
|
||||
switch p := input.Prompt.(type) {
|
||||
case string:
|
||||
config.PromptStrings = append(config.PromptStrings, p)
|
||||
case []interface{}:
|
||||
for _, pp := range p {
|
||||
if s, ok := pp.(string); ok {
|
||||
config.PromptStrings = append(config.PromptStrings, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func readConfig(modelFile string, input *schema.OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *schema.OpenAIRequest, error) {
|
||||
// Load a config file if present after the model name
|
||||
modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
|
||||
|
||||
var cfg *config.Config
|
||||
|
||||
defaults := func() {
|
||||
cfg = config.DefaultConfig(modelFile)
|
||||
cfg.ContextSize = ctx
|
||||
cfg.Threads = threads
|
||||
cfg.F16 = f16
|
||||
cfg.Debug = debug
|
||||
}
|
||||
|
||||
cfgExisting, exists := cm.GetConfig(modelFile)
|
||||
if !exists {
|
||||
if _, err := os.Stat(modelConfig); err == nil {
|
||||
if err := cm.LoadConfig(modelConfig); err != nil {
|
||||
return nil, nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
|
||||
}
|
||||
cfgExisting, exists = cm.GetConfig(modelFile)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
} else {
|
||||
defaults()
|
||||
}
|
||||
} else {
|
||||
defaults()
|
||||
}
|
||||
} else {
|
||||
cfg = &cfgExisting
|
||||
}
|
||||
|
||||
// Set the parameters for the language model prediction
|
||||
updateConfig(cfg, input)
|
||||
|
||||
// Don't allow 0 as setting
|
||||
if cfg.Threads == 0 {
|
||||
if threads != 0 {
|
||||
cfg.Threads = threads
|
||||
} else {
|
||||
cfg.Threads = 4
|
||||
}
|
||||
}
|
||||
|
||||
// Enforce debug flag if passed from CLI
|
||||
if debug {
|
||||
cfg.Debug = true
|
||||
}
|
||||
|
||||
return cfg, input, nil
|
||||
}
api/openai/transcription.go (new file, 71 lines)
@@ -0,0 +1,71 @@
package openai

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"path"
	"path/filepath"

	"github.com/go-skynet/LocalAI/api/backend"
	config "github.com/go-skynet/LocalAI/api/config"
	"github.com/go-skynet/LocalAI/api/options"

	"github.com/gofiber/fiber/v2"
	"github.com/rs/zerolog/log"
)

// https://platform.openai.com/docs/api-reference/audio/create
func TranscriptEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {
		m, input, err := readInput(c, o, false)
		if err != nil {
			return fmt.Errorf("failed reading parameters from request:%w", err)
		}

		config, input, err := readConfig(m, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
		if err != nil {
			return fmt.Errorf("failed reading parameters from request:%w", err)
		}
		// retrieve the file data from the request
		file, err := c.FormFile("file")
		if err != nil {
			return err
		}
		f, err := file.Open()
		if err != nil {
			return err
		}
		defer f.Close()

		dir, err := os.MkdirTemp("", "whisper")

		if err != nil {
			return err
		}
		defer os.RemoveAll(dir)

		dst := filepath.Join(dir, path.Base(file.Filename))
		dstFile, err := os.Create(dst)
		if err != nil {
			return err
		}

		if _, err := io.Copy(dstFile, f); err != nil {
			log.Debug().Msgf("Audio file copying error %+v - %+v - err %+v", file.Filename, dst, err)
			return err
		}

		log.Debug().Msgf("Audio file copied to: %+v", dst)

		tr, err := backend.ModelTranscription(dst, input.Language, o.Loader, *config, o)
		if err != nil {
			return err
		}

		log.Debug().Msgf("Trascribed: %+v", tr)
		// TODO: handle different outputs here
		return c.Status(http.StatusOK).JSON(tr)
	}
}
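A companion sketch for the transcription endpoint above: it uploads audio as multipart form data matching the "file" field read via c.FormFile. The /v1/audio/transcriptions path, the address, and the model/language field names follow the OpenAI convention and are assumptions here.

package main

// Sketch only: multipart upload against the transcription endpoint above.
// Endpoint path, host/port and form field names other than "file" are assumptions.
import (
	"bytes"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"os"
)

func main() {
	var buf bytes.Buffer
	w := multipart.NewWriter(&buf)

	f, err := os.Open("sample.wav") // any local audio file
	if err != nil {
		panic(err)
	}
	defer f.Close()

	part, _ := w.CreateFormFile("file", "sample.wav") // matches c.FormFile("file") above
	if _, err := io.Copy(part, f); err != nil {
		panic(err)
	}
	w.WriteField("model", "whisper-1") // assumed model name
	w.WriteField("language", "en")     // forwarded to ModelTranscription as input.Language
	w.Close()

	req, _ := http.NewRequest("POST", "http://localhost:8080/v1/audio/transcriptions", &buf)
	req.Header.Set("Content-Type", w.FormDataContentType())

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}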
api/options.go (deleted, 137 lines)
@@ -1,137 +0,0 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"embed"
|
||||
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
type Option struct {
|
||||
context context.Context
|
||||
configFile string
|
||||
loader *model.ModelLoader
|
||||
uploadLimitMB, threads, ctxSize int
|
||||
f16 bool
|
||||
debug, disableMessage bool
|
||||
imageDir string
|
||||
cors bool
|
||||
preloadJSONModels string
|
||||
preloadModelsFromPath string
|
||||
corsAllowOrigins string
|
||||
|
||||
backendAssets embed.FS
|
||||
assetsDestination string
|
||||
}
|
||||
|
||||
type AppOption func(*Option)
|
||||
|
||||
func newOptions(o ...AppOption) *Option {
|
||||
opt := &Option{
|
||||
context: context.Background(),
|
||||
uploadLimitMB: 15,
|
||||
threads: 1,
|
||||
ctxSize: 512,
|
||||
debug: true,
|
||||
disableMessage: true,
|
||||
}
|
||||
for _, oo := range o {
|
||||
oo(opt)
|
||||
}
|
||||
return opt
|
||||
}
|
||||
|
||||
func WithCors(b bool) AppOption {
|
||||
return func(o *Option) {
|
||||
o.cors = b
|
||||
}
|
||||
}
|
||||
|
||||
func WithCorsAllowOrigins(b string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.corsAllowOrigins = b
|
||||
}
|
||||
}
|
||||
|
||||
func WithBackendAssetsOutput(out string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.assetsDestination = out
|
||||
}
|
||||
}
|
||||
|
||||
func WithBackendAssets(f embed.FS) AppOption {
|
||||
return func(o *Option) {
|
||||
o.backendAssets = f
|
||||
}
|
||||
}
|
||||
|
||||
func WithContext(ctx context.Context) AppOption {
|
||||
return func(o *Option) {
|
||||
o.context = ctx
|
||||
}
|
||||
}
|
||||
|
||||
func WithYAMLConfigPreload(configFile string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.preloadModelsFromPath = configFile
|
||||
}
|
||||
}
|
||||
|
||||
func WithJSONStringPreload(configFile string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.preloadJSONModels = configFile
|
||||
}
|
||||
}
|
||||
func WithConfigFile(configFile string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.configFile = configFile
|
||||
}
|
||||
}
|
||||
|
||||
func WithModelLoader(loader *model.ModelLoader) AppOption {
|
||||
return func(o *Option) {
|
||||
o.loader = loader
|
||||
}
|
||||
}
|
||||
|
||||
func WithUploadLimitMB(limit int) AppOption {
|
||||
return func(o *Option) {
|
||||
o.uploadLimitMB = limit
|
||||
}
|
||||
}
|
||||
|
||||
func WithThreads(threads int) AppOption {
|
||||
return func(o *Option) {
|
||||
o.threads = threads
|
||||
}
|
||||
}
|
||||
|
||||
func WithContextSize(ctxSize int) AppOption {
|
||||
return func(o *Option) {
|
||||
o.ctxSize = ctxSize
|
||||
}
|
||||
}
|
||||
|
||||
func WithF16(f16 bool) AppOption {
|
||||
return func(o *Option) {
|
||||
o.f16 = f16
|
||||
}
|
||||
}
|
||||
|
||||
func WithDebug(debug bool) AppOption {
|
||||
return func(o *Option) {
|
||||
o.debug = debug
|
||||
}
|
||||
}
|
||||
|
||||
func WithDisableMessage(disableMessage bool) AppOption {
|
||||
return func(o *Option) {
|
||||
o.disableMessage = disableMessage
|
||||
}
|
||||
}
|
||||
|
||||
func WithImageDir(imageDir string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.imageDir = imageDir
|
||||
}
|
||||
}
api/options/options.go (new file, 199 lines)
@@ -0,0 +1,199 @@
|
||||
package options
|
||||
|
||||
import (
|
||||
"context"
|
||||
"embed"
|
||||
"encoding/json"
|
||||
|
||||
"github.com/go-skynet/LocalAI/pkg/gallery"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
type Option struct {
|
||||
Context context.Context
|
||||
ConfigFile string
|
||||
Loader *model.ModelLoader
|
||||
UploadLimitMB, Threads, ContextSize int
|
||||
F16 bool
|
||||
Debug, DisableMessage bool
|
||||
ImageDir string
|
||||
AudioDir string
|
||||
CORS bool
|
||||
PreloadJSONModels string
|
||||
PreloadModelsFromPath string
|
||||
CORSAllowOrigins string
|
||||
ApiKeys []string
|
||||
|
||||
Galleries []gallery.Gallery
|
||||
|
||||
BackendAssets embed.FS
|
||||
AssetsDestination string
|
||||
|
||||
ExternalGRPCBackends map[string]string
|
||||
|
||||
AutoloadGalleries bool
|
||||
|
||||
SingleBackend bool
|
||||
}
|
||||
|
||||
type AppOption func(*Option)
|
||||
|
||||
func NewOptions(o ...AppOption) *Option {
|
||||
opt := &Option{
|
||||
Context: context.Background(),
|
||||
UploadLimitMB: 15,
|
||||
Threads: 1,
|
||||
ContextSize: 512,
|
||||
Debug: true,
|
||||
DisableMessage: true,
|
||||
}
|
||||
for _, oo := range o {
|
||||
oo(opt)
|
||||
}
|
||||
return opt
|
||||
}
|
||||
|
||||
func WithCors(b bool) AppOption {
|
||||
return func(o *Option) {
|
||||
o.CORS = b
|
||||
}
|
||||
}
|
||||
|
||||
var EnableSingleBackend = func(o *Option) {
|
||||
o.SingleBackend = true
|
||||
}
|
||||
|
||||
var EnableGalleriesAutoload = func(o *Option) {
|
||||
o.AutoloadGalleries = true
|
||||
}
|
||||
|
||||
func WithExternalBackend(name string, uri string) AppOption {
|
||||
return func(o *Option) {
|
||||
if o.ExternalGRPCBackends == nil {
|
||||
o.ExternalGRPCBackends = make(map[string]string)
|
||||
}
|
||||
o.ExternalGRPCBackends[name] = uri
|
||||
}
|
||||
}
|
||||
|
||||
func WithCorsAllowOrigins(b string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.CORSAllowOrigins = b
|
||||
}
|
||||
}
|
||||
|
||||
func WithBackendAssetsOutput(out string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.AssetsDestination = out
|
||||
}
|
||||
}
|
||||
|
||||
func WithBackendAssets(f embed.FS) AppOption {
|
||||
return func(o *Option) {
|
||||
o.BackendAssets = f
|
||||
}
|
||||
}
|
||||
|
||||
func WithStringGalleries(galls string) AppOption {
|
||||
return func(o *Option) {
|
||||
if galls == "" {
|
||||
log.Debug().Msgf("no galleries to load")
|
||||
return
|
||||
}
|
||||
var galleries []gallery.Gallery
|
||||
if err := json.Unmarshal([]byte(galls), &galleries); err != nil {
|
||||
log.Error().Msgf("failed loading galleries: %s", err.Error())
|
||||
}
|
||||
o.Galleries = append(o.Galleries, galleries...)
|
||||
}
|
||||
}
|
||||
|
||||
func WithGalleries(galleries []gallery.Gallery) AppOption {
|
||||
return func(o *Option) {
|
||||
o.Galleries = append(o.Galleries, galleries...)
|
||||
}
|
||||
}
|
||||
|
||||
func WithContext(ctx context.Context) AppOption {
|
||||
return func(o *Option) {
|
||||
o.Context = ctx
|
||||
}
|
||||
}
|
||||
|
||||
func WithYAMLConfigPreload(configFile string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.PreloadModelsFromPath = configFile
|
||||
}
|
||||
}
|
||||
|
||||
func WithJSONStringPreload(configFile string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.PreloadJSONModels = configFile
|
||||
}
|
||||
}
|
||||
func WithConfigFile(configFile string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.ConfigFile = configFile
|
||||
}
|
||||
}
|
||||
|
||||
func WithModelLoader(loader *model.ModelLoader) AppOption {
|
||||
return func(o *Option) {
|
||||
o.Loader = loader
|
||||
}
|
||||
}
|
||||
|
||||
func WithUploadLimitMB(limit int) AppOption {
|
||||
return func(o *Option) {
|
||||
o.UploadLimitMB = limit
|
||||
}
|
||||
}
|
||||
|
||||
func WithThreads(threads int) AppOption {
|
||||
return func(o *Option) {
|
||||
o.Threads = threads
|
||||
}
|
||||
}
|
||||
|
||||
func WithContextSize(ctxSize int) AppOption {
|
||||
return func(o *Option) {
|
||||
o.ContextSize = ctxSize
|
||||
}
|
||||
}
|
||||
|
||||
func WithF16(f16 bool) AppOption {
|
||||
return func(o *Option) {
|
||||
o.F16 = f16
|
||||
}
|
||||
}
|
||||
|
||||
func WithDebug(debug bool) AppOption {
|
||||
return func(o *Option) {
|
||||
o.Debug = debug
|
||||
}
|
||||
}
|
||||
|
||||
func WithDisableMessage(disableMessage bool) AppOption {
|
||||
return func(o *Option) {
|
||||
o.DisableMessage = disableMessage
|
||||
}
|
||||
}
|
||||
|
||||
func WithAudioDir(audioDir string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.AudioDir = audioDir
|
||||
}
|
||||
}
|
||||
|
||||
func WithImageDir(imageDir string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.ImageDir = imageDir
|
||||
}
|
||||
}
|
||||
|
||||
func WithApiKeys(apiKeys []string) AppOption {
|
||||
return func(o *Option) {
|
||||
o.ApiKeys = apiKeys
|
||||
}
|
||||
}
|
||||
@@ -1,616 +0,0 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/donomii/go-rwkv.cpp"
|
||||
"github.com/go-skynet/LocalAI/pkg/langchain"
|
||||
model "github.com/go-skynet/LocalAI/pkg/model"
|
||||
"github.com/go-skynet/LocalAI/pkg/stablediffusion"
|
||||
"github.com/go-skynet/bloomz.cpp"
|
||||
bert "github.com/go-skynet/go-bert.cpp"
|
||||
transformers "github.com/go-skynet/go-ggml-transformers.cpp"
|
||||
llama "github.com/go-skynet/go-llama.cpp"
|
||||
gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
|
||||
)
|
||||
|
||||
// mutex still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
|
||||
var mutexMap sync.Mutex
|
||||
var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)
|
||||
|
||||
func defaultLLamaOpts(c Config) []llama.ModelOption {
|
||||
llamaOpts := []llama.ModelOption{}
|
||||
if c.ContextSize != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.SetContext(c.ContextSize))
|
||||
}
|
||||
if c.F16 {
|
||||
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
|
||||
}
|
||||
if c.Embeddings {
|
||||
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
|
||||
}
|
||||
|
||||
if c.NGPULayers != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.SetGPULayers(c.NGPULayers))
|
||||
}
|
||||
|
||||
return llamaOpts
|
||||
}
|
||||
|
||||
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string, loader *model.ModelLoader, c Config) (func() error, error) {
|
||||
if c.Backend != model.StableDiffusionBackend {
|
||||
return nil, fmt.Errorf("endpoint only working with stablediffusion models")
|
||||
}
|
||||
inferenceModel, err := loader.BackendLoader(c.Backend, c.ImageGenerationAssets, []llama.ModelOption{}, uint32(c.Threads))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var fn func() error
|
||||
switch model := inferenceModel.(type) {
|
||||
case *stablediffusion.StableDiffusion:
|
||||
fn = func() error {
|
||||
return model.GenerateImage(height, width, mode, step, seed, positive_prompt, negative_prompt, dst)
|
||||
}
|
||||
|
||||
default:
|
||||
fn = func() error {
|
||||
return fmt.Errorf("creation of images not supported by the backend")
|
||||
}
|
||||
}
|
||||
|
||||
return func() error {
|
||||
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
|
||||
mutexMap.Lock()
|
||||
l, ok := mutexes[c.Backend]
|
||||
if !ok {
|
||||
m := &sync.Mutex{}
|
||||
mutexes[c.Backend] = m
|
||||
l = m
|
||||
}
|
||||
mutexMap.Unlock()
|
||||
l.Lock()
|
||||
defer l.Unlock()
|
||||
|
||||
return fn()
|
||||
}, nil
|
||||
}
|
||||
|
||||
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c Config) (func() ([]float32, error), error) {
|
||||
if !c.Embeddings {
|
||||
return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
|
||||
}
|
||||
|
||||
modelFile := c.Model
|
||||
|
||||
llamaOpts := defaultLLamaOpts(c)
|
||||
|
||||
var inferenceModel interface{}
|
||||
var err error
|
||||
if c.Backend == "" {
|
||||
inferenceModel, err = loader.GreedyLoader(modelFile, llamaOpts, uint32(c.Threads))
|
||||
} else {
|
||||
inferenceModel, err = loader.BackendLoader(c.Backend, modelFile, llamaOpts, uint32(c.Threads))
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var fn func() ([]float32, error)
|
||||
switch model := inferenceModel.(type) {
|
||||
case *llama.LLama:
|
||||
fn = func() ([]float32, error) {
|
||||
predictOptions := buildLLamaPredictOptions(c, loader.ModelPath)
|
||||
if len(tokens) > 0 {
|
||||
return model.TokenEmbeddings(tokens, predictOptions...)
|
||||
}
|
||||
return model.Embeddings(s, predictOptions...)
|
||||
}
|
||||
// bert embeddings
|
||||
case *bert.Bert:
|
||||
fn = func() ([]float32, error) {
|
||||
if len(tokens) > 0 {
|
||||
return model.TokenEmbeddings(tokens, bert.SetThreads(c.Threads))
|
||||
}
|
||||
return model.Embeddings(s, bert.SetThreads(c.Threads))
|
||||
}
|
||||
default:
|
||||
fn = func() ([]float32, error) {
|
||||
return nil, fmt.Errorf("embeddings not supported by the backend")
|
||||
}
|
||||
}
|
||||
|
||||
return func() ([]float32, error) {
|
||||
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
|
||||
mutexMap.Lock()
|
||||
l, ok := mutexes[modelFile]
|
||||
if !ok {
|
||||
m := &sync.Mutex{}
|
||||
mutexes[modelFile] = m
|
||||
l = m
|
||||
}
|
||||
mutexMap.Unlock()
|
||||
l.Lock()
|
||||
defer l.Unlock()
|
||||
|
||||
embeds, err := fn()
|
||||
if err != nil {
|
||||
return embeds, err
|
||||
}
|
||||
// Remove trailing 0s
|
||||
for i := len(embeds) - 1; i >= 0; i-- {
|
||||
if embeds[i] == 0.0 {
|
||||
embeds = embeds[:i]
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
return embeds, nil
|
||||
}, nil
|
||||
}
|
||||
|
||||
func buildLLamaPredictOptions(c Config, modelPath string) []llama.PredictOption {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []llama.PredictOption{
|
||||
llama.SetTemperature(c.Temperature),
|
||||
llama.SetTopP(c.TopP),
|
||||
llama.SetTopK(c.TopK),
|
||||
llama.SetTokens(c.Maxtokens),
|
||||
llama.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.PromptCacheAll {
|
||||
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
|
||||
}
|
||||
|
||||
if c.PromptCachePath != "" {
|
||||
// Create parent directory
|
||||
p := filepath.Join(modelPath, c.PromptCachePath)
|
||||
os.MkdirAll(filepath.Dir(p), 0755)
|
||||
predictOptions = append(predictOptions, llama.SetPathPromptCache(p))
|
||||
}
|
||||
|
||||
if c.Mirostat != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostat(c.Mirostat))
|
||||
}
|
||||
|
||||
if c.MirostatETA != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostatETA(c.MirostatETA))
|
||||
}
|
||||
|
||||
if c.MirostatTAU != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostatTAU(c.MirostatTAU))
|
||||
}
|
||||
|
||||
if c.Debug {
|
||||
predictOptions = append(predictOptions, llama.Debug)
|
||||
}
|
||||
|
||||
predictOptions = append(predictOptions, llama.SetStopWords(c.StopWords...))
|
||||
|
||||
if c.RepeatPenalty != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetPenalty(c.RepeatPenalty))
|
||||
}
|
||||
|
||||
if c.Keep != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetNKeep(c.Keep))
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.F16 {
|
||||
predictOptions = append(predictOptions, llama.EnableF16KV)
|
||||
}
|
||||
|
||||
if c.IgnoreEOS {
|
||||
predictOptions = append(predictOptions, llama.IgnoreEOS)
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return predictOptions
|
||||
}
|
||||
|
||||
func ModelInference(s string, loader *model.ModelLoader, c Config, tokenCallback func(string) bool) (func() (string, error), error) {
|
||||
supportStreams := false
|
||||
modelFile := c.Model
|
||||
|
||||
llamaOpts := defaultLLamaOpts(c)
|
||||
|
||||
var inferenceModel interface{}
|
||||
var err error
|
||||
if c.Backend == "" {
|
||||
inferenceModel, err = loader.GreedyLoader(modelFile, llamaOpts, uint32(c.Threads))
|
||||
} else {
|
||||
inferenceModel, err = loader.BackendLoader(c.Backend, modelFile, llamaOpts, uint32(c.Threads))
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var fn func() (string, error)
|
||||
|
||||
switch model := inferenceModel.(type) {
|
||||
case *rwkv.RwkvState:
|
||||
supportStreams = true
|
||||
|
||||
fn = func() (string, error) {
|
||||
stopWord := "\n"
|
||||
if len(c.StopWords) > 0 {
|
||||
stopWord = c.StopWords[0]
|
||||
}
|
||||
|
||||
if err := model.ProcessInput(s); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
response := model.GenerateResponse(c.Maxtokens, stopWord, float32(c.Temperature), float32(c.TopP), tokenCallback)
|
||||
|
||||
return response, nil
|
||||
}
|
||||
case *transformers.GPTNeoX:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []transformers.PredictOption{
|
||||
transformers.SetTemperature(c.Temperature),
|
||||
transformers.SetTopP(c.TopP),
|
||||
transformers.SetTopK(c.TopK),
|
||||
transformers.SetTokens(c.Maxtokens),
|
||||
transformers.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *transformers.Replit:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []transformers.PredictOption{
|
||||
transformers.SetTemperature(c.Temperature),
|
||||
transformers.SetTopP(c.TopP),
|
||||
transformers.SetTopK(c.TopK),
|
||||
transformers.SetTokens(c.Maxtokens),
|
||||
transformers.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *transformers.Starcoder:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []transformers.PredictOption{
|
||||
transformers.SetTemperature(c.Temperature),
|
||||
transformers.SetTopP(c.TopP),
|
||||
transformers.SetTopK(c.TopK),
|
||||
transformers.SetTokens(c.Maxtokens),
|
||||
transformers.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *transformers.MPT:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []transformers.PredictOption{
|
||||
transformers.SetTemperature(c.Temperature),
|
||||
transformers.SetTopP(c.TopP),
|
||||
transformers.SetTopK(c.TopK),
|
||||
transformers.SetTokens(c.Maxtokens),
|
||||
transformers.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *bloomz.Bloomz:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []bloomz.PredictOption{
|
||||
bloomz.SetTemperature(c.Temperature),
|
||||
bloomz.SetTopP(c.TopP),
|
||||
bloomz.SetTopK(c.TopK),
|
||||
bloomz.SetTokens(c.Maxtokens),
|
||||
bloomz.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, bloomz.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *transformers.Falcon:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []transformers.PredictOption{
|
||||
transformers.SetTemperature(c.Temperature),
|
||||
transformers.SetTopP(c.TopP),
|
||||
transformers.SetTopK(c.TopK),
|
||||
transformers.SetTokens(c.Maxtokens),
|
||||
transformers.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *transformers.GPTJ:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []transformers.PredictOption{
|
||||
transformers.SetTemperature(c.Temperature),
|
||||
transformers.SetTopP(c.TopP),
|
||||
transformers.SetTopK(c.TopK),
|
||||
transformers.SetTokens(c.Maxtokens),
|
||||
transformers.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *transformers.Dolly:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []transformers.PredictOption{
|
||||
transformers.SetTemperature(c.Temperature),
|
||||
transformers.SetTopP(c.TopP),
|
||||
transformers.SetTopK(c.TopK),
|
||||
transformers.SetTokens(c.Maxtokens),
|
||||
transformers.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *transformers.GPT2:
|
||||
fn = func() (string, error) {
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []transformers.PredictOption{
|
||||
transformers.SetTemperature(c.Temperature),
|
||||
transformers.SetTopP(c.TopP),
|
||||
transformers.SetTopK(c.TopK),
|
||||
transformers.SetTokens(c.Maxtokens),
|
||||
transformers.SetThreads(c.Threads),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
if c.Seed != 0 {
|
||||
predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
|
||||
}
|
||||
|
||||
return model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
}
|
||||
case *gpt4all.Model:
|
||||
supportStreams = true
|
||||
|
||||
fn = func() (string, error) {
|
||||
if tokenCallback != nil {
|
||||
model.SetTokenCallback(tokenCallback)
|
||||
}
|
||||
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []gpt4all.PredictOption{
|
||||
gpt4all.SetTemperature(c.Temperature),
|
||||
gpt4all.SetTopP(c.TopP),
|
||||
gpt4all.SetTopK(c.TopK),
|
||||
gpt4all.SetTokens(c.Maxtokens),
|
||||
}
|
||||
|
||||
if c.Batch != 0 {
|
||||
predictOptions = append(predictOptions, gpt4all.SetBatch(c.Batch))
|
||||
}
|
||||
|
||||
str, er := model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
// Seems that if we don't free the callback explicitly we leave functions registered (that might try to send on closed channels)
|
||||
// For instance otherwise the API returns: {"error":{"code":500,"message":"send on closed channel","type":""}}
|
||||
// after a stream event has occurred
|
||||
model.SetTokenCallback(nil)
|
||||
return str, er
|
||||
}
|
||||
case *llama.LLama:
|
||||
supportStreams = true
|
||||
fn = func() (string, error) {
|
||||
|
||||
if tokenCallback != nil {
|
||||
model.SetTokenCallback(tokenCallback)
|
||||
}
|
||||
|
||||
predictOptions := buildLLamaPredictOptions(c, loader.ModelPath)
|
||||
|
||||
str, er := model.Predict(
|
||||
s,
|
||||
predictOptions...,
|
||||
)
|
||||
// Seems that if we don't free the callback explicitly we leave functions registered (that might try to send on closed channels)
|
||||
// For instance otherwise the API returns: {"error":{"code":500,"message":"send on closed channel","type":""}}
|
||||
// after a stream event has occurred
|
||||
model.SetTokenCallback(nil)
|
||||
return str, er
|
||||
}
|
||||
case *langchain.HuggingFace:
|
||||
fn = func() (string, error) {
|
||||
|
||||
// Generate the prediction using the language model
|
||||
predictOptions := []langchain.PredictOption{
|
||||
langchain.SetModel(c.Model),
|
||||
langchain.SetMaxTokens(c.Maxtokens),
|
||||
langchain.SetTemperature(c.Temperature),
|
||||
langchain.SetStopWords(c.StopWords),
|
||||
}
|
||||
|
||||
pred, er := model.PredictHuggingFace(s, predictOptions...)
|
||||
if er != nil {
|
||||
return "", er
|
||||
}
|
||||
return pred.Completion, nil
|
||||
}
|
||||
}
|
||||
|
||||
return func() (string, error) {
|
||||
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
|
||||
mutexMap.Lock()
|
||||
l, ok := mutexes[modelFile]
|
||||
if !ok {
|
||||
m := &sync.Mutex{}
|
||||
mutexes[modelFile] = m
|
||||
l = m
|
||||
}
|
||||
mutexMap.Unlock()
|
||||
l.Lock()
|
||||
defer l.Unlock()
|
||||
|
||||
res, err := fn()
|
||||
if tokenCallback != nil && !supportStreams {
|
||||
tokenCallback(res)
|
||||
}
|
||||
return res, err
|
||||
}, nil
|
||||
}
|
||||
|
||||
func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
|
||||
result := []Choice{}
|
||||
|
||||
n := input.N
|
||||
|
||||
if input.N == 0 {
|
||||
n = 1
|
||||
}
|
||||
|
||||
// get the model function to call for the result
|
||||
predFunc, err := ModelInference(predInput, loader, *config, tokenCallback)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
|
||||
for i := 0; i < n; i++ {
|
||||
prediction, err := predFunc()
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
|
||||
prediction = Finetune(*config, predInput, prediction)
|
||||
cb(prediction, &result)
|
||||
|
||||
//result = append(result, Choice{Text: prediction})
|
||||
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
|
||||
var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
|
||||
var mu sync.Mutex = sync.Mutex{}
|
||||
|
||||
func Finetune(config Config, input, prediction string) string {
|
||||
if config.Echo {
|
||||
prediction = input + prediction
|
||||
}
|
||||
|
||||
for _, c := range config.Cutstrings {
|
||||
mu.Lock()
|
||||
reg, ok := cutstrings[c]
|
||||
if !ok {
|
||||
cutstrings[c] = regexp.MustCompile(c)
|
||||
reg = cutstrings[c]
|
||||
}
|
||||
mu.Unlock()
|
||||
prediction = reg.ReplaceAllString(prediction, "")
|
||||
}
|
||||
|
||||
for _, c := range config.TrimSpace {
|
||||
prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
|
||||
}
|
||||
return prediction
|
||||
|
||||
}
api/schema/openai.go (new file, 115 lines)
@@ -0,0 +1,115 @@
|
||||
package schema
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
config "github.com/go-skynet/LocalAI/api/config"
|
||||
|
||||
"github.com/go-skynet/LocalAI/pkg/grammar"
|
||||
)
|
||||
|
||||
// APIError provides error information returned by the OpenAI API.
|
||||
type APIError struct {
|
||||
Code any `json:"code,omitempty"`
|
||||
Message string `json:"message"`
|
||||
Param *string `json:"param,omitempty"`
|
||||
Type string `json:"type"`
|
||||
}
|
||||
|
||||
type ErrorResponse struct {
|
||||
Error *APIError `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type OpenAIUsage struct {
|
||||
PromptTokens int `json:"prompt_tokens"`
|
||||
CompletionTokens int `json:"completion_tokens"`
|
||||
TotalTokens int `json:"total_tokens"`
|
||||
}
|
||||
|
||||
type Item struct {
|
||||
Embedding []float32 `json:"embedding"`
|
||||
Index int `json:"index"`
|
||||
Object string `json:"object,omitempty"`
|
||||
|
||||
// Images
|
||||
URL string `json:"url,omitempty"`
|
||||
B64JSON string `json:"b64_json,omitempty"`
|
||||
}
|
||||
|
||||
type OpenAIResponse struct {
|
||||
Created int `json:"created,omitempty"`
|
||||
Object string `json:"object,omitempty"`
|
||||
ID string `json:"id,omitempty"`
|
||||
Model string `json:"model,omitempty"`
|
||||
Choices []Choice `json:"choices,omitempty"`
|
||||
Data []Item `json:"data,omitempty"`
|
||||
|
||||
Usage OpenAIUsage `json:"usage"`
|
||||
}
|
||||
|
||||
type Choice struct {
|
||||
Index int `json:"index"`
|
||||
FinishReason string `json:"finish_reason,omitempty"`
|
||||
Message *Message `json:"message,omitempty"`
|
||||
Delta *Message `json:"delta,omitempty"`
|
||||
Text string `json:"text,omitempty"`
|
||||
}
|
||||
|
||||
type Message struct {
|
||||
// The message role
|
||||
Role string `json:"role,omitempty" yaml:"role"`
|
||||
// The message content
|
||||
Content *string `json:"content" yaml:"content"`
|
||||
// A result of a function call
|
||||
FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
|
||||
}
|
||||
|
||||
type OpenAIModel struct {
|
||||
ID string `json:"id"`
|
||||
Object string `json:"object"`
|
||||
}
|
||||
|
||||
type OpenAIRequest struct {
|
||||
config.PredictionOptions
|
||||
|
||||
Context context.Context
|
||||
Cancel context.CancelFunc
|
||||
|
||||
// whisper
|
||||
File string `json:"file" validate:"required"`
|
||||
//whisper/image
|
||||
ResponseFormat string `json:"response_format"`
|
||||
// image
|
||||
Size string `json:"size"`
|
||||
// Prompt is read only by completion/image API calls
|
||||
Prompt interface{} `json:"prompt" yaml:"prompt"`
|
||||
|
||||
// Edit endpoint
|
||||
Instruction string `json:"instruction" yaml:"instruction"`
|
||||
Input interface{} `json:"input" yaml:"input"`
|
||||
|
||||
Stop interface{} `json:"stop" yaml:"stop"`
|
||||
|
||||
// Messages is read only by chat/completion API calls
|
||||
Messages []Message `json:"messages" yaml:"messages"`
|
||||
|
||||
// A list of available functions to call
|
||||
Functions []grammar.Function `json:"functions" yaml:"functions"`
|
||||
FunctionCall interface{} `json:"function_call" yaml:"function_call"` // might be a string or an object
|
||||
|
||||
Stream bool `json:"stream"`
|
||||
|
||||
// Image (not supported by OpenAI)
|
||||
Mode int `json:"mode"`
|
||||
Step int `json:"step"`
|
||||
|
||||
// A grammar to constrain the LLM output
|
||||
Grammar string `json:"grammar" yaml:"grammar"`
|
||||
|
||||
JSONFunctionGrammarObject *grammar.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"`
|
||||
|
||||
Backend string `json:"backend" yaml:"backend"`
|
||||
|
||||
// AutoGPTQ
|
||||
ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
|
||||
}
api/schema/whisper.go (new file, 16 lines)
@@ -0,0 +1,16 @@
package schema

import "time"

type Segment struct {
	Id     int           `json:"id"`
	Start  time.Duration `json:"start"`
	End    time.Duration `json:"end"`
	Text   string        `json:"text"`
	Tokens []int         `json:"tokens"`
}

type Result struct {
	Segments []Segment `json:"segments"`
	Text     string    `json:"text"`
}
cmd/grpc/bert-embeddings/main.go (new file, 22 lines)
@@ -0,0 +1,22 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
bert "github.com/go-skynet/LocalAI/pkg/backend/llm/bert"
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &bert.Embeddings{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/bloomz/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
bloomz "github.com/go-skynet/LocalAI/pkg/backend/llm/bloomz"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &bloomz.LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/dolly/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &transformers.Dolly{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/falcon-ggml/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &transformers.Falcon{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/falcon/main.go (new file, 25 lines)
@@ -0,0 +1,25 @@
|
||||
package main
|
||||
|
||||
// GRPC Falcon server
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
falcon "github.com/go-skynet/LocalAI/pkg/backend/llm/falcon"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &falcon.LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/gpt2/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &transformers.GPT2{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/gpt4all/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
gpt4all "github.com/go-skynet/LocalAI/pkg/backend/llm/gpt4all"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &gpt4all.LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/gptj/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &transformers.GPTJ{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/gptneox/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &transformers.GPTNeoX{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/langchain-huggingface/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
langchain "github.com/go-skynet/LocalAI/pkg/backend/llm/langchain"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &langchain.LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/llama-stable/main.go (new file, 21 lines)
@@ -0,0 +1,21 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/llama/main.go (new file, 25 lines)
@@ -0,0 +1,25 @@
|
||||
package main
|
||||
|
||||
// GRPC Falcon server
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/mpt/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &transformers.MPT{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/piper/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
tts "github.com/go-skynet/LocalAI/pkg/backend/tts"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &tts.Piper{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/replit/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &transformers.Replit{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/rwkv/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
rwkv "github.com/go-skynet/LocalAI/pkg/backend/llm/rwkv"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &rwkv.LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
cmd/grpc/stablediffusion/main.go (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
image "github.com/go-skynet/LocalAI/pkg/backend/image"
|
||||
|
||||
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &image.StableDiffusion{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
23  cmd/grpc/starcoder/main.go  Normal file
@@ -0,0 +1,23 @@
package main

// Note: this is started internally by LocalAI and a server is allocated for each model

import (
    "flag"

    transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"

    grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

var (
    addr = flag.String("addr", "localhost:50051", "the address to connect to")
)

func main() {
    flag.Parse()

    if err := grpc.StartServer(*addr, &transformers.Starcoder{}); err != nil {
        panic(err)
    }
}
23  cmd/grpc/whisper/main.go  Normal file
@@ -0,0 +1,23 @@
package main

// Note: this is started internally by LocalAI and a server is allocated for each model

import (
    "flag"

    transcribe "github.com/go-skynet/LocalAI/pkg/backend/transcribe"

    grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

var (
    addr = flag.String("addr", "localhost:50051", "the address to connect to")
)

func main() {
    flag.Parse()

    if err := grpc.StartServer(*addr, &transcribe.Whisper{}); err != nil {
        panic(err)
    }
}
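All of these `cmd/grpc/*/main.go` stubs follow the same pattern: parse `--addr`, then hand a backend implementation to `grpc.StartServer`; LocalAI starts one such process per model and talks to it over gRPC. The following is a minimal, hypothetical Python sketch (standard library only; the binary path `backend-assets/grpc/whisper` and the chosen port are assumptions, not taken from this changeset) of starting one of these servers by hand and waiting for its port to open, which can be handy when debugging a single backend:

```python
import socket
import subprocess
import time

ADDR = ("localhost", 50051)

# Assumed path of a built backend binary; adjust to wherever your build places it.
proc = subprocess.Popen(["./backend-assets/grpc/whisper", "--addr", f"{ADDR[0]}:{ADDR[1]}"])

# Poll until the gRPC server accepts connections, or give up after ~10 seconds.
for _ in range(100):
    try:
        with socket.create_connection(ADDR, timeout=0.1):
            print("backend is listening on", ADDR)
            break
    except OSError:
        time.sleep(0.1)
else:
    proc.terminate()
    raise RuntimeError("backend did not come up")
```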
@@ -5,7 +5,36 @@ cd /build

if [ "$REBUILD" != "false" ]; then
    rm -rf ./local-ai
    make build
    ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data make build -j${BUILD_PARALLELISM:-1}
else
    echo "@@@@@"
    echo "Skipping rebuild"
    echo "@@@@@"
    echo "If you are experiencing issues with the pre-compiled builds, try setting REBUILD=true"
    echo "If you are still experiencing issues with the build, try setting CMAKE_ARGS and disable the instructions set as needed:"
    echo 'CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"'
    echo "see the documentation at: https://localai.io/basics/build/index.html"
    echo "Note: See also https://github.com/go-skynet/LocalAI/issues/288"
    echo "@@@@@"
    echo "CPU info:"
    grep -e "model\sname" /proc/cpuinfo | head -1
    grep -e "flags" /proc/cpuinfo | head -1
    if grep -q -e "\savx\s" /proc/cpuinfo ; then
        echo "CPU: AVX    found OK"
    else
        echo "CPU: no AVX    found"
    fi
    if grep -q -e "\savx2\s" /proc/cpuinfo ; then
        echo "CPU: AVX2   found OK"
    else
        echo "CPU: no AVX2   found"
    fi
    if grep -q -e "\savx512" /proc/cpuinfo ; then
        echo "CPU: AVX512 found OK"
    else
        echo "CPU: no AVX512 found"
    fi
    echo "@@@@@"
fi

./local-ai "$@"
./local-ai "$@"
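The entrypoint's CPU checks can also be reproduced outside the container before deciding whether to set `CMAKE_ARGS`. A minimal Python sketch (standard library only, reading `/proc/cpuinfo`, so Linux only) that reports the same SIMD flags:

```python
# Report AVX/AVX2/AVX512 support from /proc/cpuinfo, mirroring the entrypoint checks.
def cpu_flags(path="/proc/cpuinfo"):
    with open(path) as f:
        for line in f:
            if line.startswith("flags"):
                return set(line.split(":", 1)[1].split())
    return set()

flags = cpu_flags()
for ext in ("avx", "avx2", "avx512f"):
    print(f"{ext}: {'found' if ext in flags else 'not found'}")
```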
@@ -1,7 +1,16 @@
# Examples

| [ChatGPT OSS alternative](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) | [Image generation](https://localai.io/api-endpoints/index.html#image-generation) |
|---|---|
|  |  |

| [Telegram bot](https://github.com/go-skynet/LocalAI/tree/master/examples/telegram-bot) | [Flowise](https://github.com/go-skynet/LocalAI/tree/master/examples/flowise) |
|---|---|
|  | |

Here is a list of projects that can easily be integrated with the LocalAI backend.

### Projects

### AutoGPT
@@ -24,6 +33,14 @@ This integration shows how to use LocalAI with [mckaywrigley/chatbot-ui](https:/

There is also a separate example to show how to manually setup a model: [example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui-manual/)

### K8sGPT

_by [@mudler](https://github.com/mudler)_

This example shows how to use LocalAI inside Kubernetes with [k8sgpt](https://k8sgpt.ai).



### Flowise

_by [@mudler](https://github.com/mudler)_
@@ -56,6 +73,14 @@ A ready to use example to show e2e how to integrate LocalAI with langchain

[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain-python/)

### LocalAI functions

_by [@mudler](https://github.com/mudler)_

A ready to use example to show how to use OpenAI functions with LocalAI

[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/functions/)

### LocalAI WebUI

_by [@dhruvgera](https://github.com/dhruvgera)_
@@ -90,6 +115,14 @@ Run a slack bot which lets you talk directly with a model

[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/slack-bot/)

### Slack bot (Question answering)

_by [@mudler](https://github.com/mudler)_

Run a slack bot, ideally for teams, which lets you ask questions on a documentation website, or a github repository.

[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/slack-qa-bot/)

### Question answering on documents with llama-index

_by [@mudler](https://github.com/mudler)_
@@ -106,6 +139,16 @@ Shows how to integrate with `Langchain` and `Chroma` to enable question answerin

[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain-chroma/)

### Telegram bot

_by [@mudler](https://github.com/mudler)_



Use LocalAI to power a Telegram bot assistant, with Image generation and audio support!

[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/telegram-bot/)

### Template for Runpod.io

_by [@fHachenberg](https://github.com/fHachenberg)_
@@ -114,6 +157,16 @@ Allows to run any LocalAI-compatible model as a backend on the servers of https:

[Check it out here](https://runpod.io/gsc?template=uv9mtqnrd0&ref=984wlcra)

### Continue

<img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">

_by [@gruberdev](https://github.com/gruberdev)_

Demonstrates how to integrate an open-source copilot alternative that enhances code analysis, completion, and improvements. This approach seamlessly integrates with any LocalAI model, offering a more user-friendly experience.

[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/continue/)

## Want to contribute?

Create an issue, and put `Example: <description>` in the title! We will post your examples here.
56  examples/continue/README.md  Normal file
@@ -0,0 +1,56 @@
# Continue



This document presents an example of integration with [continuedev/continue](https://github.com/continuedev/continue).



For a live demonstration, please click on the link below:

- [How it works (Video demonstration)](https://www.youtube.com/watch?v=3Ocrc-WX4iQ)

## Integration Setup Walkthrough

1. [As outlined in `continue`'s documentation](https://continue.dev/docs/getting-started), install the [Visual Studio Code extension from the marketplace](https://marketplace.visualstudio.com/items?itemName=Continue.continue) and open it.
2. In this example, LocalAI will download the gpt4all model and set it up as "gpt-3.5-turbo". Refer to the `docker-compose.yaml` file for details.

```bash
# Clone LocalAI
git clone https://github.com/go-skynet/LocalAI

cd LocalAI/examples/continue

# Start with docker-compose
docker-compose up --build -d
```

3. Type `/config` within Continue's VSCode extension, or edit the file located at `~/.continue/config.py` on your system with the following configuration:

```py
from continuedev.src.continuedev.libs.llm.openai import OpenAI, OpenAIServerInfo

config = ContinueConfig(
    ...
    models=Models(
        default=OpenAI(
            api_key="my-api-key",
            model="gpt-3.5-turbo",
            openai_server_info=OpenAIServerInfo(
                api_base="http://localhost:8080",
                model="gpt-3.5-turbo"
            )
        )
    ),
)
```

This setup enables you to make queries directly to your model running in the Docker container. Note that the `api_key` does not need to be properly set up; it is included here as a placeholder.

If editing the configuration seems confusing, you may copy and paste the provided default `config.py` file over the existing one in `~/.continue/config.py` after initializing the extension in the VSCode IDE.

## Additional Resources

- [Official Continue documentation](https://continue.dev/docs/intro)
- [Documentation page on using self-hosted models](https://continue.dev/docs/customization#self-hosting-an-open-source-model)
- [Official extension link](https://marketplace.visualstudio.com/items?itemName=Continue.continue)
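As a quick sanity check before wiring up Continue, you can ask LocalAI which models it exposes. A minimal sketch using the `openai` Python package against LocalAI's OpenAI-compatible endpoint (assuming the stack from `docker-compose.yaml` above is running on port 8080; the API key is just a placeholder):

```python
import openai

openai.api_base = "http://localhost:8080/v1"  # LocalAI's OpenAI-compatible endpoint
openai.api_key = "my-api-key"                 # placeholder, LocalAI does not check it

# List the models LocalAI currently serves; "gpt-3.5-turbo" should appear
# once the preloaded gpt4all model has finished downloading.
models = openai.Model.list()
print([m["id"] for m in models["data"]])
```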
148  examples/continue/config.py  Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
This is the Continue configuration file.
|
||||
|
||||
See https://continue.dev/docs/customization to learn more.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
|
||||
from continuedev.src.continuedev.core.main import Step
|
||||
from continuedev.src.continuedev.core.sdk import ContinueSDK
|
||||
from continuedev.src.continuedev.core.models import Models
|
||||
from continuedev.src.continuedev.core.config import CustomCommand, SlashCommand, ContinueConfig
|
||||
from continuedev.src.continuedev.plugins.context_providers.github import GitHubIssuesContextProvider
|
||||
from continuedev.src.continuedev.plugins.context_providers.google import GoogleContextProvider
|
||||
from continuedev.src.continuedev.plugins.policies.default import DefaultPolicy
|
||||
from continuedev.src.continuedev.libs.llm.openai import OpenAI, OpenAIServerInfo
|
||||
from continuedev.src.continuedev.libs.llm.ggml import GGML
|
||||
|
||||
from continuedev.src.continuedev.plugins.steps.open_config import OpenConfigStep
|
||||
from continuedev.src.continuedev.plugins.steps.clear_history import ClearHistoryStep
|
||||
from continuedev.src.continuedev.plugins.steps.feedback import FeedbackStep
|
||||
from continuedev.src.continuedev.plugins.steps.comment_code import CommentCodeStep
|
||||
from continuedev.src.continuedev.plugins.steps.share_session import ShareSessionStep
|
||||
from continuedev.src.continuedev.plugins.steps.main import EditHighlightedCodeStep
|
||||
from continuedev.src.continuedev.plugins.context_providers.search import SearchContextProvider
|
||||
from continuedev.src.continuedev.plugins.context_providers.diff import DiffContextProvider
|
||||
from continuedev.src.continuedev.plugins.context_providers.url import URLContextProvider
|
||||
|
||||
class CommitMessageStep(Step):
|
||||
"""
|
||||
This is a Step, the building block of Continue.
|
||||
It can be used below as a slash command, so that
|
||||
run will be called when you type '/commit'.
|
||||
"""
|
||||
async def run(self, sdk: ContinueSDK):
|
||||
|
||||
# Get the root directory of the workspace
|
||||
dir = sdk.ide.workspace_directory
|
||||
|
||||
# Run git diff in that directory
|
||||
diff = subprocess.check_output(
|
||||
["git", "diff"], cwd=dir).decode("utf-8")
|
||||
|
||||
# Ask the LLM to write a commit message,
|
||||
# and set it as the description of this step
|
||||
self.description = await sdk.models.default.complete(
|
||||
f"{diff}\n\nWrite a short, specific (less than 50 chars) commit message about the above changes:")
|
||||
|
||||
|
||||
config = ContinueConfig(
|
||||
|
||||
# If set to False, we will not collect any usage data
|
||||
# See here to learn what anonymous data we collect: https://continue.dev/docs/telemetry
|
||||
allow_anonymous_telemetry=True,
|
||||
|
||||
models = Models(
|
||||
default = OpenAI(
|
||||
api_key = "my-api-key",
|
||||
model = "gpt-3.5-turbo",
|
||||
openai_server_info = OpenAIServerInfo(
|
||||
api_base = "http://localhost:8080",
|
||||
model = "gpt-3.5-turbo"
|
||||
)
|
||||
)
|
||||
),
|
||||
# Set a system message with information that the LLM should always keep in mind
|
||||
# E.g. "Please give concise answers. Always respond in Spanish."
|
||||
system_message=None,
|
||||
|
||||
# Set temperature to any value between 0 and 1. Higher values will make the LLM
|
||||
# more creative, while lower values will make it more predictable.
|
||||
temperature=0.5,
|
||||
|
||||
# Custom commands let you map a prompt to a shortened slash command
|
||||
# They are like slash commands, but more easily defined - write just a prompt instead of a Step class
|
||||
# Their output will always be in chat form
|
||||
custom_commands=[
|
||||
# CustomCommand(
|
||||
# name="test",
|
||||
# description="Write unit tests for the higlighted code",
|
||||
# prompt="Write a comprehensive set of unit tests for the selected code. It should setup, run tests that check for correctness including important edge cases, and teardown. Ensure that the tests are complete and sophisticated. Give the tests just as chat output, don't edit any file.",
|
||||
# )
|
||||
],
|
||||
|
||||
# Slash commands let you run a Step from a slash command
|
||||
slash_commands=[
|
||||
# SlashCommand(
|
||||
# name="commit",
|
||||
# description="This is an example slash command. Use /config to edit it and create more",
|
||||
# step=CommitMessageStep,
|
||||
# )
|
||||
SlashCommand(
|
||||
name="edit",
|
||||
description="Edit code in the current file or the highlighted code",
|
||||
step=EditHighlightedCodeStep,
|
||||
),
|
||||
SlashCommand(
|
||||
name="config",
|
||||
description="Customize Continue - slash commands, LLMs, system message, etc.",
|
||||
step=OpenConfigStep,
|
||||
),
|
||||
SlashCommand(
|
||||
name="comment",
|
||||
description="Write comments for the current file or highlighted code",
|
||||
step=CommentCodeStep,
|
||||
),
|
||||
SlashCommand(
|
||||
name="feedback",
|
||||
description="Send feedback to improve Continue",
|
||||
step=FeedbackStep,
|
||||
),
|
||||
SlashCommand(
|
||||
name="clear",
|
||||
description="Clear step history",
|
||||
step=ClearHistoryStep,
|
||||
),
|
||||
SlashCommand(
|
||||
name="share",
|
||||
description="Download and share the session transcript",
|
||||
step=ShareSessionStep,
|
||||
)
|
||||
],
|
||||
|
||||
# Context providers let you quickly select context by typing '@'
|
||||
# Uncomment the following to
|
||||
# - quickly reference GitHub issues
|
||||
# - show Google search results to the LLM
|
||||
context_providers=[
|
||||
# GitHubIssuesContextProvider(
|
||||
# repo_name="<your github username or organization>/<your repo name>",
|
||||
# auth_token="<your github auth token>"
|
||||
# ),
|
||||
# GoogleContextProvider(
|
||||
# serper_api_key="<your serper.dev api key>"
|
||||
# )
|
||||
SearchContextProvider(),
|
||||
DiffContextProvider(),
|
||||
URLContextProvider(
|
||||
preset_urls = [
|
||||
# Add any common urls you reference here so they appear in autocomplete
|
||||
]
|
||||
)
|
||||
],
|
||||
|
||||
# Policies hold the main logic that decides which Step to take next
|
||||
# You can use them to design agents, or deeply customize Continue
|
||||
policy=DefaultPolicy()
|
||||
)
|
||||
27  examples/continue/docker-compose.yml  Normal file
@@ -0,0 +1,27 @@
version: '3.6'

services:
  api:
    image: quay.io/go-skynet/local-ai:latest
    # As initially LocalAI will download the models defined in PRELOAD_MODELS
    # you might need to tweak the healthcheck values here according to your network connection.
    # Here we give a timespan of 20m to download all the required files.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
      interval: 1m
      timeout: 20m
      retries: 20
    build:
      context: ../../
      dockerfile: Dockerfile
    ports:
      - 8080:8080
    environment:
      - DEBUG=true
      - MODELS_PATH=/models
      # You can preload different models here as well.
      # See: https://github.com/go-skynet/model-gallery
      - 'PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}]'
    volumes:
      - ./models:/models:cached
    command: ["/usr/bin/local-ai" ]
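The `healthcheck` above simply polls LocalAI's `/readyz` endpoint until the preloaded model has been downloaded. A minimal Python sketch (standard library only) that does the same thing from the host, useful before pointing Continue at the API:

```python
import time
import urllib.request

URL = "http://localhost:8080/readyz"

# Keep polling until LocalAI reports ready; the first start may take a while
# because the preloaded model is being downloaded.
while True:
    try:
        with urllib.request.urlopen(URL, timeout=5) as resp:
            if resp.status == 200:
                print("LocalAI is ready")
                break
    except OSError:
        pass
    time.sleep(10)
```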
BIN  examples/continue/img/screen.png  Executable file  (binary file not shown, 196 KiB)
@@ -24,3 +24,7 @@ docker-compose up --pull always

Open http://localhost:3000.

## Using LocalAI

Search for LocalAI in the integrations, and use `http://api:8080/` as the URL.
9  examples/functions/.env  Normal file
@@ -0,0 +1,9 @@
OPENAI_API_KEY=sk---anystringhere
OPENAI_API_BASE=http://api:8080/v1
# Models to preload at start
# Here we configure openllama as gpt-3.5-turbo
PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/openllama-7b-open-instruct.yaml", "name": "gpt-3.5-turbo"}]

## Change the default number of threads
#THREADS=14
5  examples/functions/Dockerfile  Normal file
@@ -0,0 +1,5 @@
FROM python:3.10-bullseye
COPY . /app
WORKDIR /app
RUN pip install --no-cache-dir -r requirements.txt
ENTRYPOINT [ "python", "./functions-openai.py" ]
18  examples/functions/README.md  Normal file
@@ -0,0 +1,18 @@
# LocalAI functions

Example of using LocalAI functions, see the [OpenAI](https://openai.com/blog/function-calling-and-other-api-updates) blog post.

## Run

```bash
# Clone LocalAI
git clone https://github.com/go-skynet/LocalAI

cd LocalAI/examples/functions

docker-compose run --rm functions
```

Note: The example automatically downloads the `openllama` model as it is under a permissive license.

See the `.env` configuration file to set a different model with the [model-gallery](https://github.com/go-skynet/model-gallery) by editing `PRELOAD_MODELS`.
23  examples/functions/docker-compose.yaml  Normal file
@@ -0,0 +1,23 @@
version: "3.9"
services:
  api:
    image: quay.io/go-skynet/local-ai:master
    ports:
      - 8080:8080
    env_file:
      - .env
    environment:
      - DEBUG=true
      - MODELS_PATH=/models
    volumes:
      - ./models:/models:cached
    command: ["/usr/bin/local-ai" ]
  functions:
    build:
      context: .
      dockerfile: Dockerfile
    depends_on:
      api:
        condition: service_healthy
    env_file:
      - .env
76  examples/functions/functions-openai.py  Normal file
@@ -0,0 +1,76 @@
import openai
import json

# Example dummy function hard coded to return the same weather
# In production, this could be your backend API or an external API
def get_current_weather(location, unit="fahrenheit"):
    """Get the current weather in a given location"""
    weather_info = {
        "location": location,
        "temperature": "72",
        "unit": unit,
        "forecast": ["sunny", "windy"],
    }
    return json.dumps(weather_info)


def run_conversation():
    # Step 1: send the conversation and available functions to GPT
    messages = [{"role": "user", "content": "What's the weather like in Boston?"}]
    functions = [
        {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        }
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        functions=functions,
        function_call="auto",  # auto is default, but we'll be explicit
    )
    response_message = response["choices"][0]["message"]

    # Step 2: check if GPT wanted to call a function
    if response_message.get("function_call"):
        # Step 3: call the function
        # Note: the JSON response may not always be valid; be sure to handle errors
        available_functions = {
            "get_current_weather": get_current_weather,
        }  # only one function in this example, but you can have multiple
        function_name = response_message["function_call"]["name"]
        function_to_call = available_functions[function_name]
        function_args = json.loads(response_message["function_call"]["arguments"])
        function_response = function_to_call(
            location=function_args.get("location"),
            unit=function_args.get("unit"),
        )

        # Step 4: send the info on the function call and function response to GPT
        messages.append(response_message)  # extend conversation with assistant's reply
        messages.append(
            {
                "role": "function",
                "name": function_name,
                "content": function_response,
            }
        )  # extend conversation with function response
        second_response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )  # get a new response from GPT where it can see the function response
        return second_response


print(run_conversation())
2  examples/functions/requirements.txt  Normal file
@@ -0,0 +1,2 @@
langchain==0.0.234
openai==0.27.8
1  examples/insomnia/Insomnia_LocalAI.json  Normal file  (file diff suppressed because one or more lines are too long)
17  examples/insomnia/README.md  Normal file
@@ -0,0 +1,17 @@
# Insomnia

Developer Testing Request Collection for [Insomnia](https://insomnia.rest/), an open-source REST client

## Instructions

* Install Insomnia as normal
* [Import](https://docs.insomnia.rest/insomnia/import-export-data) `Insomnia_LocalAI.json`
* Control + E opens the environment settings:

| **Parameter Name** | **Default Value** | **Description** |
|--------------------|-------------------|------------------------------------------|
| HOST | localhost | LocalAI base URL |
| PORT | 8080 | LocalAI port |
| DEFAULT_MODEL | gpt-3.5-turbo | Name of the model used on most requests. |

**You may want to duplicate the localhost environment into a "Private" environment to avoid saving private settings back to this file.**
72  examples/k8sgpt/README.md  Normal file
@@ -0,0 +1,72 @@
# k8sgpt example

This example shows how to use LocalAI with k8sgpt



## Create the cluster locally with Kind (optional)

If you want to test this locally without a remote Kubernetes cluster, you can use kind.

Install [kind](https://kind.sigs.k8s.io/) and create a cluster:

```
kind create cluster
```

## Setup LocalAI

We will use [helm](https://helm.sh/docs/intro/install/):

```
helm repo add go-skynet https://go-skynet.github.io/helm-charts/
helm repo update

# Clone LocalAI
git clone https://github.com/go-skynet/LocalAI

cd LocalAI/examples/k8sgpt

# modify values.yaml preload_models with the models you want to install.
# CHANGE the URL to a model in huggingface.
helm install local-ai go-skynet/local-ai --create-namespace --namespace local-ai --values values.yaml
```

## Setup K8sGPT

```
# Install k8sgpt
helm repo add k8sgpt https://charts.k8sgpt.ai/
helm repo update
helm install release k8sgpt/k8sgpt-operator -n k8sgpt-operator-system --create-namespace --version 0.0.17
```

Apply the k8sgpt-operator configuration:

```
kubectl apply -f - << EOF
apiVersion: core.k8sgpt.ai/v1alpha1
kind: K8sGPT
metadata:
  name: k8sgpt-local-ai
  namespace: default
spec:
  backend: localai
  baseUrl: http://local-ai.local-ai.svc.cluster.local:8080/v1
  noCache: false
  model: gpt-3.5-turbo
  version: v0.3.0
  enableAI: true
EOF
```

## Test

Apply a broken pod:

```
kubectl apply -f broken-pod.yaml
```

## ArgoCD Deployment Example

[Deploy K8sgpt + localai with Argocd](https://github.com/tyler-harpool/gitops/tree/main/infra/k8gpt)
14  examples/k8sgpt/broken-pod.yaml  Normal file
@@ -0,0 +1,14 @@
apiVersion: v1
kind: Pod
metadata:
  name: broken-pod
spec:
  containers:
    - name: broken-pod
      image: nginx:1.a.b.c
      livenessProbe:
        httpGet:
          path: /
          port: 90
        initialDelaySeconds: 3
        periodSeconds: 3
96  examples/k8sgpt/values.yaml  Normal file
@@ -0,0 +1,96 @@
|
||||
replicaCount: 1
|
||||
|
||||
deployment:
|
||||
# https://quay.io/repository/go-skynet/local-ai?tab=tags
|
||||
image: quay.io/go-skynet/local-ai:v1.23.0
|
||||
env:
|
||||
threads: 4
|
||||
debug: "true"
|
||||
context_size: 512
|
||||
galleries: '[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
|
||||
preload_models: '[{ "id": "huggingface@thebloke__open-llama-13b-open-instruct-ggml__open-llama-13b-open-instruct.ggmlv3.q3_k_m.bin", "name": "gpt-3.5-turbo", "overrides": { "f16": true, "mmap": true }}]'
|
||||
modelsPath: "/models"
|
||||
|
||||
resources:
|
||||
{}
|
||||
# We usually recommend not to specify default resources and to leave this as a conscious
|
||||
# choice for the user. This also increases chances charts run on environments with little
|
||||
# resources, such as Minikube. If you do want to specify resources, uncomment the following
|
||||
# lines, adjust them as necessary, and remove the curly braces after 'resources:'.
|
||||
# limits:
|
||||
# cpu: 100m
|
||||
# memory: 128Mi
|
||||
# requests:
|
||||
# cpu: 100m
|
||||
# memory: 128Mi
|
||||
|
||||
# Prompt templates to include
|
||||
# Note: the keys of this map will be the names of the prompt template files
|
||||
promptTemplates:
|
||||
{}
|
||||
# ggml-gpt4all-j.tmpl: |
|
||||
# The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
|
||||
# ### Prompt:
|
||||
# {{.Input}}
|
||||
# ### Response:
|
||||
|
||||
# Models to download at runtime
|
||||
models:
|
||||
# Whether to force download models even if they already exist
|
||||
forceDownload: false
|
||||
|
||||
# The list of URLs to download models from
|
||||
# Note: the name of the file will be the name of the loaded model
|
||||
list:
|
||||
#- url: "https://gpt4all.io/models/ggml-gpt4all-j.bin"
|
||||
# basicAuth: base64EncodedCredentials
|
||||
|
||||
# Persistent storage for models and prompt templates.
|
||||
# PVC and HostPath are mutually exclusive. If both are enabled,
|
||||
# PVC configuration takes precedence. If neither are enabled, ephemeral
|
||||
# storage is used.
|
||||
persistence:
|
||||
pvc:
|
||||
enabled: false
|
||||
size: 6Gi
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
|
||||
annotations: {}
|
||||
|
||||
# Optional
|
||||
storageClass: ~
|
||||
|
||||
hostPath:
|
||||
enabled: false
|
||||
path: "/models"
|
||||
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 8080
|
||||
annotations: {}
|
||||
# If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
|
||||
# service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
|
||||
|
||||
ingress:
|
||||
enabled: false
|
||||
className: ""
|
||||
annotations:
|
||||
{}
|
||||
# kubernetes.io/ingress.class: nginx
|
||||
# kubernetes.io/tls-acme: "true"
|
||||
hosts:
|
||||
- host: chart-example.local
|
||||
paths:
|
||||
- path: /
|
||||
pathType: ImplementationSpecific
|
||||
tls: []
|
||||
# - secretName: chart-example-tls
|
||||
# hosts:
|
||||
# - chart-example.local
|
||||
|
||||
nodeSelector: {}
|
||||
|
||||
tolerations: []
|
||||
|
||||
affinity: {}
|
||||
@@ -9,7 +9,7 @@ from langchain.vectorstores.base import VectorStoreRetriever
base_path = os.environ.get('OPENAI_API_BASE', 'http://localhost:8080/v1')

# Load and process the text
embedding = OpenAIEmbeddings()
embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_base=base_path)
persist_directory = 'db'

# Now we can load the persisted database from disk, and use it as normal.

@@ -18,8 +18,8 @@ texts = text_splitter.split_documents(documents)
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

embedding = OpenAIEmbeddings(model="text-embedding-ada-002")
embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_base=base_path)
vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=persist_directory)

vectordb.persist()
vectordb = None
vectordb = None
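These two hunks point the langchain-chroma example's embeddings at LocalAI via `openai_api_base` instead of the OpenAI default. Once the database has been persisted to `db`, it can be reloaded and queried with the same embedding configuration; a minimal sketch, assuming the same langchain 0.0.x APIs used by the example and a running LocalAI instance:

```python
import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

base_path = os.environ.get('OPENAI_API_BASE', 'http://localhost:8080/v1')

# Reuse the exact embedding settings the example persisted with,
# so query vectors live in the same space as the stored ones.
embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_base=base_path)
vectordb = Chroma(persist_directory='db', embedding_function=embedding)

for doc in vectordb.similarity_search("What is the state of the union about?", k=2):
    print(doc.page_content[:200])
```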
@@ -12,15 +12,8 @@ git clone https://github.com/go-skynet/LocalAI

cd LocalAI/examples/langchain-python

# (optional) Checkout a specific LocalAI tag
# git checkout -b build <TAG>

# Download gpt4all-j to models/
wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j

# start with docker-compose
docker-compose up -d --build

docker-compose up --pull always

pip install langchain
pip install openai
@@ -3,6 +3,14 @@ version: '3.6'
services:
  api:
    image: quay.io/go-skynet/local-ai:latest
    # As initially LocalAI will download the models defined in PRELOAD_MODELS
    # you might need to tweak the healthcheck values here according to your network connection.
    # Here we give a timespan of 20m to download all the required files.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
      interval: 1m
      timeout: 20m
      retries: 20
    build:
      context: ../../
      dockerfile: Dockerfile
@@ -11,6 +19,9 @@ services:
    environment:
      - DEBUG=true
      - MODELS_PATH=/models
      # You can preload different models here as well.
      # See: https://github.com/go-skynet/model-gallery
      - 'PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}]'
    volumes:
      - ./models:/models:cached
    command: ["/usr/bin/local-ai" ]
    command: ["/usr/bin/local-ai" ]
@@ -1 +0,0 @@
../chatbot-ui/models
48  examples/slack-qa-bot/.env.example  Normal file
@@ -0,0 +1,48 @@
# Create an app-level token with connections:write scope
SLACK_APP_TOKEN=xapp-1-...
# Install the app into your workspace to grab this token
SLACK_BOT_TOKEN=xoxb-...

# Set this to a random string, it doesn't matter, however if present the python library complains
OPENAI_API_KEY=sk-foo-bar-baz

# Optional: gpt-3.5-turbo and gpt-4 are currently supported (default: gpt-3.5-turbo)
OPENAI_MODEL=gpt-3.5-turbo
# Optional: You can adjust the timeout seconds for OpenAI calls (default: 30)
OPENAI_TIMEOUT_SECONDS=560

MEMORY_DIR=/tmp/memory_dir

OPENAI_API_BASE=http://api:8080/v1

EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2

## Repository and sitemap to index in the vector database on start
SITEMAP="https://kairos.io/sitemap.xml"

# Optional repository names.
# REPOSITORIES="foo,bar"
# # Define clone URL for "foo"
# foo_CLONE_URL="http://github.com.."
# bar_CLONE_URL="..."
# # Define branch for foo
# foo_BRANCH="master"
# Optional token if scraping issues
# GITHUB_PERSONAL_ACCESS_TOKEN=""
# ISSUE_REPOSITORIES="go-skynet/LocalAI,foo/bar,..."

# Optional: When the string is "true", this app translates ChatGPT prompts into a user's preferred language (default: true)
USE_SLACK_LANGUAGE=true
# Optional: Adjust the app's logging level (default: DEBUG)
SLACK_APP_LOG_LEVEL=INFO
# Optional: When the string is "true", translate between OpenAI markdown and Slack mrkdwn format (default: false)
TRANSLATE_MARKDOWN=true


### LocalAI

DEBUG=true
MODELS_PATH=/models
IMAGE_PATH=/tmp
# See: https://github.com/go-skynet/model-gallery
PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}]
23  examples/slack-qa-bot/README.md  Normal file
@@ -0,0 +1,23 @@
## Slack QA Bot

This example uses https://github.com/spectrocloud-labs/Slack-QA-bot to deploy a slack bot that can answer questions about your documentation!

- Create a new Slack app using the manifest-dev.yml file
- Install the app into your Slack workspace
- Retrieve your slack keys and edit `.env`
- Start the app

```bash
# Clone LocalAI
git clone https://github.com/go-skynet/LocalAI

cd LocalAI/examples/slack-qa-bot

cp -rfv .env.example .env

# Edit .env and add slackbot api keys, or repository settings to scan
vim .env

# run the bot
docker-compose up
```
97  examples/slack-qa-bot/deployment.yaml  Normal file
@@ -0,0 +1,97 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: slack-bot
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: knowledgebase
|
||||
namespace: slack-bot
|
||||
labels:
|
||||
app: localai-qabot
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: localai-qabot
|
||||
namespace: slack-bot
|
||||
labels:
|
||||
app: localai-qabot
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: localai-qabot
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: localai-qabot
|
||||
name: localai-qabot
|
||||
spec:
|
||||
containers:
|
||||
- name: localai-qabot-slack
|
||||
env:
|
||||
- name: OPENAI_API_KEY
|
||||
value: "x"
|
||||
- name: SLACK_APP_TOKEN
|
||||
value: "xapp-1-"
|
||||
- name: SLACK_BOT_TOKEN
|
||||
value: "xoxb-"
|
||||
- name: OPENAI_MODEL
|
||||
value: "gpt-3.5-turbo"
|
||||
- name: OPENAI_TIMEOUT_SECONDS
|
||||
value: "400"
|
||||
- name: OPENAI_SYSTEM_TEXT
|
||||
value: ""
|
||||
- name: MEMORY_DIR
|
||||
value: "/memory"
|
||||
- name: TRANSLATE_MARKDOWN
|
||||
value: "true"
|
||||
- name: OPENAI_API_BASE
|
||||
value: "http://local-ai.default.svc.cluster.local:8080"
|
||||
- name: REPOSITORIES
|
||||
value: "KAIROS,AGENT,SDK,OSBUILDER,PACKAGES,IMMUCORE"
|
||||
- name: KAIROS_CLONE_URL
|
||||
value: "https://github.com/kairos-io/kairos"
|
||||
- name: KAIROS_BRANCH
|
||||
value: "master"
|
||||
- name: AGENT_CLONE_URL
|
||||
value: "https://github.com/kairos-io/kairos-agent"
|
||||
- name: AGENT_BRANCH
|
||||
value: "main"
|
||||
- name: SDK_CLONE_URL
|
||||
value: "https://github.com/kairos-io/kairos-sdk"
|
||||
- name: SDK_BRANCH
|
||||
value: "main"
|
||||
- name: OSBUILDER_CLONE_URL
|
||||
value: "https://github.com/kairos-io/osbuilder"
|
||||
- name: OSBUILDER_BRANCH
|
||||
value: "master"
|
||||
- name: PACKAGES_CLONE_URL
|
||||
value: "https://github.com/kairos-io/packages"
|
||||
- name: PACKAGES_BRANCH
|
||||
value: "main"
|
||||
- name: IMMUCORE_CLONE_URL
|
||||
value: "https://github.com/kairos-io/immucore"
|
||||
- name: IMMUCORE_BRANCH
|
||||
value: "master"
|
||||
- name: GITHUB_PERSONAL_ACCESS_TOKEN
|
||||
value: ""
|
||||
- name: ISSUE_REPOSITORIES
|
||||
value: "kairos-io/kairos"
|
||||
image: quay.io/spectrocloud-labs/slack-qa-local-bot:qa
|
||||
imagePullPolicy: Always
|
||||
volumeMounts:
|
||||
- mountPath: "/memory"
|
||||
name: knowledgebase
|
||||
volumes:
|
||||
- name: knowledgebase
|
||||
persistentVolumeClaim:
|
||||
claimName: knowledgebase
|
||||
30  examples/slack-qa-bot/docker-compose.yml  Normal file
@@ -0,0 +1,30 @@
version: "3"

services:
  api:
    image: quay.io/go-skynet/local-ai:latest
    # As initially LocalAI will download the models defined in PRELOAD_MODELS
    # you might need to tweak the healthcheck values here according to your network connection.
    # Here we give a timespan of 20m to download all the required files.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
      interval: 1m
      timeout: 20m
      retries: 20
    ports:
      - 8080:8080
    env_file:
      - .env
    volumes:
      - ./models:/models:cached
    command: ["/usr/bin/local-ai" ]

  slackbot:
    image: quay.io/spectrocloud-labs/slack-qa-local-bot:qa
    container_name: slackbot
    restart: always
    env_file:
      - .env
    depends_on:
      api:
        condition: service_healthy
30  examples/telegram-bot/README.md  Normal file
@@ -0,0 +1,30 @@
## Telegram bot



This example uses a fork of [chatgpt-telegram-bot](https://github.com/karfly/chatgpt_telegram_bot) to deploy a telegram bot with LocalAI instead of OpenAI.

```bash
# Clone LocalAI
git clone https://github.com/go-skynet/LocalAI

cd LocalAI/examples/telegram-bot

git clone https://github.com/mudler/chatgpt_telegram_bot

cp -rf docker-compose.yml chatgpt_telegram_bot

cd chatgpt_telegram_bot

mv config/config.example.yml config/config.yml
mv config/config.example.env config/config.env

# Edit config/config.yml to set the telegram bot token
vim config/config.yml

# run the bot
docker-compose --env-file config/config.env up --build
```

Note: LocalAI is configured to download `gpt4all-j` in place of `gpt-3.5-turbo` and `stablediffusion` for image generation at the first start. The download size is >6GB; if your network connection is slow, adapt the `docker-compose.yml` healthcheck section accordingly (for instance, replace `20m` with `1h`).
To configure models manually, comment the `PRELOAD_MODELS` environment variable in the `docker-compose.yml` file and see for instance the [chatbot-ui-manual example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui-manual) `model` directory.
38  examples/telegram-bot/docker-compose.yml  Normal file
@@ -0,0 +1,38 @@
version: "3"

services:
  api:
    image: quay.io/go-skynet/local-ai:v1.18.0-ffmpeg
    # As initially LocalAI will download the models defined in PRELOAD_MODELS
    # you might need to tweak the healthcheck values here according to your network connection.
    # Here we give a timespan of 20m to download all the required files.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
      interval: 1m
      timeout: 20m
      retries: 20
    ports:
      - 8080:8080
    environment:
      - DEBUG=true
      - MODELS_PATH=/models
      - IMAGE_PATH=/tmp
      # You can preload different models here as well.
      # See: https://github.com/go-skynet/model-gallery
      - 'PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, {"url": "github:go-skynet/model-gallery/stablediffusion.yaml"}, {"url": "github:go-skynet/model-gallery/whisper-base.yaml", "name": "whisper-1"}]'
    volumes:
      - ./models:/models:cached
    command: ["/usr/bin/local-ai"]

  chatgpt_telegram_bot:
    container_name: chatgpt_telegram_bot
    command: python3 bot/bot.py
    restart: always
    environment:
      - OPENAI_API_KEY=sk---anystringhere
      - OPENAI_API_BASE=http://api:8080/v1
    build:
      context: "."
      dockerfile: Dockerfile
    depends_on:
      api:
        condition: service_healthy
109  extra/grpc/autogptq/autogptq.py  Executable file
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
import grpc
from concurrent import futures
import time
import backend_pb2
import backend_pb2_grpc
import argparse
import signal
import sys
import os
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from pathlib import Path
from transformers import AutoTokenizer
from transformers import TextGenerationPipeline

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        try:
            device = "cuda:0"
            if request.Device != "":
                device = request.Device

            tokenizer = AutoTokenizer.from_pretrained(request.Model, use_fast=request.UseFastTokenizer)

            model = AutoGPTQForCausalLM.from_quantized(request.Model,
                    model_basename=request.ModelBaseName,
                    use_safetensors=True,
                    trust_remote_code=True,
                    device=device,
                    use_triton=request.UseTriton,
                    quantize_config=None)

            self.model = model
            self.tokenizer = tokenizer
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Predict(self, request, context):
        penalty = 1.0
        if request.Penalty != 0.0:
            penalty = request.Penalty
        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens
        top_p = 0.95
        if request.TopP != 0.0:
            top_p = request.TopP

        # Implement Predict RPC
        pipeline = TextGenerationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=tokens,
            temperature=request.Temperature,
            top_p=top_p,
            repetition_penalty=penalty,
        )
        t = pipeline(request.Prompt)[0]["generated_text"]
        # Remove prompt from response if present
        if request.Prompt in t:
            t = t.replace(request.Prompt, "")

        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))

    def PredictStream(self, request, context):
        # Implement PredictStream RPC
        #for reply in some_data_generator():
        #    yield reply
        # Not implemented yet
        return self.Predict(request, context)


def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    serve(args.addr)
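For local testing of this backend outside of LocalAI, the generated stubs in `backend_pb2_grpc.py` (shown below) can be used directly. A minimal client sketch, assuming the backend is already running on `localhost:50051` and that the request field names are the ones the servicer above reads (they come from LocalAI's `backend.proto`, which is not part of this excerpt; the model id is only an illustration):

```python
import grpc

import backend_pb2
import backend_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = backend_pb2_grpc.BackendStub(channel)

# Health check: the servicer replies with the bytes "OK".
print(stub.Health(backend_pb2.HealthMessage()).message)

# Load a quantized model, then run a single prediction.
res = stub.LoadModel(backend_pb2.ModelOptions(
    Model="TheBloke/WizardLM-7B-GPTQ",  # assumption: any GPTQ model id or local path
    UseTriton=False,
))
print(res.success, res.message)

reply = stub.Predict(backend_pb2.PredictOptions(
    Prompt="What is LocalAI?",
    Tokens=64,
    Temperature=0.7,
))
print(reply.message.decode("utf-8"))
```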
61  extra/grpc/autogptq/backend_pb2.py  Normal file  (file diff suppressed because one or more lines are too long)
363  extra/grpc/autogptq/backend_pb2_grpc.py  Normal file
@@ -0,0 +1,363 @@
|
||||
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
|
||||
"""Client and server classes corresponding to protobuf-defined services."""
|
||||
import grpc
|
||||
|
||||
import backend_pb2 as backend__pb2
|
||||
|
||||
|
||||
class BackendStub(object):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
|
||||
def __init__(self, channel):
|
||||
"""Constructor.
|
||||
|
||||
Args:
|
||||
channel: A grpc.Channel.
|
||||
"""
|
||||
self.Health = channel.unary_unary(
|
||||
'/backend.Backend/Health',
|
||||
request_serializer=backend__pb2.HealthMessage.SerializeToString,
|
||||
response_deserializer=backend__pb2.Reply.FromString,
|
||||
)
|
||||
self.Predict = channel.unary_unary(
|
||||
'/backend.Backend/Predict',
|
||||
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.Reply.FromString,
|
||||
)
|
||||
self.LoadModel = channel.unary_unary(
|
||||
'/backend.Backend/LoadModel',
|
||||
request_serializer=backend__pb2.ModelOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.Result.FromString,
|
||||
)
|
||||
self.PredictStream = channel.unary_stream(
|
||||
'/backend.Backend/PredictStream',
|
||||
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.Reply.FromString,
|
||||
)
|
||||
self.Embedding = channel.unary_unary(
|
||||
'/backend.Backend/Embedding',
|
||||
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.EmbeddingResult.FromString,
|
||||
)
|
||||
self.GenerateImage = channel.unary_unary(
|
||||
'/backend.Backend/GenerateImage',
|
||||
request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
|
||||
response_deserializer=backend__pb2.Result.FromString,
|
||||
)
|
||||
self.AudioTranscription = channel.unary_unary(
|
||||
'/backend.Backend/AudioTranscription',
|
||||
request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
|
||||
response_deserializer=backend__pb2.TranscriptResult.FromString,
|
||||
)
|
||||
self.TTS = channel.unary_unary(
|
||||
'/backend.Backend/TTS',
|
||||
request_serializer=backend__pb2.TTSRequest.SerializeToString,
|
||||
response_deserializer=backend__pb2.Result.FromString,
|
||||
)
|
||||
self.TokenizeString = channel.unary_unary(
|
||||
'/backend.Backend/TokenizeString',
|
||||
request_serializer=backend__pb2.PredictOptions.SerializeToString,
|
||||
response_deserializer=backend__pb2.TokenizationResponse.FromString,
|
||||
)
|
||||
self.Status = channel.unary_unary(
|
||||
'/backend.Backend/Status',
|
||||
request_serializer=backend__pb2.HealthMessage.SerializeToString,
|
||||
response_deserializer=backend__pb2.StatusResponse.FromString,
|
||||
)
|
||||
|
||||
|
||||
class BackendServicer(object):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
|
||||
def Health(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def Predict(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def LoadModel(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def PredictStream(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def Embedding(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def GenerateImage(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def AudioTranscription(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def TTS(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def TokenizeString(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
def Status(self, request, context):
|
||||
"""Missing associated documentation comment in .proto file."""
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details('Method not implemented!')
|
||||
raise NotImplementedError('Method not implemented!')
|
||||
|
||||
|
||||
def add_BackendServicer_to_server(servicer, server):
|
||||
rpc_method_handlers = {
|
||||
'Health': grpc.unary_unary_rpc_method_handler(
|
            servicer.Health,
            request_deserializer=backend__pb2.HealthMessage.FromString,
            response_serializer=backend__pb2.Reply.SerializeToString,
        ),
        'Predict': grpc.unary_unary_rpc_method_handler(
            servicer.Predict,
            request_deserializer=backend__pb2.PredictOptions.FromString,
            response_serializer=backend__pb2.Reply.SerializeToString,
        ),
        'LoadModel': grpc.unary_unary_rpc_method_handler(
            servicer.LoadModel,
            request_deserializer=backend__pb2.ModelOptions.FromString,
            response_serializer=backend__pb2.Result.SerializeToString,
        ),
        'PredictStream': grpc.unary_stream_rpc_method_handler(
            servicer.PredictStream,
            request_deserializer=backend__pb2.PredictOptions.FromString,
            response_serializer=backend__pb2.Reply.SerializeToString,
        ),
        'Embedding': grpc.unary_unary_rpc_method_handler(
            servicer.Embedding,
            request_deserializer=backend__pb2.PredictOptions.FromString,
            response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
        ),
        'GenerateImage': grpc.unary_unary_rpc_method_handler(
            servicer.GenerateImage,
            request_deserializer=backend__pb2.GenerateImageRequest.FromString,
            response_serializer=backend__pb2.Result.SerializeToString,
        ),
        'AudioTranscription': grpc.unary_unary_rpc_method_handler(
            servicer.AudioTranscription,
            request_deserializer=backend__pb2.TranscriptRequest.FromString,
            response_serializer=backend__pb2.TranscriptResult.SerializeToString,
        ),
        'TTS': grpc.unary_unary_rpc_method_handler(
            servicer.TTS,
            request_deserializer=backend__pb2.TTSRequest.FromString,
            response_serializer=backend__pb2.Result.SerializeToString,
        ),
        'TokenizeString': grpc.unary_unary_rpc_method_handler(
            servicer.TokenizeString,
            request_deserializer=backend__pb2.PredictOptions.FromString,
            response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
        ),
        'Status': grpc.unary_unary_rpc_method_handler(
            servicer.Status,
            request_deserializer=backend__pb2.HealthMessage.FromString,
            response_serializer=backend__pb2.StatusResponse.SerializeToString,
        ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
        'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Backend(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
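The experimental Backend helpers above mirror the plain BackendStub, and either can be used to talk to a running backend process. As a minimal illustration (not part of this diff, and assuming a backend is already listening on the default localhost:50051 address used by the servers below), a health check from Python looks like this:

# Illustrative client only: open an insecure channel to a running backend
# and call the Health RPC exposed by the generated stub above.
import grpc
import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    reply = stub.Health(backend_pb2.HealthMessage())
    print(reply.message)  # a working backend replies with b"OK"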
extra/grpc/bark/backend_pb2.py (new file, 61 lines)
File diff suppressed because one or more lines are too long

extra/grpc/bark/backend_pb2_grpc.py (new file, 363 lines)
@@ -0,0 +1,363 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

import backend_pb2 as backend__pb2


class BackendStub(object):
    """Missing associated documentation comment in .proto file."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Health = channel.unary_unary(
            '/backend.Backend/Health',
            request_serializer=backend__pb2.HealthMessage.SerializeToString,
            response_deserializer=backend__pb2.Reply.FromString,
        )
        self.Predict = channel.unary_unary(
            '/backend.Backend/Predict',
            request_serializer=backend__pb2.PredictOptions.SerializeToString,
            response_deserializer=backend__pb2.Reply.FromString,
        )
        self.LoadModel = channel.unary_unary(
            '/backend.Backend/LoadModel',
            request_serializer=backend__pb2.ModelOptions.SerializeToString,
            response_deserializer=backend__pb2.Result.FromString,
        )
        self.PredictStream = channel.unary_stream(
            '/backend.Backend/PredictStream',
            request_serializer=backend__pb2.PredictOptions.SerializeToString,
            response_deserializer=backend__pb2.Reply.FromString,
        )
        self.Embedding = channel.unary_unary(
            '/backend.Backend/Embedding',
            request_serializer=backend__pb2.PredictOptions.SerializeToString,
            response_deserializer=backend__pb2.EmbeddingResult.FromString,
        )
        self.GenerateImage = channel.unary_unary(
            '/backend.Backend/GenerateImage',
            request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
            response_deserializer=backend__pb2.Result.FromString,
        )
        self.AudioTranscription = channel.unary_unary(
            '/backend.Backend/AudioTranscription',
            request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
            response_deserializer=backend__pb2.TranscriptResult.FromString,
        )
        self.TTS = channel.unary_unary(
            '/backend.Backend/TTS',
            request_serializer=backend__pb2.TTSRequest.SerializeToString,
            response_deserializer=backend__pb2.Result.FromString,
        )
        self.TokenizeString = channel.unary_unary(
            '/backend.Backend/TokenizeString',
            request_serializer=backend__pb2.PredictOptions.SerializeToString,
            response_deserializer=backend__pb2.TokenizationResponse.FromString,
        )
        self.Status = channel.unary_unary(
            '/backend.Backend/Status',
            request_serializer=backend__pb2.HealthMessage.SerializeToString,
            response_deserializer=backend__pb2.StatusResponse.FromString,
        )


class BackendServicer(object):
    """Missing associated documentation comment in .proto file."""

    def Health(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Predict(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def LoadModel(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def PredictStream(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Embedding(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def GenerateImage(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def AudioTranscription(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TTS(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def TokenizeString(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')

    def Status(self, request, context):
        """Missing associated documentation comment in .proto file."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_BackendServicer_to_server(servicer, server):
    rpc_method_handlers = {
        'Health': grpc.unary_unary_rpc_method_handler(
            servicer.Health,
            request_deserializer=backend__pb2.HealthMessage.FromString,
            response_serializer=backend__pb2.Reply.SerializeToString,
        ),
        'Predict': grpc.unary_unary_rpc_method_handler(
            servicer.Predict,
            request_deserializer=backend__pb2.PredictOptions.FromString,
            response_serializer=backend__pb2.Reply.SerializeToString,
        ),
        'LoadModel': grpc.unary_unary_rpc_method_handler(
            servicer.LoadModel,
            request_deserializer=backend__pb2.ModelOptions.FromString,
            response_serializer=backend__pb2.Result.SerializeToString,
        ),
        'PredictStream': grpc.unary_stream_rpc_method_handler(
            servicer.PredictStream,
            request_deserializer=backend__pb2.PredictOptions.FromString,
            response_serializer=backend__pb2.Reply.SerializeToString,
        ),
        'Embedding': grpc.unary_unary_rpc_method_handler(
            servicer.Embedding,
            request_deserializer=backend__pb2.PredictOptions.FromString,
            response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
        ),
        'GenerateImage': grpc.unary_unary_rpc_method_handler(
            servicer.GenerateImage,
            request_deserializer=backend__pb2.GenerateImageRequest.FromString,
            response_serializer=backend__pb2.Result.SerializeToString,
        ),
        'AudioTranscription': grpc.unary_unary_rpc_method_handler(
            servicer.AudioTranscription,
            request_deserializer=backend__pb2.TranscriptRequest.FromString,
            response_serializer=backend__pb2.TranscriptResult.SerializeToString,
        ),
        'TTS': grpc.unary_unary_rpc_method_handler(
            servicer.TTS,
            request_deserializer=backend__pb2.TTSRequest.FromString,
            response_serializer=backend__pb2.Result.SerializeToString,
        ),
        'TokenizeString': grpc.unary_unary_rpc_method_handler(
            servicer.TokenizeString,
            request_deserializer=backend__pb2.PredictOptions.FromString,
            response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
        ),
        'Status': grpc.unary_unary_rpc_method_handler(
            servicer.Status,
            request_deserializer=backend__pb2.HealthMessage.FromString,
            response_serializer=backend__pb2.StatusResponse.SerializeToString,
        ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
        'backend.Backend', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Backend(object):
    """Missing associated documentation comment in .proto file."""

    @staticmethod
    def Health(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Predict(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def LoadModel(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
            backend__pb2.ModelOptions.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def PredictStream(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.Reply.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Embedding(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.EmbeddingResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def GenerateImage(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
            backend__pb2.GenerateImageRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def AudioTranscription(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
            backend__pb2.TranscriptRequest.SerializeToString,
            backend__pb2.TranscriptResult.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TTS(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
            backend__pb2.TTSRequest.SerializeToString,
            backend__pb2.Result.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def TokenizeString(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
            backend__pb2.PredictOptions.SerializeToString,
            backend__pb2.TokenizationResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

    @staticmethod
    def Status(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
            backend__pb2.HealthMessage.SerializeToString,
            backend__pb2.StatusResponse.FromString,
            options, channel_credentials,
            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
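Both backend_pb2.py and backend_pb2_grpc.py are generated modules (the header says DO NOT EDIT), so changes to the service come from regenerating them rather than hand-editing. A rough regeneration sketch, assuming a backend.proto defining the backend.Backend service sits in the working directory (the actual location of the proto file is not shown in this diff):

# Illustrative only: regenerate the two generated modules from backend.proto
# with grpcio-tools; both output files land in the current directory.
from grpc_tools import protoc  # pip install grpcio-tools

protoc.main([
    "protoc",
    "-I.",
    "--python_out=.",       # writes backend_pb2.py
    "--grpc_python_out=.",  # writes backend_pb2_grpc.py
    "backend.proto",
])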
extra/grpc/bark/ttsbark.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
import grpc
from concurrent import futures
import time
import backend_pb2
import backend_pb2_grpc
import argparse
import signal
import sys
import os
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig  # NOTE: not used anywhere in this file
from pathlib import Path
from bark import SAMPLE_RATE, generate_audio, preload_models
from scipy.io.wavfile import write as write_wav

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        model_name = request.Model
        try:
            print("Preparing models, please wait", file=sys.stderr)
            # download and load all models
            preload_models()
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
        # Replace this with your desired response
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def TTS(self, request, context):
        model = request.model
        print(request, file=sys.stderr)
        try:
            audio_array = None
            if model != "":
                audio_array = generate_audio(request.text, history_prompt=model)
            else:
                audio_array = generate_audio(request.text)
            print("saving to", request.dst, file=sys.stderr)
            # save audio to disk
            write_wav(request.dst, SAMPLE_RATE, audio_array)
            print("saved to", request.dst, file=sys.stderr)
            print("tts for", file=sys.stderr)
            print(request, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)

def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    serve(args.addr)
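The TTS servicer above reads only three request fields: text, model (used as an optional bark history prompt) and dst (the output wav path), and reports the outcome through Result.success and Result.message. An illustrative client call, not part of the diff, could look like the following (the voice preset name is just an example of a bark history prompt, and the server is assumed to be running with the default --addr):

# Illustrative client only: assumes ttsbark.py is running on localhost:50051.
import grpc
import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    stub.LoadModel(backend_pb2.ModelOptions())  # triggers preload_models() on the server
    res = stub.TTS(backend_pb2.TTSRequest(
        text="Hello from the bark backend",
        model="v2/en_speaker_6",  # optional history prompt; an empty string uses the default voice
        dst="/tmp/bark.wav",      # where the servicer writes the generated audio
    ))
    print(res.success, res.message)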
extra/grpc/diffusers/backend_diffusers.py (new executable file, 299 lines)
@@ -0,0 +1,299 @@
#!/usr/bin/env python3
import grpc
from concurrent import futures
import time
import backend_pb2
import backend_pb2_grpc
import argparse
import signal
import sys
import os

# import diffusers
import torch
from torch import autocast
from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
from diffusers.pipelines.stable_diffusion import safety_checker
from compel import Compel
from PIL import Image
from io import BytesIO
from diffusers import StableDiffusionImg2ImgPipeline
from transformers import CLIPTextModel
from enum import Enum

_ONE_DAY_IN_SECONDS = 60 * 60 * 24
COMPEL = os.environ.get("COMPEL", "1") == "1"
CLIPSKIP = os.environ.get("CLIPSKIP", "1") == "1"

# https://github.com/CompVis/stable-diffusion/issues/239#issuecomment-1627615287
def sc(self, clip_input, images): return images, [False for i in images]
# patch the StableDiffusionSafetyChecker so that, when called, it returns the images
# unchanged with every NSFW flag set to False, effectively disabling the safety checker
safety_checker.StableDiffusionSafetyChecker.forward = sc

from diffusers.schedulers import (
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    DPMSolverSinglestepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    HeunDiscreteScheduler,
    KDPM2AncestralDiscreteScheduler,
    KDPM2DiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    UniPCMultistepScheduler,
)
# The scheduler list mapping was taken from here: https://github.com/neggles/animatediff-cli/blob/6f336f5f4b5e38e85d7f06f1744ef42d0a45f2a7/src/animatediff/schedulers.py#L39
# Credits to https://github.com/neggles
# See https://github.com/huggingface/diffusers/issues/4167 for more details on sched mapping from A1111
class DiffusionScheduler(str, Enum):
    ddim = "ddim"  # DDIM
    pndm = "pndm"  # PNDM
    heun = "heun"  # Heun
    unipc = "unipc"  # UniPC
    euler = "euler"  # Euler
    euler_a = "euler_a"  # Euler a

    lms = "lms"  # LMS
    k_lms = "k_lms"  # LMS Karras

    dpm_2 = "dpm_2"  # DPM2
    k_dpm_2 = "k_dpm_2"  # DPM2 Karras

    dpm_2_a = "dpm_2_a"  # DPM2 a
    k_dpm_2_a = "k_dpm_2_a"  # DPM2 a Karras

    dpmpp_2m = "dpmpp_2m"  # DPM++ 2M
    k_dpmpp_2m = "k_dpmpp_2m"  # DPM++ 2M Karras

    dpmpp_sde = "dpmpp_sde"  # DPM++ SDE
    k_dpmpp_sde = "k_dpmpp_sde"  # DPM++ SDE Karras

    dpmpp_2m_sde = "dpmpp_2m_sde"  # DPM++ 2M SDE
    k_dpmpp_2m_sde = "k_dpmpp_2m_sde"  # DPM++ 2M SDE Karras


def get_scheduler(name: str, config: dict = {}):
    is_karras = name.startswith("k_")
    if is_karras:
        # strip the k_ prefix and add the karras sigma flag to config
        name = name.lstrip("k_")
        config["use_karras_sigmas"] = True

    if name == DiffusionScheduler.ddim:
        sched_class = DDIMScheduler
    elif name == DiffusionScheduler.pndm:
        sched_class = PNDMScheduler
    elif name == DiffusionScheduler.heun:
        sched_class = HeunDiscreteScheduler
    elif name == DiffusionScheduler.unipc:
        sched_class = UniPCMultistepScheduler
    elif name == DiffusionScheduler.euler:
        sched_class = EulerDiscreteScheduler
    elif name == DiffusionScheduler.euler_a:
        sched_class = EulerAncestralDiscreteScheduler
    elif name == DiffusionScheduler.lms:
        sched_class = LMSDiscreteScheduler
    elif name == DiffusionScheduler.dpm_2:
        # Equivalent to DPM2 in K-Diffusion
        sched_class = KDPM2DiscreteScheduler
    elif name == DiffusionScheduler.dpm_2_a:
        # Equivalent to `DPM2 a` in K-Diffusion
        sched_class = KDPM2AncestralDiscreteScheduler
    elif name == DiffusionScheduler.dpmpp_2m:
        # Equivalent to `DPM++ 2M` in K-Diffusion
        sched_class = DPMSolverMultistepScheduler
        config["algorithm_type"] = "dpmsolver++"
        config["solver_order"] = 2
    elif name == DiffusionScheduler.dpmpp_sde:
        # Equivalent to `DPM++ SDE` in K-Diffusion
        sched_class = DPMSolverSinglestepScheduler
    elif name == DiffusionScheduler.dpmpp_2m_sde:
        # Equivalent to `DPM++ 2M SDE` in K-Diffusion
        sched_class = DPMSolverMultistepScheduler
        config["algorithm_type"] = "sde-dpmsolver++"
    else:
        raise ValueError(f"Invalid scheduler '{'k_' if is_karras else ''}{name}'")

    return sched_class.from_config(config)


# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    def Health(self, request, context):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        try:
            print(f"Loading model {request.Model}...", file=sys.stderr)
            print(f"Request {request}", file=sys.stderr)
            torchType = torch.float32
            if request.F16Memory:
                torchType = torch.float16

            local = False
            modelFile = request.Model

            cfg_scale = 7
            if request.CFGScale != 0:
                cfg_scale = request.CFGScale

            clipmodel = "runwayml/stable-diffusion-v1-5"
            if request.CLIPModel != "":
                clipmodel = request.CLIPModel
            clipsubfolder = "text_encoder"
            if request.CLIPSubfolder != "":
                clipsubfolder = request.CLIPSubfolder

            # Check if ModelFile exists
            if request.ModelFile != "":
                if os.path.exists(request.ModelFile):
                    local = True
                    modelFile = request.ModelFile

            fromSingleFile = request.Model.startswith("http") or request.Model.startswith("/") or local

            if request.IMG2IMG and request.PipelineType == "":
                request.PipelineType = "StableDiffusionImg2ImgPipeline"

            if request.PipelineType == "":
                request.PipelineType = "StableDiffusionPipeline"

            ## img2img
            if request.PipelineType == "StableDiffusionImg2ImgPipeline":
                if fromSingleFile:
                    self.pipe = StableDiffusionImg2ImgPipeline.from_single_file(modelFile,
                                                                                torch_dtype=torchType,
                                                                                guidance_scale=cfg_scale)
                else:
                    self.pipe = StableDiffusionImg2ImgPipeline.from_pretrained(request.Model,
                                                                               torch_dtype=torchType,
                                                                               guidance_scale=cfg_scale)

            if request.PipelineType == "StableDiffusionDepth2ImgPipeline":
                self.pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(request.Model,
                                                                             torch_dtype=torchType,
                                                                             guidance_scale=cfg_scale)
            ## text2img
            if request.PipelineType == "StableDiffusionPipeline":
                if fromSingleFile:
                    self.pipe = StableDiffusionPipeline.from_single_file(modelFile,
                                                                         torch_dtype=torchType,
                                                                         guidance_scale=cfg_scale)
                else:
                    self.pipe = StableDiffusionPipeline.from_pretrained(request.Model,
                                                                        torch_dtype=torchType,
                                                                        guidance_scale=cfg_scale)

            if request.PipelineType == "DiffusionPipeline":
                self.pipe = DiffusionPipeline.from_pretrained(request.Model,
                                                              torch_dtype=torchType,
                                                              guidance_scale=cfg_scale)

            if request.PipelineType == "StableDiffusionXLPipeline":
                if fromSingleFile:
                    self.pipe = StableDiffusionXLPipeline.from_single_file(modelFile,
                                                                           torch_dtype=torchType, use_safetensors=True,
                                                                           guidance_scale=cfg_scale)
                else:
                    self.pipe = StableDiffusionXLPipeline.from_pretrained(
                        request.Model,
                        torch_dtype=torchType,
                        use_safetensors=True,
                        # variant="fp16"
                        guidance_scale=cfg_scale)
            # https://github.com/huggingface/diffusers/issues/4446
            # do not use text_encoder in the constructor since then
            # https://github.com/huggingface/diffusers/issues/3212#issuecomment-1521841481
            if CLIPSKIP and request.CLIPSkip != 0:
                text_encoder = CLIPTextModel.from_pretrained(clipmodel, num_hidden_layers=request.CLIPSkip, subfolder=clipsubfolder, torch_dtype=torchType)
                self.pipe.text_encoder = text_encoder
            # torch_dtype needs to be customized. float16 for GPU, float32 for CPU
            # TODO: this needs to be customized
            if request.SchedulerType != "":
                self.pipe.scheduler = get_scheduler(request.SchedulerType, self.pipe.scheduler.config)
            self.compel = Compel(tokenizer=self.pipe.tokenizer, text_encoder=self.pipe.text_encoder)
            if request.CUDA:
                self.pipe.to('cuda')
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        # Implement your logic here for the LoadModel service
        # Replace this with your desired response
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def GenerateImage(self, request, context):

        prompt = request.positive_prompt

        # create a dictionary of values for the parameters
        options = {
            "negative_prompt": request.negative_prompt,
            "width": request.width,
            "height": request.height,
            "num_inference_steps": request.step,
        }

        if request.src != "":
            image = Image.open(request.src)
            options["image"] = image

        # Get the keys that we will build the args for our pipe for
        keys = options.keys()

        if request.EnableParameters != "":
            keys = request.EnableParameters.split(",")

        if request.EnableParameters == "none":
            keys = []

        # create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
        kwargs = {key: options[key] for key in keys}

        image = {}
        if COMPEL:
            conditioning = self.compel.build_conditioning_tensor(prompt)
            kwargs["prompt_embeds"] = conditioning
            # pass the kwargs dictionary to the self.pipe method
            image = self.pipe(
                **kwargs
            ).images[0]
        else:
            # pass the kwargs dictionary to the self.pipe method
            image = self.pipe(
                prompt,
                **kwargs
            ).images[0]

        # save the result
        image.save(request.dst)

        return backend_pb2.Result(message="Model loaded successfully", success=True)

def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()

    serve(args.addr)
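For the diffusers backend the interesting fields are on ModelOptions (Model, PipelineType, SchedulerType, F16Memory, CUDA, CLIPSkip and friends) for LoadModel, and on GenerateImageRequest (positive_prompt, negative_prompt, width, height, step, src, dst, EnableParameters) for GenerateImage, matching what the servicer above actually reads. An illustrative end-to-end client call, not part of the diff and assuming the server runs on the default address:

# Illustrative client only: load a text-to-image pipeline, then render one image.
import grpc
import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    load = stub.LoadModel(backend_pb2.ModelOptions(
        Model="runwayml/stable-diffusion-v1-5",
        PipelineType="StableDiffusionPipeline",
        SchedulerType="k_dpmpp_2m",  # mapped by get_scheduler() to DPM++ 2M with Karras sigmas
    ))
    print(load.success, load.message)
    gen = stub.GenerateImage(backend_pb2.GenerateImageRequest(
        positive_prompt="a watercolor painting of a lighthouse",
        negative_prompt="blurry, low quality",
        width=512,
        height=512,
        step=25,
        dst="/tmp/lighthouse.png",  # the servicer saves the rendered image here
    ))
    print(gen.success, gen.message)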
Some files were not shown because too many files have changed in this diff.