feat: pre-configure LocalAI galleries (#886 )

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to 4d855af (#875 )
2026-02-03 03:02:38 -05:00 · 2023-08-12 11:25:17 +02:00 · 2023-08-12 08:58:55 +02:00 · 2023-08-12 08:56:15 +02:00 · 2023-08-12 08:56:01 +02:00 · 2023-08-11 01:02:31 +02:00
88 changed files with 2996 additions and 960 deletions
--- a/.env
+++ b/.env
@@ -24,6 +24,9 @@ MODELS_PATH=/models
 # DEBUG=true

 ## Specify a build type. Available: cublas, openblas, clblas.
+## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
+## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
+## clBLAS:   This is an open-source implementation of the BLAS library that uses OpenCL, a framework for writing programs that execute across heterogeneous platforms consisting of CPUs, GPUs, and other processors. clBLAS is designed to take advantage of the parallel computing power of GPUs but can also run on any hardware that supports OpenCL. This includes hardware from different vendors like Nvidia, AMD, and Intel.
 # BUILD_TYPE=openblas

 ## Uncomment and set to true to enable rebuilding from source
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.sh text eol=lf
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,9 +12,6 @@ jobs:
          - repository: "go-skynet/go-llama.cpp"
            variable: "GOLLAMA_VERSION"
            branch: "master"
-          - repository: "go-skynet/go-llama.cpp"
-            variable: "GOLLAMA_GRAMMAR_VERSION"
-            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -43,8 +43,8 @@ jobs:
          mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
          curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v1.0.0/libpiper_phonemize-amd64.tar.gz" | \
          tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /lib64/ && \
          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
+          sudo ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
          sudo cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
      - name: Test
        run: |
--- a/.gitignore
+++ b/.gitignore
@@ -36,5 +36,5 @@ release/

 # Generated during build
 backend-assets/
-
+prepare
 /ggml-metal.metal
--- a/37
+++ b/37
@@ -11,15 +11,16 @@ ARG TARGETARCH
 ARG TARGETVARIANT

 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/autogptq.py,bark:/build/extra/grpc/bark/ttsbark.py,diffusers:/build/extra/grpc/diffusers/backend_diffusers.py,exllama:/build/extra/grpc/exllama/exllama.py"
+ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tts"

 RUN apt-get update && \
    apt-get install -y ca-certificates cmake curl patch pip

-# Extras requirements
-COPY extra/requirements.txt /build/extra/requirements.txt
-RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+# Use the variables in subsequent instructions
+RUN echo "Target Architecture: $TARGETARCH"
+RUN echo "Target Variant: $TARGETVARIANT"

 # CuBLAS requirements
 RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
@@ -29,10 +30,23 @@ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
    dpkg -i cuda-keyring_1.0-1_all.deb && \
    rm -f cuda-keyring_1.0-1_all.deb && \
    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
    ; fi
 ENV PATH /usr/local/cuda/bin:${PATH}

+# Extras requirements
+COPY extra/requirements.txt /build/extra/requirements.txt
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN pip install --upgrade pip
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+    fi
+RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+        pip install torch && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+    fi
+RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+
 WORKDIR /build

 # OpenBLAS requirements
@@ -42,9 +56,6 @@ RUN apt-get install -y libopenblas-dev
 RUN apt-get install -y libopencv-dev && \
    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

-# Use the variables in subsequent instructions
-RUN echo "Target Architecture: $TARGETARCH"
-RUN echo "Target Variant: $TARGETVARIANT"

 # piper requirements
 # Use pre-compiled Piper phonemization library (includes onnxruntime)
@@ -63,8 +74,8 @@ RUN curl -L "https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSIO
    mkdir -p "lib/Linux-$(uname -m)/piper_phonemize" && \
    curl -L "https://github.com/rhasspy/piper-phonemize/releases/download/v${PIPER_PHONEMIZE_VERSION}/libpiper_phonemize-${TARGETARCH:-$(go env GOARCH)}${TARGETVARIANT}.tar.gz" | \
    tar -C "lib/Linux-$(uname -m)/piper_phonemize" -xzvf - && ls -liah /build/lib/Linux-$(uname -m)/piper_phonemize/ && \
-    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /lib64/ && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/lib/. /usr/lib/ && \
+    ln -s /usr/lib/libpiper_phonemize.so /usr/lib/libpiper_phonemize.so.1 && \
    cp -rfv /build/lib/Linux-$(uname -m)/piper_phonemize/include/. /usr/include/
 # \
 #    ; fi
@@ -98,7 +109,10 @@ RUN ESPEAK_DATA=/build/lib/Linux-$(uname -m)/piper_phonemize/lib/espeak-ng-data
 FROM requirements

 ARG FFMPEG
+ARG BUILD_TYPE
+ARG TARGETARCH

+ENV BUILD_TYPE=${BUILD_TYPE}
 ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

@@ -116,7 +130,10 @@ WORKDIR /build
 COPY . .
 RUN make prepare-sources
 COPY --from=builder /build/local-ai ./
-
+# To resolve exllama import error
+RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH:-$(go env GOARCH)}" = "amd64" ]; then \
+        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
+    fi
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
--- a/43
+++ b/43
@@ -4,21 +4,11 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

 # llama.cpp versions
-# Temporarly pinned to https://github.com/go-skynet/go-llama.cpp/pull/124
-GOLLAMA_VERSION?=f3a6ee0ef53d667f110d28fcf9b808bdca741c07
-
-GOLLAMA_GRAMMAR_VERSION?=cb8d7cd4cb95725a04504a9e3a26dd72a12b69ac
-# Temporary set a specific version of llama.cpp
-# containing: https://github.com/ggerganov/llama.cpp/pull/1773 and
-# rebased on top of master.
-# This pin can be dropped when the PR above is merged, and go-llama has merged changes as well
-# Set empty to use the version pinned by go-llama
-LLAMA_CPP_GRAMMAR_REPO?=https://github.com/mudler/llama.cpp
-LLAMA_CPP_GRAMMAR_VERSION?=48ce8722a05a018681634af801fd0fd45b3a87cc
+GOLLAMA_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
-GPT4ALL_VERSION?=8aba2c9009fb6bc723f623c614e265b41722e4e3
+GPT4ALL_VERSION?=0f2bb506a8ee752afc06cbb832773bf85b97eef3

 # go-ggml-transformers version
 GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a
@@ -202,28 +192,17 @@ go-llama:
 	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
 	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

-go-llama-grammar:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-grammar
-	cd go-llama-grammar && git checkout -b build $(GOLLAMA_GRAMMAR_VERSION) && git submodule update --init --recursive --depth 1
-ifneq ($(LLAMA_CPP_GRAMMAR_REPO),)
-	cd go-llama-grammar && rm -rf llama.cpp && git clone $(LLAMA_CPP_GRAMMAR_REPO) llama.cpp && cd llama.cpp && git checkout -b build $(LLAMA_CPP_GRAMMAR_VERSION) && git submodule update --init --recursive --depth 1
-endif
-
 go-llama/libbinding.a: go-llama
 	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a

-go-llama-grammar/libbinding.a: go-llama-grammar
-	$(MAKE) -C go-llama-grammar BUILD_TYPE=$(BUILD_TYPE) libbinding.a
-
 go-piper/libpiper_binding.a:
 	$(MAKE) -C go-piper libpiper_binding.a example/main

-get-sources: go-llama go-ggllm go-llama-grammar go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
+get-sources: go-llama go-ggllm go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert bloomz go-stable-diffusion
 	touch $@

 replace:
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp-grammar=$(shell pwd)/go-llama-grammar
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
@@ -241,7 +220,6 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C go-llama clean
-	$(MAKE) -C go-llama-grammar clean
 	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C go-ggml-transformers clean
 	$(MAKE) -C go-rwkv clean
@@ -258,7 +236,8 @@ prepare: prepare-sources $(OPTIONAL_TARGETS)

 clean: ## Remove build related file
 	$(GOCMD) clean -cache
-	rm -fr ./go-llama
+	rm -f prepare
+	rm -rf ./go-llama
 	rm -rf ./gpt4all
 	rm -rf ./go-gpt2
 	rm -rf ./go-stable-diffusion
@@ -356,7 +335,11 @@ protogen-go:
    pkg/grpc/proto/backend.proto

 protogen-python:
-	python -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/autogptq/ --grpc_python_out=extra/grpc/autogptq/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto

 ## GRPC

@@ -375,10 +358,6 @@ ifeq ($(BUILD_TYPE),metal)
 	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
 endif

-backend-assets/grpc/llama-grammar: backend-assets/grpc go-llama-grammar/libbinding.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-grammar LIBRARY_PATH=$(shell pwd)/go-llama-grammar \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-grammar ./cmd/grpc/llama-grammar/
-
 backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
@@ -442,4 +421,4 @@ backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/

-grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/llama-grammar backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+grpcs: prepare backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
--- a/README.md
+++ b/README.md
@@ -1,208 +1,120 @@
 <h1 align="center">
  <br>
-  <img height="300" src="https://user-images.githubusercontent.com/2420543/233147843-88697415-6dbf-4368-a862-ab217f9f7342.jpeg"> <br>
+  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
    LocalAI
 <br>
 </h1>

-[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml) [![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)
+<p align="center">
+<a href="https://github.com/go-skynet/LocalAI/fork" target="blank">
+<img src="https://img.shields.io/github/forks/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI forks"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/stargazers" target="blank">
+<img src="https://img.shields.io/github/stars/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI stars"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/pulls" target="blank">
+<img src="https://img.shields.io/github/issues-pr/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI pull-requests"/>
+</a>
+<a href='https://github.com/go-skynet/LocalAI/releases'>
+<img src='https://img.shields.io/github/release/go-skynet/LocalAI?&label=Latest&style=for-the-badge'>
+</a>
+</p>

-[![](https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted)](https://discord.gg/uJAeKSAGDy) 
+> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
+> 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/)

-[Documentation website](https://localai.io/)
+
+[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

 **LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.

-For a list of the supported model families, please see [the model compatibility table](https://localai.io/model-compatibility/index.html#model-compatibility-table).
+<p align="center"><b>Follow LocalAI </b></p>
+
+<p align="center">
+<a href="https://twitter.com/LocalAI_API" target="blank">
+<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
+</a>
+<a href="https://discord.gg/uJAeKSAGDy" target="blank">
+<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
+</a>
+
+<p align="center"><b>Connect with the Creator </b></p>
+
+<p align="center">
+<a href="https://twitter.com/mudler_it" target="blank">
+<img src="https://img.shields.io/twitter/follow/mudler_it?label=Follow: mudler_it&style=social" alt="Follow mudler_it"/>
+</a>
+<a href='https://github.com/mudler'>
+<img alt="Follow on Github" src="https://img.shields.io/badge/Follow-mudler-black?logo=github&link=https%3A%2F%2Fgithub.com%2Fmudler">
+</a>
+</p>
+
+<p align="center"><b>Share LocalAI Repository</b></p>
+
+<p align="center">
+
+<a href="https://twitter.com/intent/tweet?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI&hashtags=LocalAI,AI" target="blank">
+<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a> 
+<a href="https://t.me/share/url?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Telegram&logo=Telegram&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Telegram"/></a>
+<a href="https://api.whatsapp.com/send?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%20https://github.com/go-skynet/LocalAI"><img src="https://img.shields.io/twitter/url?label=whatsapp&logo=whatsapp&style=social&url=https://github.com/go-skynet/LocalAI" /></a> <a href="https://www.reddit.com/submit?url=https://github.com/go-skynet/LocalAI&title=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.
+" target="blank">
+<img src="https://img.shields.io/twitter/url?label=Reddit&logo=Reddit&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Reddit"/>
+</a> <a href="mailto:?subject=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%3A%0Ahttps://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Gmail&logo=Gmail&style=social&url=https://github.com/go-skynet/LocalAI"/></a> <a href="https://www.buymeacoffee.com/mudler" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="23" width="100" style="border-radius:1px"></a>
+
+</p>
+
+<hr>

 In a nutshell:

 - Local, OpenAI drop-in alternative REST API. You own your data.
 - NO GPU required. NO Internet access is required either
  - Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html). 
- Supports multiple models:
-  - 📖 Text generation with GPTs (`llama.cpp`, `gpt4all.cpp`, ... and more)
-  - 🗣 Text to Audio 🎺🆕
-  - 🔈 Audio to Text (Audio transcription with `whisper.cpp`)
-  - 🎨 Image generation with stable diffusion
+- Supports multiple models
 - 🏃 Once loaded the first time, it keep models loaded in memory for faster inference
- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance. 
+- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance.

 LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! 

-See the [Getting started](https://localai.io/basics/getting_started/index.html) and [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) sections to learn how to use LocalAI. For a list of curated models check out the [model gallery](https://localai.io/models/).
+Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
+
+## 🔥🔥 [Hot topics / Roadmap](https://localai.io/#-hot-topics--roadmap)
+
+## 🚀 [Features](https://localai.io/features/)
+
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
+- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
+- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕
+- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
+- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
+- 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)


-| [ChatGPT OSS alternative](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui)                                                                                                                | [Image generation](https://localai.io/api-endpoints/index.html#image-generation)                                                                                                              |
-|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
-|  ![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)            | ![b6441997879](https://github.com/go-skynet/LocalAI/assets/2420543/d50af51c-51b7-4f39-b6c2-bf04c403894c)                  |
-
-|                                                                    [Telegram bot](https://github.com/go-skynet/LocalAI/tree/master/examples/telegram-bot)   | [Flowise](https://github.com/go-skynet/LocalAI/tree/master/examples/flowise)                                                                                                                     |
-|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
-![Screenshot from 2023-06-09 00-36-26](https://github.com/go-skynet/LocalAI/assets/2420543/e98b4305-fa2d-41cf-9d2f-1bb2d75ca902)   |  ![Screenshot from 2023-05-30 18-01-03](https://github.com/go-skynet/LocalAI/assets/2420543/02458782-0549-4131-971c-95ee56ec1af8)|    |
-
-## Hot topics / Roadmap
-
- [x] Support for embeddings
- [x] Support for audio transcription with https://github.com/ggerganov/whisper.cpp
- [X] Support for text-to-audio
- [x] GPU/CUDA support ( https://github.com/go-skynet/LocalAI/issues/69 )
- [X] Enable automatic downloading of models from a curated gallery
- [X] Enable automatic downloading of models from HuggingFace
- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351) 
- [ ] Enable gallery management directly from the webui.
- [ ] 🔥 OpenAI functions: https://github.com/go-skynet/LocalAI/issues/588
-
-## News
-
- 🔥🔥🔥 28-06-2023: **v1.20.0**: Added text to audio and gallery huggingface repositories! [Release notes](https://localai.io/basics/news/index.html#-28-06-2023-__v1200__-) [Changelog](https://github.com/go-skynet/LocalAI/releases/tag/v1.20.0)
- 🔥🔥🔥 19-06-2023: **v1.19.0**: CUDA support! [Release notes](https://localai.io/basics/news/index.html#-19-06-2023-__v1190__-) [Changelog](https://github.com/go-skynet/LocalAI/releases/tag/v1.19.0)
- 🔥🔥🔥 06-06-2023: **v1.18.0**: Many updates, new features, and much more 🚀, check out the [Release notes](https://localai.io/basics/news/index.html#-06-06-2023-__v1180__-)!
- 29-05-2023: LocalAI now has a website, [https://localai.io](https://localai.io)! check the news in the [dedicated section](https://localai.io/basics/news/index.html)!
-
-For latest news, follow also on Twitter [@LocalAI_API](https://twitter.com/LocalAI_API) and [@mudler_it](https://twitter.com/mudler_it)
-
-## Media, Blogs, Social
+## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

 - [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
 - [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
 - [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
 - [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)

-## Contribute and help
+## 💻 Usage

-To help the project you can:
+Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

- [Hacker news post](https://news.ycombinator.com/item?id=35726934) - help us out by voting if you like this project.
+### 💡 Example: Use GPT4ALL-J model

- If you have technological skills and want to contribute to development, have a look at the open issues. If you are new you can have a look at the [good-first-issue](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and [help-wanted](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels.
+See the [documentation](https://localai.io/basics/getting_started/#example-use-gpt4all-j-model-with-docker-compose)

- If you don't have technological skills you can still help improving documentation or add examples or share your user-stories with our community, any help and contribution is welcome!
+### 🔗 Resources

-## Usage
+- [How to build locally](https://localai.io/basics/build/index.html)
+- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
+- [Projects integrating LocalAI](https://localai.io/integrations/)

-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section. Here below you will find generic, quick instructions to get ready and use LocalAI.
-
-The easiest way to run LocalAI is by using `docker-compose` (to build locally, see [building LocalAI](https://localai.io/basics/build/index.html)):
-
-```bash
-
-git clone https://github.com/go-skynet/LocalAI
-
-cd LocalAI
-
-# (optional) Checkout a specific LocalAI tag
-# git checkout -b build <TAG>
-
-# copy your models to models/
-cp your-model.bin models/
-
-# (optional) Edit the .env file to set things like context size and threads
-# vim .env
-
-# start with docker-compose
-docker-compose up -d --pull always
-# or you can build the images with:
-# docker-compose up -d --build
-
-# Now API is accessible at localhost:8080
-curl http://localhost:8080/v1/models
-# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
-
-curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
-     "model": "your-model.bin",            
-     "prompt": "A long time ago in a galaxy far, far away",
-     "temperature": 0.7
-   }'
-```
-
-### Example: Use GPT4ALL-J model
-
-<details>
-
-```bash
-# Clone LocalAI
-git clone https://github.com/go-skynet/LocalAI
-
-cd LocalAI
-
-# (optional) Checkout a specific LocalAI tag
-# git checkout -b build <TAG>
-
-# Download gpt4all-j to models/
-wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
-
-# Use a template from the examples
-cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
-
-# (optional) Edit the .env file to set things like context size and threads
-# vim .env
-
-# start with docker-compose
-docker-compose up -d --pull always
-# or you can build the images with:
-# docker-compose up -d --build
-# Now API is accessible at localhost:8080
-curl http://localhost:8080/v1/models
-# {"object":"list","data":[{"id":"ggml-gpt4all-j","object":"model"}]}
-
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-gpt4all-j",
-     "messages": [{"role": "user", "content": "How are you?"}],
-     "temperature": 0.9 
-   }'
-
-# {"model":"ggml-gpt4all-j","choices":[{"message":{"role":"assistant","content":"I'm doing well, thanks. How about you?"}}]}
-```
-</details>
-
-
-### Build locally
-
-<details>
-
-In order to build the `LocalAI` container image locally you can use `docker`:
-
-```
-# build the image
-docker build -t localai .
-docker run localai
-```
-
-Or you can build the binary with `make`:
-
-```
-make build
-```
-
-</details>
-
-See the [build section](https://localai.io/basics/build/index.html) in our documentation for detailed instructions.
-
-### Run LocalAI in Kubernetes
-
-LocalAI can be installed inside Kubernetes with helm. See [installation instructions](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes).
-
-## Supported API endpoints
-
-See the [list of the LocalAI features](https://localai.io/features/index.html) for a full tour of the available API endpoints.
-
-## Frequently asked questions
-
-See [the FAQ](https://localai.io/faq/index.html) section for a list of common questions.
-
-## Projects already using LocalAI to run local models
-
-Feel free to open up a PR to get your project listed!
-
- [Kairos](https://github.com/kairos-io/kairos)
- [k8sgpt](https://github.com/k8sgpt-ai/k8sgpt#running-local-models)
- [Spark](https://github.com/cedriking/spark)
- [autogpt4all](https://github.com/aorumbayev/autogpt4all)
- [Mods](https://github.com/charmbracelet/mods)
- [Flowise](https://github.com/FlowiseAI/Flowise)
-
-## Sponsors
+## ❤️ Sponsors

 > Do you find LocalAI useful?

@@ -215,21 +127,17 @@ A huge thank you to our generous sponsors who support this project:
 |  [Spectro Cloud](https://www.spectrocloud.com/)  |  
 |  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs!  |

-## Star history
+## 🌟 Star history

 [![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)

-## License
+## 📖 License

 LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).

-MIT
+MIT - Author Ettore Di Giacinto

-## Author
-
-Ettore Di Giacinto and others
-
-## Acknowledgements
+## 🙇 Acknowledgements

 LocalAI couldn't have been built without the help of great software already available from the community. Thank you!

@@ -240,9 +148,12 @@ LocalAI couldn't have been built without the help of great software already avai
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/ggerganov/whisper.cpp
 - https://github.com/saharNooby/rwkv.cpp
+- https://github.com/rhasspy/piper
+- https://github.com/cmp-nct/ggllm.cpp

-## Contributors
+## 🤗 Contributors

+This is a community project, a special thanks to our contributors! 🤗
 <a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
 </a>
--- a/api/api.go
+++ b/api/api.go
@@ -2,6 +2,7 @@ package api

 import (
 	"errors"
+	"strings"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/api/localai"
@@ -89,6 +90,32 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	// Default middleware config
 	app.Use(recover.New())

+	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
+	auth := func(c *fiber.Ctx) error {
+		if len(options.ApiKeys) > 0 {
+			authHeader := c.Get("Authorization")
+			if authHeader == "" {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
+			}
+			authHeaderParts := strings.Split(authHeader, " ")
+			if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
+			}
+
+			apiKey := authHeaderParts[1]
+			validApiKey := false
+			for _, key := range options.ApiKeys {
+				if apiKey == key {
+					validApiKey = true
+				}
+			}
+			if !validApiKey {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
+			}
+		}
+		return c.Next()
+	}
+
 	if options.PreloadJSONModels != "" {
 		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cm, options.Galleries); err != nil {
 			return nil, err
@@ -116,42 +143,42 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	galleryService := localai.NewGalleryService(options.Loader.ModelPath)
 	galleryService.Start(options.Context, cm)

-	app.Get("/version", func(c *fiber.Ctx) error {
+	app.Get("/version", auth, func(c *fiber.Ctx) error {
 		return c.JSON(struct {
 			Version string `json:"version"`
 		}{Version: internal.PrintableVersion()})
 	})

-	app.Post("/models/apply", localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cm, galleryService.C, options.Galleries))
-	app.Get("/models/available", localai.ListModelFromGalleryEndpoint(options.Galleries, options.Loader.ModelPath))
-	app.Get("/models/jobs/:uuid", localai.GetOpStatusEndpoint(galleryService))
+	app.Post("/models/apply", auth, localai.ApplyModelGalleryEndpoint(options.Loader.ModelPath, cm, galleryService.C, options.Galleries))
+	app.Get("/models/available", auth, localai.ListModelFromGalleryEndpoint(options.Galleries, options.Loader.ModelPath))
+	app.Get("/models/jobs/:uuid", auth, localai.GetOpStatusEndpoint(galleryService))

 	// openAI compatible API endpoint

 	// chat
-	app.Post("/v1/chat/completions", openai.ChatEndpoint(cm, options))
-	app.Post("/chat/completions", openai.ChatEndpoint(cm, options))
+	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cm, options))
+	app.Post("/chat/completions", auth, openai.ChatEndpoint(cm, options))

 	// edit
-	app.Post("/v1/edits", openai.EditEndpoint(cm, options))
-	app.Post("/edits", openai.EditEndpoint(cm, options))
+	app.Post("/v1/edits", auth, openai.EditEndpoint(cm, options))
+	app.Post("/edits", auth, openai.EditEndpoint(cm, options))

 	// completion
-	app.Post("/v1/completions", openai.CompletionEndpoint(cm, options))
-	app.Post("/completions", openai.CompletionEndpoint(cm, options))
-	app.Post("/v1/engines/:model/completions", openai.CompletionEndpoint(cm, options))
+	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cm, options))
+	app.Post("/completions", auth, openai.CompletionEndpoint(cm, options))
+	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cm, options))

 	// embeddings
-	app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(cm, options))
-	app.Post("/embeddings", openai.EmbeddingsEndpoint(cm, options))
-	app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(cm, options))
+	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))
+	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))
+	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cm, options))

 	// audio
-	app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(cm, options))
-	app.Post("/tts", localai.TTSEndpoint(cm, options))
+	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cm, options))
+	app.Post("/tts", auth, localai.TTSEndpoint(cm, options))

 	// images
-	app.Post("/v1/images/generations", openai.ImageEndpoint(cm, options))
+	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cm, options))

 	if options.ImageDir != "" {
 		app.Static("/generated-images", options.ImageDir)
@@ -170,8 +197,8 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
 	app.Get("/readyz", ok)

 	// models
-	app.Get("/v1/models", openai.ListModelsEndpoint(options.Loader, cm))
-	app.Get("/models", openai.ListModelsEndpoint(options.Loader, cm))
+	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cm))
+	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cm))

 	// turn off any process that was started by GRPC if the context is canceled
 	go func() {
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -8,7 +8,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"io/ioutil"
 	"net/http"
 	"os"
 	"path/filepath"
@@ -30,10 +29,10 @@ import (
 )

 type modelApplyRequest struct {
-	ID        string            `json:"id"`
-	URL       string            `json:"url"`
-	Name      string            `json:"name"`
-	Overrides map[string]string `json:"overrides"`
+	ID        string                 `json:"id"`
+	URL       string                 `json:"url"`
+	Name      string                 `json:"name"`
+	Overrides map[string]interface{} `json:"overrides"`
 }

 func getModelStatus(url string) (response map[string]interface{}) {
@@ -45,7 +44,7 @@ func getModelStatus(url string) (response map[string]interface{}) {
 	}
 	defer resp.Body.Close()

-	body, err := ioutil.ReadAll(resp.Body)
+	body, err := io.ReadAll(resp.Body)
 	if err != nil {
 		fmt.Println("Error reading response body:", err)
 		return
@@ -97,7 +96,7 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
 	}
 	defer resp.Body.Close()

-	body, err := ioutil.ReadAll(resp.Body)
+	body, err := io.ReadAll(resp.Body)
 	if err != nil {
 		fmt.Println("Error reading response body:", err)
 		return
@@ -153,7 +152,7 @@ var _ = Describe("API test", func() {
 			}
 			out, err := yaml.Marshal(g)
 			Expect(err).ToNot(HaveOccurred())
-			err = ioutil.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0644)
+			err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0644)
 			Expect(err).ToNot(HaveOccurred())

 			galleries := []gallery.Gallery{
@@ -243,7 +242,7 @@ var _ = Describe("API test", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:  "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
 					Name: "bert",
-					Overrides: map[string]string{
+					Overrides: map[string]interface{}{
 						"backend": "llama",
 					},
 				})
@@ -269,7 +268,7 @@ var _ = Describe("API test", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
 					Name:      "bert",
-					Overrides: map[string]string{},
+					Overrides: map[string]interface{}{},
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -297,7 +296,7 @@ var _ = Describe("API test", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 					Name:      "openllama_3b",
-					Overrides: map[string]string{"backend": "llama-grammar"},
+					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -355,7 +354,7 @@ var _ = Describe("API test", func() {
 				var res map[string]string
 				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
 				Expect(err).ToNot(HaveOccurred())
-				Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
+				Expect(res["location"]).To(Equal("San Francisco, California, United States"), fmt.Sprint(res))
 				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 			})
@@ -366,9 +365,8 @@ var _ = Describe("API test", func() {
 				}

 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					URL:       "github:go-skynet/model-gallery/gpt4all-j.yaml",
-					Name:      "gpt4all-j",
-					Overrides: map[string]string{},
+					URL:  "github:go-skynet/model-gallery/gpt4all-j.yaml",
+					Name: "gpt4all-j",
 				})

 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -472,6 +470,9 @@ var _ = Describe("API test", func() {

 			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 				ID: "model-gallery@stablediffusion",
+				Overrides: map[string]interface{}{
+					"parameters": map[string]interface{}{"model": "stablediffusion_assets"},
+				},
 			})

 			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -536,7 +537,7 @@ var _ = Describe("API test", func() {
 		It("returns the models list", func() {
 			models, err := client.ListModels(context.TODO())
 			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(11))
+			Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
 		})
 		It("can generate completions", func() {
 			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: "abcdedfghikl"})
@@ -740,19 +741,14 @@ var _ = Describe("API test", func() {
 			cancel()
 			app.Shutdown()
 		})
-		It("can generate chat completions from config file", func() {
-			models, err := client.ListModels(context.TODO())
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(13))
-		})
-		It("can generate chat completions from config file", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+		It("can generate chat completions from config file (list1)", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
 		})
-		It("can generate chat completions from config file", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+		It("can generate chat completions from config file (list2)", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}}})
 			Expect(err).ToNot(HaveOccurred())
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
--- a/api/backend/embeddings.go
+++ b/api/backend/embeddings.go
@@ -23,10 +23,10 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.
 	var err error

 	opts := []model.Option{
-		model.WithLoadGRPCLLMModelOpts(grpcOpts),
+		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(c.Threads)),
 		model.WithAssetDir(o.AssetsDestination),
-		model.WithModelFile(modelFile),
+		model.WithModel(modelFile),
 		model.WithContext(o.Context),
 	}

--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -1,7 +1,6 @@
 package backend

 import (
-	"fmt"
 	"sync"

 	config "github.com/go-skynet/LocalAI/api/config"
@@ -11,16 +10,18 @@ import (
 )

 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
-	if c.Backend != model.StableDiffusionBackend {
-		return nil, fmt.Errorf("endpoint only working with stablediffusion models")
-	}

 	opts := []model.Option{
 		model.WithBackendString(c.Backend),
 		model.WithAssetDir(o.AssetsDestination),
 		model.WithThreads(uint32(c.Threads)),
 		model.WithContext(o.Context),
-		model.WithModelFile(c.ImageGenerationAssets),
+		model.WithModel(c.Model),
+		model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
+			CUDA:          c.Diffusers.CUDA,
+			SchedulerType: c.Diffusers.SchedulerType,
+			PipelineType:  c.Diffusers.PipelineType,
+		}),
 	}

 	for k, v := range o.ExternalGRPCBackends {
--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -1,6 +1,7 @@
 package backend

 import (
+	"context"
 	"os"
 	"regexp"
 	"strings"
@@ -14,7 +15,7 @@ import (
 	"github.com/go-skynet/LocalAI/pkg/utils"
 )

-func ModelInference(s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string) bool) (func() (string, error), error) {
+func ModelInference(ctx context.Context, s string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string) bool) (func() (string, error), error) {
 	modelFile := c.Model

 	grpcOpts := gRPCModelOpts(c)
@@ -23,10 +24,10 @@ func ModelInference(s string, loader *model.ModelLoader, c config.Config, o *opt
 	var err error

 	opts := []model.Option{
-		model.WithLoadGRPCLLMModelOpts(grpcOpts),
+		model.WithLoadGRPCLoadModelOpts(grpcOpts),
 		model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
 		model.WithAssetDir(o.AssetsDestination),
-		model.WithModelFile(modelFile),
+		model.WithModel(modelFile),
 		model.WithContext(o.Context),
 	}

@@ -66,17 +67,17 @@ func ModelInference(s string, loader *model.ModelLoader, c config.Config, o *opt
 		opts.Prompt = s
 		if tokenCallback != nil {
 			ss := ""
-			err := inferenceModel.PredictStream(o.Context, opts, func(s string) {
-				tokenCallback(s)
-				ss += s
+			err := inferenceModel.PredictStream(ctx, opts, func(s []byte) {
+				tokenCallback(string(s))
+				ss += string(s)
 			})
 			return ss, err
 		} else {
-			reply, err := inferenceModel.Predict(o.Context, opts)
+			reply, err := inferenceModel.Predict(ctx, opts)
 			if err != nil {
 				return "", err
 			}
-			return reply.Message, err
+			return string(reply.Message), err
 		}
 	}

--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -18,16 +18,26 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		ContextSize: int32(c.ContextSize),
 		Seed:        int32(c.Seed),
 		NBatch:      int32(b),
-		F16Memory:   c.F16,
-		MLock:       c.MMlock,
-		NUMA:        c.NUMA,
-		Embeddings:  c.Embeddings,
-		LowVRAM:     c.LowVRAM,
-		NGPULayers:  int32(c.NGPULayers),
-		MMap:        c.MMap,
-		MainGPU:     c.MainGPU,
-		Threads:     int32(c.Threads),
-		TensorSplit: c.TensorSplit,
+		NGQA:        c.NGQA,
+
+		RMSNormEps:    c.RMSNormEps,
+		F16Memory:     c.F16,
+		MLock:         c.MMlock,
+		RopeFreqBase:  c.RopeFreqBase,
+		RopeFreqScale: c.RopeFreqScale,
+		NUMA:          c.NUMA,
+		Embeddings:    c.Embeddings,
+		LowVRAM:       c.LowVRAM,
+		NGPULayers:    int32(c.NGPULayers),
+		MMap:          c.MMap,
+		MainGPU:       c.MainGPU,
+		Threads:       int32(c.Threads),
+		TensorSplit:   c.TensorSplit,
+		// AutoGPTQ
+		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
+		Device:           c.AutoGPTQ.Device,
+		UseTriton:        c.AutoGPTQ.Triton,
+		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
 	}
 }

@@ -39,34 +49,37 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
 		promptCachePath = p
 	}
 	return &pb.PredictOptions{
-		Temperature:     float32(c.Temperature),
-		TopP:            float32(c.TopP),
-		TopK:            int32(c.TopK),
-		Tokens:          int32(c.Maxtokens),
-		Threads:         int32(c.Threads),
-		PromptCacheAll:  c.PromptCacheAll,
-		PromptCacheRO:   c.PromptCacheRO,
-		PromptCachePath: promptCachePath,
-		F16KV:           c.F16,
-		DebugMode:       c.Debug,
-		Grammar:         c.Grammar,
-
-		Mirostat:          int32(c.Mirostat),
-		MirostatETA:       float32(c.MirostatETA),
-		MirostatTAU:       float32(c.MirostatTAU),
-		Debug:             c.Debug,
-		StopPrompts:       c.StopWords,
-		Repeat:            int32(c.RepeatPenalty),
-		NKeep:             int32(c.Keep),
-		Batch:             int32(c.Batch),
-		IgnoreEOS:         c.IgnoreEOS,
-		Seed:              int32(c.Seed),
-		FrequencyPenalty:  float32(c.FrequencyPenalty),
-		MLock:             c.MMlock,
-		MMap:              c.MMap,
-		MainGPU:           c.MainGPU,
-		TensorSplit:       c.TensorSplit,
-		TailFreeSamplingZ: float32(c.TFZ),
-		TypicalP:          float32(c.TypicalP),
+		Temperature:         float32(c.Temperature),
+		TopP:                float32(c.TopP),
+		TopK:                int32(c.TopK),
+		Tokens:              int32(c.Maxtokens),
+		Threads:             int32(c.Threads),
+		PromptCacheAll:      c.PromptCacheAll,
+		PromptCacheRO:       c.PromptCacheRO,
+		PromptCachePath:     promptCachePath,
+		F16KV:               c.F16,
+		DebugMode:           c.Debug,
+		Grammar:             c.Grammar,
+		NegativePromptScale: c.NegativePromptScale,
+		RopeFreqBase:        c.RopeFreqBase,
+		RopeFreqScale:       c.RopeFreqScale,
+		NegativePrompt:      c.NegativePrompt,
+		Mirostat:            int32(c.LLMConfig.Mirostat),
+		MirostatETA:         float32(c.LLMConfig.MirostatETA),
+		MirostatTAU:         float32(c.LLMConfig.MirostatTAU),
+		Debug:               c.Debug,
+		StopPrompts:         c.StopWords,
+		Repeat:              int32(c.RepeatPenalty),
+		NKeep:               int32(c.Keep),
+		Batch:               int32(c.Batch),
+		IgnoreEOS:           c.IgnoreEOS,
+		Seed:                int32(c.Seed),
+		FrequencyPenalty:    float32(c.FrequencyPenalty),
+		MLock:               c.MMlock,
+		MMap:                c.MMap,
+		MainGPU:             c.MainGPU,
+		TensorSplit:         c.TensorSplit,
+		TailFreeSamplingZ:   float32(c.TFZ),
+		TypicalP:            float32(c.TypicalP),
 	}
 }
--- a/api/backend/transcript.go
+++ b/api/backend/transcript.go
@@ -15,7 +15,7 @@ import (
 func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*api.Result, error) {
 	opts := []model.Option{
 		model.WithBackendString(model.WhisperBackend),
-		model.WithModelFile(c.Model),
+		model.WithModel(c.Model),
 		model.WithContext(o.Context),
 		model.WithThreads(uint32(c.Threads)),
 		model.WithAssetDir(o.AssetsDestination),
--- a/api/backend/tts.go
+++ b/api/backend/tts.go
@@ -28,10 +28,14 @@ func generateUniqueFileName(dir, baseName, ext string) string {
 	}
 }

-func ModelTTS(text, modelFile string, loader *model.ModelLoader, o *options.Option) (string, *proto.Result, error) {
+func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option) (string, *proto.Result, error) {
+	bb := backend
+	if bb == "" {
+		bb = model.PiperBackend
+	}
 	opts := []model.Option{
-		model.WithBackendString(model.PiperBackend),
-		model.WithModelFile(modelFile),
+		model.WithBackendString(bb),
+		model.WithModel(modelFile),
 		model.WithContext(o.Context),
 		model.WithAssetDir(o.AssetsDestination),
 	}
@@ -56,10 +60,13 @@ func ModelTTS(text, modelFile string, loader *model.ModelLoader, o *options.Opti
 	fileName := generateUniqueFileName(o.AudioDir, "piper", ".wav")
 	filePath := filepath.Join(o.AudioDir, fileName)

-	modelPath := filepath.Join(o.Loader.ModelPath, modelFile)
-
-	if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
-		return "", nil, err
+	// If the model file is not empty, we pass it joined with the model path
+	modelPath := ""
+	if modelFile != "" {
+		modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
+		if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
+			return "", nil, err
+		}
 	}

 	res, err := piperModel.TTS(context.Background(), &proto.TTSRequest{
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -13,44 +13,69 @@ import (

 type Config struct {
 	PredictionOptions `yaml:"parameters"`
-	Name              string            `yaml:"name"`
-	StopWords         []string          `yaml:"stopwords"`
-	Cutstrings        []string          `yaml:"cutstrings"`
-	TrimSpace         []string          `yaml:"trimspace"`
-	ContextSize       int               `yaml:"context_size"`
-	F16               bool              `yaml:"f16"`
-	NUMA              bool              `yaml:"numa"`
-	Threads           int               `yaml:"threads"`
-	Debug             bool              `yaml:"debug"`
-	Roles             map[string]string `yaml:"roles"`
-	Embeddings        bool              `yaml:"embeddings"`
-	Backend           string            `yaml:"backend"`
-	TemplateConfig    TemplateConfig    `yaml:"template"`
-	MirostatETA       float64           `yaml:"mirostat_eta"`
-	MirostatTAU       float64           `yaml:"mirostat_tau"`
-	Mirostat          int               `yaml:"mirostat"`
-	NGPULayers        int               `yaml:"gpu_layers"`
-	MMap              bool              `yaml:"mmap"`
-	MMlock            bool              `yaml:"mmlock"`
-	LowVRAM           bool              `yaml:"low_vram"`
+	Name              string `yaml:"name"`

-	TensorSplit           string `yaml:"tensor_split"`
-	MainGPU               string `yaml:"main_gpu"`
-	ImageGenerationAssets string `yaml:"asset_dir"`
+	F16            bool              `yaml:"f16"`
+	Threads        int               `yaml:"threads"`
+	Debug          bool              `yaml:"debug"`
+	Roles          map[string]string `yaml:"roles"`
+	Embeddings     bool              `yaml:"embeddings"`
+	Backend        string            `yaml:"backend"`
+	TemplateConfig TemplateConfig    `yaml:"template"`

-	PromptCachePath string `yaml:"prompt_cache_path"`
-	PromptCacheAll  bool   `yaml:"prompt_cache_all"`
-	PromptCacheRO   bool   `yaml:"prompt_cache_ro"`
-
-	Grammar string `yaml:"grammar"`
-
-	PromptStrings, InputStrings                []string
-	InputToken                                 [][]int
-	functionCallString, functionCallNameString string
+	PromptStrings, InputStrings                []string `yaml:"-"`
+	InputToken                                 [][]int  `yaml:"-"`
+	functionCallString, functionCallNameString string   `yaml:"-"`

 	FunctionsConfig Functions `yaml:"function"`

-	SystemPrompt string `yaml:"system_prompt"`
+	// LLM configs (GPT4ALL, Llama.cpp, ...)
+	LLMConfig `yaml:",inline"`
+
+	// AutoGPTQ specifics
+	AutoGPTQ AutoGPTQ `yaml:"autogptq"`
+
+	// Diffusers
+	Diffusers Diffusers `yaml:"diffusers"`
+
+	Step int `yaml:"step"`
+}
+
+type Diffusers struct {
+	PipelineType  string `yaml:"pipeline_type"`
+	SchedulerType string `yaml:"scheduler_type"`
+	CUDA          bool   `yaml:"cuda"`
+}
+
+type LLMConfig struct {
+	SystemPrompt    string   `yaml:"system_prompt"`
+	TensorSplit     string   `yaml:"tensor_split"`
+	MainGPU         string   `yaml:"main_gpu"`
+	RMSNormEps      float32  `yaml:"rms_norm_eps"`
+	NGQA            int32    `yaml:"ngqa"`
+	PromptCachePath string   `yaml:"prompt_cache_path"`
+	PromptCacheAll  bool     `yaml:"prompt_cache_all"`
+	PromptCacheRO   bool     `yaml:"prompt_cache_ro"`
+	MirostatETA     float64  `yaml:"mirostat_eta"`
+	MirostatTAU     float64  `yaml:"mirostat_tau"`
+	Mirostat        int      `yaml:"mirostat"`
+	NGPULayers      int      `yaml:"gpu_layers"`
+	MMap            bool     `yaml:"mmap"`
+	MMlock          bool     `yaml:"mmlock"`
+	LowVRAM         bool     `yaml:"low_vram"`
+	Grammar         string   `yaml:"grammar"`
+	StopWords       []string `yaml:"stopwords"`
+	Cutstrings      []string `yaml:"cutstrings"`
+	TrimSpace       []string `yaml:"trimspace"`
+	ContextSize     int      `yaml:"context_size"`
+	NUMA            bool     `yaml:"numa"`
+}
+
+type AutoGPTQ struct {
+	ModelBaseName    string `yaml:"model_base_name"`
+	Device           string `yaml:"device"`
+	Triton           bool   `yaml:"triton"`
+	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
 }

 type Functions struct {
@@ -172,6 +197,16 @@ func (cm *ConfigLoader) GetConfig(m string) (Config, bool) {
 	return v, exists
 }

+func (cm *ConfigLoader) GetAllConfigs() []Config {
+	cm.Lock()
+	defer cm.Unlock()
+	var res []Config
+	for _, v := range cm.configs {
+		res = append(res, v)
+	}
+	return res
+}
+
 func (cm *ConfigLoader) ListConfigs() []string {
 	cm.Lock()
 	defer cm.Unlock()
--- a/api/config/prediction.go
+++ b/api/config/prediction.go
@@ -34,4 +34,11 @@ type PredictionOptions struct {

 	TypicalP float64 `json:"typical_p" yaml:"typical_p"`
 	Seed     int     `json:"seed" yaml:"seed"`
+
+	NegativePrompt      string  `json:"negative_prompt" yaml:"negative_prompt"`
+	RopeFreqBase        float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
+	RopeFreqScale       float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
+	NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
+	// AutoGPTQ
+	UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
 }
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -8,6 +8,7 @@ import (
 	"sync"

 	json "github.com/json-iterator/go"
+	"gopkg.in/yaml.v3"

 	config "github.com/go-skynet/LocalAI/api/config"
 	"github.com/go-skynet/LocalAI/pkg/gallery"
@@ -128,8 +129,27 @@ func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
 }

 type galleryModel struct {
-	gallery.GalleryModel
-	ID string `json:"id"`
+	gallery.GalleryModel `yaml:",inline"` // https://github.com/go-yaml/yaml/issues/63
+	ID                   string           `json:"id"`
+}
+
+func processRequests(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery, requests []galleryModel) error {
+	var err error
+	for _, r := range requests {
+		utils.ResetDownloadTimers()
+		if r.ID == "" {
+			err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
+		} else {
+			if strings.Contains(r.ID, "@") {
+				err = gallery.InstallModelFromGallery(
+					galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
+			} else {
+				err = gallery.InstallModelFromGalleryByName(
+					galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
+			}
+		}
+	}
+	return err
 }

 func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
@@ -137,7 +157,13 @@ func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, gallerie
 	if err != nil {
 		return err
 	}
-	return ApplyGalleryFromString(modelPath, string(dat), cm, galleries)
+	var requests []galleryModel
+
+	if err := yaml.Unmarshal(dat, &requests); err != nil {
+		return err
+	}
+
+	return processRequests(modelPath, s, cm, galleries, requests)
 }

 func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
@@ -147,16 +173,7 @@ func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galler
 		return err
 	}

-	for _, r := range requests {
-		utils.ResetDownloadTimers()
-		if r.ID == "" {
-			err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
-		} else {
-			err = gallery.InstallModelFromGallery(galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
-		}
-	}
-
-	return err
+	return processRequests(modelPath, s, cm, galleries, requests)
 }

 /// Endpoints
--- a/api/localai/localai.go
+++ b/api/localai/localai.go
@@ -9,8 +9,9 @@ import (
 )

 type TTSRequest struct {
-	Model string `json:"model" yaml:"model"`
-	Input string `json:"input" yaml:"input"`
+	Model   string `json:"model" yaml:"model"`
+	Input   string `json:"input" yaml:"input"`
+	Backend string `json:"backend" yaml:"backend"`
 }

 func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
@@ -22,7 +23,7 @@ func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			return err
 		}

-		filePath, _, err := backend.ModelTTS(input.Input, input.Model, o.Loader, o)
+		filePath, _, err := backend.ModelTTS(input.Backend, input.Input, input.Model, o.Loader, o)
 		if err != nil {
 			return err
 		}
--- a/api/openai/api.go
+++ b/api/openai/api.go
@@ -1,6 +1,8 @@
 package openai

 import (
+	"context"
+
 	config "github.com/go-skynet/LocalAI/api/config"

 	"github.com/go-skynet/LocalAI/pkg/grammar"
@@ -70,6 +72,9 @@ type OpenAIModel struct {
 type OpenAIRequest struct {
 	config.PredictionOptions

+	Context context.Context
+	Cancel  context.CancelFunc
+
 	// whisper
 	File string `json:"file" validate:"required"`
 	//whisper/image
@@ -102,4 +107,9 @@ type OpenAIRequest struct {
 	Grammar string `json:"grammar" yaml:"grammar"`

 	JSONFunctionGrammarObject *grammar.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"`
+
+	Backend string `json:"backend" yaml:"backend"`
+
+	// AutoGPTQ
+	ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
 }
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -12,6 +12,7 @@ import (
 	"github.com/go-skynet/LocalAI/api/options"
 	"github.com/go-skynet/LocalAI/pkg/grammar"
 	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
@@ -28,7 +29,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		}
 		responses <- initialMessage

-		ComputeChoices(s, req.N, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
+		ComputeChoices(req, s, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
 			resp := OpenAIResponse{
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 				Choices: []Choice{{Delta: &Message{Content: &s}, Index: 0}},
@@ -43,7 +44,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 	return func(c *fiber.Ctx) error {
 		processFunctions := false
 		funcs := grammar.Functions{}
-		modelFile, input, err := readInput(c, o.Loader, true)
+		modelFile, input, err := readInput(c, o, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -109,6 +110,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		var predInput string

+		suppressConfigSystemPrompt := false
 		mess := []string{}
 		for messageIndex, i := range input.Messages {
 			var content string
@@ -146,7 +148,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					content = templatedChatMessage
 				}
 			}
-			// If this model doesn't have such a template, or if
+			// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
 			if content == "" {
 				if r != "" {
 					if contentExists {
@@ -177,6 +179,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 						}
 					}
 				}
+				// Special Handling: System. We care if it was printed at all, not the r branch, so check seperately
+				if contentExists && role == "system" {
+					suppressConfigSystemPrompt = true
+				}
 			}

 			mess = append(mess, content)
@@ -207,8 +213,10 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)

 		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 		templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
-			Input:     predInput,
-			Functions: funcs,
+			SystemPrompt:         config.SystemPrompt,
+			SuppressSystemPrompt: suppressConfigSystemPrompt,
+			Input:                predInput,
+			Functions:            funcs,
 		})
 		if err == nil {
 			predInput = templatedInput
@@ -235,7 +243,12 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					enc.Encode(ev)

 					log.Debug().Msgf("Sending chunk: %s", buf.String())
-					fmt.Fprintf(w, "data: %v\n", buf.String())
+					_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
+					if err != nil {
+						log.Debug().Msgf("Sending chunk failed: %v", err)
+						input.Cancel()
+						break
+					}
 					w.Flush()
 				}

@@ -258,10 +271,12 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 			return nil
 		}

-		result, err := ComputeChoices(predInput, input.N, config, o, o.Loader, func(s string, c *[]Choice) {
+		result, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]Choice) {
 			if processFunctions {
 				// As we have to change the result before processing, we can't stream the answer (yet?)
 				ss := map[string]interface{}{}
+				// This prevent newlines to break JSON parsing for clients
+				s = utils.EscapeNewLines(s)
 				json.Unmarshal([]byte(s), &ss)
 				log.Debug().Msgf("Function return: %s %+v", s, ss)

@@ -300,7 +315,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 					// Otherwise ask the LLM to understand the JSON output and the context, and return a message
 					// Note: This costs (in term of CPU) another computation
 					config.Grammar = ""
-					predFunc, err := backend.ModelInference(predInput, o.Loader, *config, o, nil)
+					predFunc, err := backend.ModelInference(input.Context, predInput, o.Loader, *config, o, nil)
 					if err != nil {
 						log.Error().Msgf("inference error: %s", err.Error())
 						return
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -18,7 +18,7 @@ import (
 // https://platform.openai.com/docs/api-reference/completions
 func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	process := func(s string, req *OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
-		ComputeChoices(s, req.N, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
+		ComputeChoices(req, s, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
 			resp := OpenAIResponse{
 				Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
 				Choices: []Choice{
@@ -38,7 +38,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 	}

 	return func(c *fiber.Ctx) error {
-		modelFile, input, err := readInput(c, o.Loader, true)
+		modelFile, input, err := readInput(c, o, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -123,14 +123,15 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
 		for k, i := range config.PromptStrings {
 			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
-				Input: i,
+				SystemPrompt: config.SystemPrompt,
+				Input:        i,
 			})
 			if err == nil {
 				i = templatedInput
 				log.Debug().Msgf("Template found, input modified to: %s", i)
 			}

-			r, err := ComputeChoices(i, input.N, config, o, o.Loader, func(s string, c *[]Choice) {
+			r, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]Choice) {
 				*c = append(*c, Choice{Text: s, FinishReason: "stop", Index: k})
 			}, nil)
 			if err != nil {
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -13,7 +13,7 @@ import (

 func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		modelFile, input, err := readInput(c, o.Loader, true)
+		modelFile, input, err := readInput(c, o, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -35,15 +35,16 @@ func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
 		for _, i := range config.InputStrings {
 			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
-				Input:       i,
-				Instruction: input.Instruction,
+				Input:        i,
+				Instruction:  input.Instruction,
+				SystemPrompt: config.SystemPrompt,
 			})
 			if err == nil {
 				i = templatedInput
 				log.Debug().Msgf("Template found, input modified to: %s", i)
 			}

-			r, err := ComputeChoices(i, input.N, config, o, o.Loader, func(s string, c *[]Choice) {
+			r, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]Choice) {
 				*c = append(*c, Choice{Text: s})
 			}, nil)
 			if err != nil {
--- a/api/openai/embeddings.go
+++ b/api/openai/embeddings.go
@@ -14,7 +14,7 @@ import (
 // https://platform.openai.com/docs/api-reference/embeddings
 func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		model, input, err := readInput(c, o.Loader, true)
+		model, input, err := readInput(c, o, true)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -4,7 +4,6 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
-	"io/ioutil"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -35,7 +34,7 @@ import (
 */
 func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		m, input, err := readInput(c, o.Loader, false)
+		m, input, err := readInput(c, o, false)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
@@ -90,7 +89,10 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 				}

 				mode := 0
-				step := 15
+				step := config.Step
+				if step == 0 {
+					step = 15
+				}

 				if input.Mode != 0 {
 					mode = input.Mode
@@ -105,7 +107,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
 					tempDir = o.ImageDir
 				}
 				// Create a temporary file
-				outputFile, err := ioutil.TempFile(tempDir, "b64")
+				outputFile, err := os.CreateTemp(tempDir, "b64")
 				if err != nil {
 					return err
 				}
--- a/api/openai/inference.go
+++ b/api/openai/inference.go
@@ -7,7 +7,8 @@ import (
 	model "github.com/go-skynet/LocalAI/pkg/model"
 )

-func ComputeChoices(predInput string, n int, config *config.Config, o *options.Option, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
+func ComputeChoices(req *OpenAIRequest, predInput string, config *config.Config, o *options.Option, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
+	n := req.N
 	result := []Choice{}

 	if n == 0 {
@@ -15,7 +16,7 @@ func ComputeChoices(predInput string, n int, config *config.Config, o *options.O
 	}

 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(predInput, loader, *config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, loader, *config, o, tokenCallback)
 	if err != nil {
 		return result, err
 	}
--- a/api/openai/list.go
+++ b/api/openai/list.go
@@ -1,6 +1,8 @@
 package openai

 import (
+	"regexp"
+
 	config "github.com/go-skynet/LocalAI/api/config"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
@@ -15,14 +17,43 @@ func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func
 		var mm map[string]interface{} = map[string]interface{}{}

 		dataModels := []OpenAIModel{}
-		for _, m := range models {
-			mm[m] = nil
-			dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
+
+		var filterFn func(name string) bool
+		filter := c.Query("filter")
+
+		// If filter is not specified, do not filter the list by model name
+		if filter == "" {
+			filterFn = func(_ string) bool { return true }
+		} else {
+			// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
+			rxp, err := regexp.Compile(filter)
+			if err != nil {
+				return err
+			}
+			filterFn = func(name string) bool {
+				return rxp.MatchString(name)
+			}
 		}

-		for _, k := range cm.ListConfigs() {
-			if _, exists := mm[k]; !exists {
-				dataModels = append(dataModels, OpenAIModel{ID: k, Object: "model"})
+		// By default, exclude any loose files that are already referenced by a configuration file.
+		excludeConfigured := c.QueryBool("excludeConfigured", true)
+
+		// Start with the known configurations
+		for _, c := range cm.GetAllConfigs() {
+			if excludeConfigured {
+				mm[c.Model] = nil
+			}
+
+			if filterFn(c.Name) {
+				dataModels = append(dataModels, OpenAIModel{ID: c.Name, Object: "model"})
+			}
+		}
+
+		// Then iterate through the loose files:
+		for _, m := range models {
+			// And only adds them if they shouldn't be skipped.
+			if _, exists := mm[m]; !exists && filterFn(m) {
+				dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
 			}
 		}

--- a/api/openai/request.go
+++ b/api/openai/request.go
@@ -1,6 +1,7 @@
 package openai

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"os"
@@ -8,13 +9,18 @@ import (
 	"strings"

 	config "github.com/go-skynet/LocalAI/api/config"
+	options "github.com/go-skynet/LocalAI/api/options"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
 )

-func readInput(c *fiber.Ctx, loader *model.ModelLoader, randomModel bool) (string, *OpenAIRequest, error) {
+func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *OpenAIRequest, error) {
+	loader := o.Loader
 	input := new(OpenAIRequest)
+	ctx, cancel := context.WithCancel(o.Context)
+	input.Context = ctx
+	input.Cancel = cancel
 	// Get input data from the request body
 	if err := c.BodyParser(input); err != nil {
 		return "", nil, err
@@ -65,6 +71,34 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 		config.TopP = input.TopP
 	}

+	if input.Backend != "" {
+		config.Backend = input.Backend
+	}
+
+	if input.ModelBaseName != "" {
+		config.AutoGPTQ.ModelBaseName = input.ModelBaseName
+	}
+
+	if input.NegativePromptScale != 0 {
+		config.NegativePromptScale = input.NegativePromptScale
+	}
+
+	if input.UseFastTokenizer {
+		config.UseFastTokenizer = input.UseFastTokenizer
+	}
+
+	if input.NegativePrompt != "" {
+		config.NegativePrompt = input.NegativePrompt
+	}
+
+	if input.RopeFreqBase != 0 {
+		config.RopeFreqBase = input.RopeFreqBase
+	}
+
+	if input.RopeFreqScale != 0 {
+		config.RopeFreqScale = input.RopeFreqScale
+	}
+
 	if input.Grammar != "" {
 		config.Grammar = input.Grammar
 	}
@@ -115,15 +149,15 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 	}

 	if input.Mirostat != 0 {
-		config.Mirostat = input.Mirostat
+		config.LLMConfig.Mirostat = input.Mirostat
 	}

 	if input.MirostatETA != 0 {
-		config.MirostatETA = input.MirostatETA
+		config.LLMConfig.MirostatETA = input.MirostatETA
 	}

 	if input.MirostatTAU != 0 {
-		config.MirostatTAU = input.MirostatTAU
+		config.LLMConfig.MirostatTAU = input.MirostatTAU
 	}

 	if input.TypicalP != 0 {
@@ -161,7 +195,7 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 		n, exists := fnc["name"]
 		if exists {
 			nn, e := n.(string)
-			if !e {
+			if e {
 				name = nn
 			}
 		}
--- a/api/openai/transcription.go
+++ b/api/openai/transcription.go
@@ -19,7 +19,7 @@ import (
 // https://platform.openai.com/docs/api-reference/audio/create
 func TranscriptEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		m, input, err := readInput(c, o.Loader, false)
+		m, input, err := readInput(c, o, false)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
--- a/api/options/options.go
+++ b/api/options/options.go
@@ -23,6 +23,7 @@ type Option struct {
 	PreloadJSONModels                   string
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
+	ApiKeys                             []string

 	Galleries []gallery.Gallery

@@ -184,3 +185,9 @@ func WithImageDir(imageDir string) AppOption {
 		o.ImageDir = imageDir
 	}
 }
+
+func WithApiKeys(apiKeys []string) AppOption {
+	return func(o *Option) {
+		o.ApiKeys = apiKeys
+	}
+}
--- a/cmd/grpc/llama-grammar/main.go
+++ b/cmd/grpc/llama-grammar/main.go
@@ -1,25 +0,0 @@
-package main
-
-// GRPC Falcon server
-
-// Note: this is started internally by LocalAI and a server is allocated for each model
-
-import (
-	"flag"
-
-	llama "github.com/go-skynet/LocalAI/pkg/grpc/llm/llama-grammar"
-
-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
-		panic(err)
-	}
-}
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -16,6 +16,25 @@ else
 	echo "see the documentation at: https://localai.io/basics/build/index.html"
 	echo "Note: See also https://github.com/go-skynet/LocalAI/issues/288"
 	echo "@@@@@"
+	echo "CPU info:"
+	grep -e "model\sname" /proc/cpuinfo | head -1
+	grep -e "flags" /proc/cpuinfo | head -1
+	if grep -q -e "\savx\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX    found OK"
+	else
+		echo "CPU: no AVX    found"
+	fi
+	if grep -q -e "\savx2\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX2   found OK"
+	else
+		echo "CPU: no AVX2   found"
+	fi
+	if grep -q -e "\savx512" /proc/cpuinfo ; then
+		echo "CPU:    AVX512 found OK"
+	else
+		echo "CPU: no AVX512 found"
+	fi
+	echo "@@@@@"
 fi

 ./local-ai "$@"
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,7 +1,16 @@
 # Examples

+| [ChatGPT OSS alternative](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui)                                                                                                                | [Image generation](https://localai.io/api-endpoints/index.html#image-generation)                                                                                                              |
+|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
+|  ![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)            | ![b6441997879](https://github.com/go-skynet/LocalAI/assets/2420543/d50af51c-51b7-4f39-b6c2-bf04c403894c)                  |
+
+|                                                                    [Telegram bot](https://github.com/go-skynet/LocalAI/tree/master/examples/telegram-bot)   | [Flowise](https://github.com/go-skynet/LocalAI/tree/master/examples/flowise)                                                                                                                     |
+|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
+![Screenshot from 2023-06-09 00-36-26](https://github.com/go-skynet/LocalAI/assets/2420543/e98b4305-fa2d-41cf-9d2f-1bb2d75ca902)   |  ![Screenshot from 2023-05-30 18-01-03](https://github.com/go-skynet/LocalAI/assets/2420543/02458782-0549-4131-971c-95ee56ec1af8)|    |
+
 Here is a list of projects that can easily be integrated with the LocalAI backend. 

+
 ### Projects

 ### AutoGPT
--- a/examples/k8sgpt/README.md
+++ b/examples/k8sgpt/README.md
@@ -38,7 +38,7 @@ helm install local-ai go-skynet/local-ai --create-namespace --namespace local-ai
 # Install k8sgpt
 helm repo add k8sgpt https://charts.k8sgpt.ai/
 helm repo update
-helm install release k8sgpt/k8sgpt-operator -n k8sgpt-operator-system --create-namespace
+helm install release k8sgpt/k8sgpt-operator -n k8sgpt-operator-system --create-namespace --version 0.0.17
 ```

 Apply the k8sgpt-operator configuration:
@@ -55,7 +55,6 @@ spec:
  baseUrl: http://local-ai.local-ai.svc.cluster.local:8080/v1
  noCache: false
  model: gpt-3.5-turbo
-  noCache: false
  version: v0.3.0
  enableAI: true
 EOF
@@ -67,4 +66,7 @@ Apply a broken pod:

 ```
 kubectl apply -f broken-pod.yaml
-```
+```
+
+## ArgoCD Deployment Example
+[Deploy K8sgpt + localai with Argocd](https://github.com/tyler-harpool/gitops/tree/main/infra/k8gpt)
--- a/examples/k8sgpt/values.yaml
+++ b/examples/k8sgpt/values.yaml
@@ -2,12 +2,13 @@ replicaCount: 1

 deployment:
  # https://quay.io/repository/go-skynet/local-ai?tab=tags
-  image: quay.io/go-skynet/local-ai:latest
+  image: quay.io/go-skynet/local-ai:v1.23.0
  env:
    threads: 4
    debug: "true"
    context_size: 512
-    preload_models: '[{ "url": "github:go-skynet/model-gallery/wizard.yaml", "name": "gpt-3.5-turbo", "overrides": { "parameters": { "model": "WizardLM-7B-uncensored.ggmlv3.q5_1" }},"files": [ { "uri": "https://huggingface.co//WizardLM-7B-uncensored-GGML/resolve/main/WizardLM-7B-uncensored.ggmlv3.q5_1.bin", "sha256": "d92a509d83a8ea5e08ba4c2dbaf08f29015932dc2accd627ce0665ac72c2bb2b", "filename": "WizardLM-7B-uncensored.ggmlv3.q5_1" }]}]'
+    galleries: '[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
+    preload_models: '[{ "id": "huggingface@thebloke__open-llama-13b-open-instruct-ggml__open-llama-13b-open-instruct.ggmlv3.q3_k_m.bin", "name": "gpt-3.5-turbo", "overrides": { "f16": true, "mmap": true }}]'
  modelsPath: "/models"

 resources:
--- a/examples/langchain-chroma/query.py
+++ b/examples/langchain-chroma/query.py
@@ -9,7 +9,7 @@ from langchain.vectorstores.base import VectorStoreRetriever
 base_path = os.environ.get('OPENAI_API_BASE', 'http://localhost:8080/v1')

 # Load and process the text
-embedding = OpenAIEmbeddings()
+embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_base=base_path)
 persist_directory = 'db'

 # Now we can load the persisted database from disk, and use it as normal. 
--- a/examples/langchain-chroma/store.py
+++ b/examples/langchain-chroma/store.py
@@ -18,8 +18,8 @@ texts = text_splitter.split_documents(documents)
 # Supplying a persist_directory will store the embeddings on disk
 persist_directory = 'db'

-embedding = OpenAIEmbeddings(model="text-embedding-ada-002")
+embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_base=base_path)
 vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=persist_directory)

 vectordb.persist()
-vectordb = None
+vectordb = None
--- a/extra/grpc/autogptq/autogptq.py
+++ b/extra/grpc/autogptq/autogptq.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+import grpc
+from concurrent import futures
+import time
+import backend_pb2
+import backend_pb2_grpc
+import argparse
+import signal
+import sys
+import os
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+from pathlib import Path
+from transformers import AutoTokenizer
+from transformers import TextGenerationPipeline
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    def Health(self, request, context):
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+    def LoadModel(self, request, context):
+        try:
+            device = "cuda:0"
+            if request.Device != "":
+                device = request.Device
+
+            tokenizer = AutoTokenizer.from_pretrained(request.Model, use_fast=request.UseFastTokenizer)
+
+            model = AutoGPTQForCausalLM.from_quantized(request.Model,
+                    model_basename=request.ModelBaseName,
+                    use_safetensors=True,
+                    trust_remote_code=True,
+                    device=device,
+                    use_triton=request.UseTriton,
+                    quantize_config=None)
+            
+            self.model = model
+            self.tokenizer = tokenizer
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def Predict(self, request, context):
+        penalty = 1.0
+        if request.Penalty != 0.0:
+            penalty = request.Penalty
+        tokens = 512
+        if request.Tokens != 0:
+            tokens = request.Tokens
+        top_p = 0.95
+        if request.TopP != 0.0:
+            top_p = request.TopP
+
+        # Implement Predict RPC
+        pipeline = TextGenerationPipeline(
+            model=self.model, 
+            tokenizer=self.tokenizer,
+            max_new_tokens=tokens,
+            temperature=request.Temperature,
+            top_p=top_p,
+            repetition_penalty=penalty,
+            )
+        t = pipeline(request.Prompt)[0]["generated_text"]
+        # Remove prompt from response if present
+        if request.Prompt in t:
+            t = t.replace(request.Prompt, "")
+
+        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
+
+    def PredictStream(self, request, context):
+        # Implement PredictStream RPC
+        #for reply in some_data_generator():
+        #    yield reply
+        # Not implemented yet
+        return self.Predict(request, context)
+
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
--- a/extra/grpc/autogptq/backend_pb2.py
+++ b/extra/grpc/autogptq/backend_pb2.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: backend.proto
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rbackend.proto\x12\x07\x62\x61\x63kend\"\x0f\n\rHealthMessage\"\x86\x06\n\x0ePredictOptions\x12\x0e\n\x06Prompt\x18\x01 \x01(\t\x12\x0c\n\x04Seed\x18\x02 \x01(\x05\x12\x0f\n\x07Threads\x18\x03 \x01(\x05\x12\x0e\n\x06Tokens\x18\x04 \x01(\x05\x12\x0c\n\x04TopK\x18\x05 \x01(\x05\x12\x0e\n\x06Repeat\x18\x06 \x01(\x05\x12\r\n\x05\x42\x61tch\x18\x07 \x01(\x05\x12\r\n\x05NKeep\x18\x08 \x01(\x05\x12\x13\n\x0bTemperature\x18\t \x01(\x02\x12\x0f\n\x07Penalty\x18\n \x01(\x02\x12\r\n\x05\x46\x31\x36KV\x18\x0b \x01(\x08\x12\x11\n\tDebugMode\x18\x0c \x01(\x08\x12\x13\n\x0bStopPrompts\x18\r \x03(\t\x12\x11\n\tIgnoreEOS\x18\x0e \x01(\x08\x12\x19\n\x11TailFreeSamplingZ\x18\x0f \x01(\x02\x12\x10\n\x08TypicalP\x18\x10 \x01(\x02\x12\x18\n\x10\x46requencyPenalty\x18\x11 \x01(\x02\x12\x17\n\x0fPresencePenalty\x18\x12 \x01(\x02\x12\x10\n\x08Mirostat\x18\x13 \x01(\x05\x12\x13\n\x0bMirostatETA\x18\x14 \x01(\x02\x12\x13\n\x0bMirostatTAU\x18\x15 \x01(\x02\x12\x12\n\nPenalizeNL\x18\x16 \x01(\x08\x12\x11\n\tLogitBias\x18\x17 \x01(\t\x12\r\n\x05MLock\x18\x19 \x01(\x08\x12\x0c\n\x04MMap\x18\x1a \x01(\x08\x12\x16\n\x0ePromptCacheAll\x18\x1b \x01(\x08\x12\x15\n\rPromptCacheRO\x18\x1c \x01(\x08\x12\x0f\n\x07Grammar\x18\x1d \x01(\t\x12\x0f\n\x07MainGPU\x18\x1e \x01(\t\x12\x13\n\x0bTensorSplit\x18\x1f \x01(\t\x12\x0c\n\x04TopP\x18  \x01(\x02\x12\x17\n\x0fPromptCachePath\x18! \x01(\t\x12\r\n\x05\x44\x65\x62ug\x18\" \x01(\x08\x12\x17\n\x0f\x45mbeddingTokens\x18# \x03(\x05\x12\x12\n\nEmbeddings\x18$ \x01(\t\x12\x14\n\x0cRopeFreqBase\x18% \x01(\x02\x12\x15\n\rRopeFreqScale\x18& \x01(\x02\x12\x1b\n\x13NegativePromptScale\x18\' \x01(\x02\x12\x16\n\x0eNegativePrompt\x18( \x01(\t\"\x18\n\x05Reply\x12\x0f\n\x07message\x18\x01 \x01(\x0c\"\x9d\x04\n\x0cModelOptions\x12\r\n\x05Model\x18\x01 \x01(\t\x12\x13\n\x0b\x43ontextSize\x18\x02 \x01(\x05\x12\x0c\n\x04Seed\x18\x03 \x01(\x05\x12\x0e\n\x06NBatch\x18\x04 \x01(\x05\x12\x11\n\tF16Memory\x18\x05 \x01(\x08\x12\r\n\x05MLock\x18\x06 \x01(\x08\x12\x0c\n\x04MMap\x18\x07 \x01(\x08\x12\x11\n\tVocabOnly\x18\x08 \x01(\x08\x12\x0f\n\x07LowVRAM\x18\t \x01(\x08\x12\x12\n\nEmbeddings\x18\n \x01(\x08\x12\x0c\n\x04NUMA\x18\x0b \x01(\x08\x12\x12\n\nNGPULayers\x18\x0c \x01(\x05\x12\x0f\n\x07MainGPU\x18\r \x01(\t\x12\x13\n\x0bTensorSplit\x18\x0e \x01(\t\x12\x0f\n\x07Threads\x18\x0f \x01(\x05\x12\x19\n\x11LibrarySearchPath\x18\x10 \x01(\t\x12\x14\n\x0cRopeFreqBase\x18\x11 \x01(\x02\x12\x15\n\rRopeFreqScale\x18\x12 \x01(\x02\x12\x12\n\nRMSNormEps\x18\x13 \x01(\x02\x12\x0c\n\x04NGQA\x18\x14 \x01(\x05\x12\x11\n\tModelFile\x18\x15 \x01(\t\x12\x0e\n\x06\x44\x65vice\x18\x16 \x01(\t\x12\x11\n\tUseTriton\x18\x17 \x01(\x08\x12\x15\n\rModelBaseName\x18\x18 \x01(\t\x12\x18\n\x10UseFastTokenizer\x18\x19 \x01(\x08\x12\x14\n\x0cPipelineType\x18\x1a \x01(\t\x12\x15\n\rSchedulerType\x18\x1b \x01(\t\x12\x0c\n\x04\x43UDA\x18\x1c \x01(\x08\"*\n\x06Result\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\"%\n\x0f\x45mbeddingResult\x12\x12\n\nembeddings\x18\x01 \x03(\x02\"C\n\x11TranscriptRequest\x12\x0b\n\x03\x64st\x18\x02 \x01(\t\x12\x10\n\x08language\x18\x03 \x01(\t\x12\x0f\n\x07threads\x18\x04 \x01(\r\"N\n\x10TranscriptResult\x12,\n\x08segments\x18\x01 \x03(\x0b\x32\x1a.backend.TranscriptSegment\x12\x0c\n\x04text\x18\x02 \x01(\t\"Y\n\x11TranscriptSegment\x12\n\n\x02id\x18\x01 \x01(\x05\x12\r\n\x05start\x18\x02 \x01(\x03\x12\x0b\n\x03\x65nd\x18\x03 \x01(\x03\x12\x0c\n\x04text\x18\x04 \x01(\t\x12\x0e\n\x06tokens\x18\x05 \x03(\x05\"\x9e\x01\n\x14GenerateImageRequest\x12\x0e\n\x06height\x18\x01 \x01(\x05\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x0c\n\x04mode\x18\x03 \x01(\x05\x12\x0c\n\x04step\x18\x04 \x01(\x05\x12\x0c\n\x04seed\x18\x05 \x01(\x05\x12\x17\n\x0fpositive_prompt\x18\x06 \x01(\t\x12\x17\n\x0fnegative_prompt\x18\x07 \x01(\t\x12\x0b\n\x03\x64st\x18\x08 \x01(\t\"6\n\nTTSRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05model\x18\x02 \x01(\t\x12\x0b\n\x03\x64st\x18\x03 \x01(\t2\xeb\x03\n\x07\x42\x61\x63kend\x12\x32\n\x06Health\x12\x16.backend.HealthMessage\x1a\x0e.backend.Reply\"\x00\x12\x34\n\x07Predict\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x12\x35\n\tLoadModel\x12\x15.backend.ModelOptions\x1a\x0f.backend.Result\"\x00\x12<\n\rPredictStream\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x30\x01\x12@\n\tEmbedding\x12\x17.backend.PredictOptions\x1a\x18.backend.EmbeddingResult\"\x00\x12\x41\n\rGenerateImage\x12\x1d.backend.GenerateImageRequest\x1a\x0f.backend.Result\"\x00\x12M\n\x12\x41udioTranscription\x12\x1a.backend.TranscriptRequest\x1a\x19.backend.TranscriptResult\"\x00\x12-\n\x03TTS\x12\x13.backend.TTSRequest\x1a\x0f.backend.Result\"\x00\x42Z\n\x19io.skynet.localai.backendB\x0eLocalAIBackendP\x01Z+github.com/go-skynet/LocalAI/pkg/grpc/protob\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'backend_pb2', _globals)
+if _descriptor._USE_C_DESCRIPTORS == False:
+
+  DESCRIPTOR._options = None
+  DESCRIPTOR._serialized_options = b'\n\031io.skynet.localai.backendB\016LocalAIBackendP\001Z+github.com/go-skynet/LocalAI/pkg/grpc/proto'
+  _globals['_HEALTHMESSAGE']._serialized_start=26
+  _globals['_HEALTHMESSAGE']._serialized_end=41
+  _globals['_PREDICTOPTIONS']._serialized_start=44
+  _globals['_PREDICTOPTIONS']._serialized_end=818
+  _globals['_REPLY']._serialized_start=820
+  _globals['_REPLY']._serialized_end=844
+  _globals['_MODELOPTIONS']._serialized_start=847
+  _globals['_MODELOPTIONS']._serialized_end=1388
+  _globals['_RESULT']._serialized_start=1390
+  _globals['_RESULT']._serialized_end=1432
+  _globals['_EMBEDDINGRESULT']._serialized_start=1434
+  _globals['_EMBEDDINGRESULT']._serialized_end=1471
+  _globals['_TRANSCRIPTREQUEST']._serialized_start=1473
+  _globals['_TRANSCRIPTREQUEST']._serialized_end=1540
+  _globals['_TRANSCRIPTRESULT']._serialized_start=1542
+  _globals['_TRANSCRIPTRESULT']._serialized_end=1620
+  _globals['_TRANSCRIPTSEGMENT']._serialized_start=1622
+  _globals['_TRANSCRIPTSEGMENT']._serialized_end=1711
+  _globals['_GENERATEIMAGEREQUEST']._serialized_start=1714
+  _globals['_GENERATEIMAGEREQUEST']._serialized_end=1872
+  _globals['_TTSREQUEST']._serialized_start=1874
+  _globals['_TTSREQUEST']._serialized_end=1928
+  _globals['_BACKEND']._serialized_start=1931
+  _globals['_BACKEND']._serialized_end=2422
+# @@protoc_insertion_point(module_scope)
--- a/extra/grpc/autogptq/backend_pb2_grpc.py
+++ b/extra/grpc/autogptq/backend_pb2_grpc.py
@@ -0,0 +1,297 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+
+import backend_pb2 as backend__pb2
+
+
+class BackendStub(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.Health = channel.unary_unary(
+                '/backend.Backend/Health',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Predict = channel.unary_unary(
+                '/backend.Backend/Predict',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.LoadModel = channel.unary_unary(
+                '/backend.Backend/LoadModel',
+                request_serializer=backend__pb2.ModelOptions.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.PredictStream = channel.unary_stream(
+                '/backend.Backend/PredictStream',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Embedding = channel.unary_unary(
+                '/backend.Backend/Embedding',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.EmbeddingResult.FromString,
+                )
+        self.GenerateImage = channel.unary_unary(
+                '/backend.Backend/GenerateImage',
+                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.AudioTranscription = channel.unary_unary(
+                '/backend.Backend/AudioTranscription',
+                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
+                response_deserializer=backend__pb2.TranscriptResult.FromString,
+                )
+        self.TTS = channel.unary_unary(
+                '/backend.Backend/TTS',
+                request_serializer=backend__pb2.TTSRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+
+
+class BackendServicer(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def Health(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Predict(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def LoadModel(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def PredictStream(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Embedding(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GenerateImage(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def AudioTranscription(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def TTS(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_BackendServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'Health': grpc.unary_unary_rpc_method_handler(
+                    servicer.Health,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Predict': grpc.unary_unary_rpc_method_handler(
+                    servicer.Predict,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'LoadModel': grpc.unary_unary_rpc_method_handler(
+                    servicer.LoadModel,
+                    request_deserializer=backend__pb2.ModelOptions.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'PredictStream': grpc.unary_stream_rpc_method_handler(
+                    servicer.PredictStream,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Embedding': grpc.unary_unary_rpc_method_handler(
+                    servicer.Embedding,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
+            ),
+            'GenerateImage': grpc.unary_unary_rpc_method_handler(
+                    servicer.GenerateImage,
+                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
+                    servicer.AudioTranscription,
+                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
+                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
+            ),
+            'TTS': grpc.unary_unary_rpc_method_handler(
+                    servicer.TTS,
+                    request_deserializer=backend__pb2.TTSRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'backend.Backend', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+ # This class is part of an EXPERIMENTAL API.
+class Backend(object):
+    """Missing associated documentation comment in .proto file."""
+
+    @staticmethod
+    def Health(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Predict(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def LoadModel(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
+            backend__pb2.ModelOptions.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def PredictStream(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Embedding(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.EmbeddingResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def GenerateImage(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
+            backend__pb2.GenerateImageRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def AudioTranscription(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
+            backend__pb2.TranscriptRequest.SerializeToString,
+            backend__pb2.TranscriptResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TTS(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
+            backend__pb2.TTSRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/bark/backend_pb2.py
+++ b/extra/grpc/bark/backend_pb2.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: backend.proto
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rbackend.proto\x12\x07\x62\x61\x63kend\"\x0f\n\rHealthMessage\"\x86\x06\n\x0ePredictOptions\x12\x0e\n\x06Prompt\x18\x01 \x01(\t\x12\x0c\n\x04Seed\x18\x02 \x01(\x05\x12\x0f\n\x07Threads\x18\x03 \x01(\x05\x12\x0e\n\x06Tokens\x18\x04 \x01(\x05\x12\x0c\n\x04TopK\x18\x05 \x01(\x05\x12\x0e\n\x06Repeat\x18\x06 \x01(\x05\x12\r\n\x05\x42\x61tch\x18\x07 \x01(\x05\x12\r\n\x05NKeep\x18\x08 \x01(\x05\x12\x13\n\x0bTemperature\x18\t \x01(\x02\x12\x0f\n\x07Penalty\x18\n \x01(\x02\x12\r\n\x05\x46\x31\x36KV\x18\x0b \x01(\x08\x12\x11\n\tDebugMode\x18\x0c \x01(\x08\x12\x13\n\x0bStopPrompts\x18\r \x03(\t\x12\x11\n\tIgnoreEOS\x18\x0e \x01(\x08\x12\x19\n\x11TailFreeSamplingZ\x18\x0f \x01(\x02\x12\x10\n\x08TypicalP\x18\x10 \x01(\x02\x12\x18\n\x10\x46requencyPenalty\x18\x11 \x01(\x02\x12\x17\n\x0fPresencePenalty\x18\x12 \x01(\x02\x12\x10\n\x08Mirostat\x18\x13 \x01(\x05\x12\x13\n\x0bMirostatETA\x18\x14 \x01(\x02\x12\x13\n\x0bMirostatTAU\x18\x15 \x01(\x02\x12\x12\n\nPenalizeNL\x18\x16 \x01(\x08\x12\x11\n\tLogitBias\x18\x17 \x01(\t\x12\r\n\x05MLock\x18\x19 \x01(\x08\x12\x0c\n\x04MMap\x18\x1a \x01(\x08\x12\x16\n\x0ePromptCacheAll\x18\x1b \x01(\x08\x12\x15\n\rPromptCacheRO\x18\x1c \x01(\x08\x12\x0f\n\x07Grammar\x18\x1d \x01(\t\x12\x0f\n\x07MainGPU\x18\x1e \x01(\t\x12\x13\n\x0bTensorSplit\x18\x1f \x01(\t\x12\x0c\n\x04TopP\x18  \x01(\x02\x12\x17\n\x0fPromptCachePath\x18! \x01(\t\x12\r\n\x05\x44\x65\x62ug\x18\" \x01(\x08\x12\x17\n\x0f\x45mbeddingTokens\x18# \x03(\x05\x12\x12\n\nEmbeddings\x18$ \x01(\t\x12\x14\n\x0cRopeFreqBase\x18% \x01(\x02\x12\x15\n\rRopeFreqScale\x18& \x01(\x02\x12\x1b\n\x13NegativePromptScale\x18\' \x01(\x02\x12\x16\n\x0eNegativePrompt\x18( \x01(\t\"\x18\n\x05Reply\x12\x0f\n\x07message\x18\x01 \x01(\x0c\"\x9d\x04\n\x0cModelOptions\x12\r\n\x05Model\x18\x01 \x01(\t\x12\x13\n\x0b\x43ontextSize\x18\x02 \x01(\x05\x12\x0c\n\x04Seed\x18\x03 \x01(\x05\x12\x0e\n\x06NBatch\x18\x04 \x01(\x05\x12\x11\n\tF16Memory\x18\x05 \x01(\x08\x12\r\n\x05MLock\x18\x06 \x01(\x08\x12\x0c\n\x04MMap\x18\x07 \x01(\x08\x12\x11\n\tVocabOnly\x18\x08 \x01(\x08\x12\x0f\n\x07LowVRAM\x18\t \x01(\x08\x12\x12\n\nEmbeddings\x18\n \x01(\x08\x12\x0c\n\x04NUMA\x18\x0b \x01(\x08\x12\x12\n\nNGPULayers\x18\x0c \x01(\x05\x12\x0f\n\x07MainGPU\x18\r \x01(\t\x12\x13\n\x0bTensorSplit\x18\x0e \x01(\t\x12\x0f\n\x07Threads\x18\x0f \x01(\x05\x12\x19\n\x11LibrarySearchPath\x18\x10 \x01(\t\x12\x14\n\x0cRopeFreqBase\x18\x11 \x01(\x02\x12\x15\n\rRopeFreqScale\x18\x12 \x01(\x02\x12\x12\n\nRMSNormEps\x18\x13 \x01(\x02\x12\x0c\n\x04NGQA\x18\x14 \x01(\x05\x12\x11\n\tModelFile\x18\x15 \x01(\t\x12\x0e\n\x06\x44\x65vice\x18\x16 \x01(\t\x12\x11\n\tUseTriton\x18\x17 \x01(\x08\x12\x15\n\rModelBaseName\x18\x18 \x01(\t\x12\x18\n\x10UseFastTokenizer\x18\x19 \x01(\x08\x12\x14\n\x0cPipelineType\x18\x1a \x01(\t\x12\x15\n\rSchedulerType\x18\x1b \x01(\t\x12\x0c\n\x04\x43UDA\x18\x1c \x01(\x08\"*\n\x06Result\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\"%\n\x0f\x45mbeddingResult\x12\x12\n\nembeddings\x18\x01 \x03(\x02\"C\n\x11TranscriptRequest\x12\x0b\n\x03\x64st\x18\x02 \x01(\t\x12\x10\n\x08language\x18\x03 \x01(\t\x12\x0f\n\x07threads\x18\x04 \x01(\r\"N\n\x10TranscriptResult\x12,\n\x08segments\x18\x01 \x03(\x0b\x32\x1a.backend.TranscriptSegment\x12\x0c\n\x04text\x18\x02 \x01(\t\"Y\n\x11TranscriptSegment\x12\n\n\x02id\x18\x01 \x01(\x05\x12\r\n\x05start\x18\x02 \x01(\x03\x12\x0b\n\x03\x65nd\x18\x03 \x01(\x03\x12\x0c\n\x04text\x18\x04 \x01(\t\x12\x0e\n\x06tokens\x18\x05 \x03(\x05\"\x9e\x01\n\x14GenerateImageRequest\x12\x0e\n\x06height\x18\x01 \x01(\x05\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x0c\n\x04mode\x18\x03 \x01(\x05\x12\x0c\n\x04step\x18\x04 \x01(\x05\x12\x0c\n\x04seed\x18\x05 \x01(\x05\x12\x17\n\x0fpositive_prompt\x18\x06 \x01(\t\x12\x17\n\x0fnegative_prompt\x18\x07 \x01(\t\x12\x0b\n\x03\x64st\x18\x08 \x01(\t\"6\n\nTTSRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05model\x18\x02 \x01(\t\x12\x0b\n\x03\x64st\x18\x03 \x01(\t2\xeb\x03\n\x07\x42\x61\x63kend\x12\x32\n\x06Health\x12\x16.backend.HealthMessage\x1a\x0e.backend.Reply\"\x00\x12\x34\n\x07Predict\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x12\x35\n\tLoadModel\x12\x15.backend.ModelOptions\x1a\x0f.backend.Result\"\x00\x12<\n\rPredictStream\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x30\x01\x12@\n\tEmbedding\x12\x17.backend.PredictOptions\x1a\x18.backend.EmbeddingResult\"\x00\x12\x41\n\rGenerateImage\x12\x1d.backend.GenerateImageRequest\x1a\x0f.backend.Result\"\x00\x12M\n\x12\x41udioTranscription\x12\x1a.backend.TranscriptRequest\x1a\x19.backend.TranscriptResult\"\x00\x12-\n\x03TTS\x12\x13.backend.TTSRequest\x1a\x0f.backend.Result\"\x00\x42Z\n\x19io.skynet.localai.backendB\x0eLocalAIBackendP\x01Z+github.com/go-skynet/LocalAI/pkg/grpc/protob\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'backend_pb2', _globals)
+if _descriptor._USE_C_DESCRIPTORS == False:
+
+  DESCRIPTOR._options = None
+  DESCRIPTOR._serialized_options = b'\n\031io.skynet.localai.backendB\016LocalAIBackendP\001Z+github.com/go-skynet/LocalAI/pkg/grpc/proto'
+  _globals['_HEALTHMESSAGE']._serialized_start=26
+  _globals['_HEALTHMESSAGE']._serialized_end=41
+  _globals['_PREDICTOPTIONS']._serialized_start=44
+  _globals['_PREDICTOPTIONS']._serialized_end=818
+  _globals['_REPLY']._serialized_start=820
+  _globals['_REPLY']._serialized_end=844
+  _globals['_MODELOPTIONS']._serialized_start=847
+  _globals['_MODELOPTIONS']._serialized_end=1388
+  _globals['_RESULT']._serialized_start=1390
+  _globals['_RESULT']._serialized_end=1432
+  _globals['_EMBEDDINGRESULT']._serialized_start=1434
+  _globals['_EMBEDDINGRESULT']._serialized_end=1471
+  _globals['_TRANSCRIPTREQUEST']._serialized_start=1473
+  _globals['_TRANSCRIPTREQUEST']._serialized_end=1540
+  _globals['_TRANSCRIPTRESULT']._serialized_start=1542
+  _globals['_TRANSCRIPTRESULT']._serialized_end=1620
+  _globals['_TRANSCRIPTSEGMENT']._serialized_start=1622
+  _globals['_TRANSCRIPTSEGMENT']._serialized_end=1711
+  _globals['_GENERATEIMAGEREQUEST']._serialized_start=1714
+  _globals['_GENERATEIMAGEREQUEST']._serialized_end=1872
+  _globals['_TTSREQUEST']._serialized_start=1874
+  _globals['_TTSREQUEST']._serialized_end=1928
+  _globals['_BACKEND']._serialized_start=1931
+  _globals['_BACKEND']._serialized_end=2422
+# @@protoc_insertion_point(module_scope)
--- a/extra/grpc/bark/backend_pb2_grpc.py
+++ b/extra/grpc/bark/backend_pb2_grpc.py
@@ -0,0 +1,297 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+
+import backend_pb2 as backend__pb2
+
+
+class BackendStub(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.Health = channel.unary_unary(
+                '/backend.Backend/Health',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Predict = channel.unary_unary(
+                '/backend.Backend/Predict',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.LoadModel = channel.unary_unary(
+                '/backend.Backend/LoadModel',
+                request_serializer=backend__pb2.ModelOptions.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.PredictStream = channel.unary_stream(
+                '/backend.Backend/PredictStream',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Embedding = channel.unary_unary(
+                '/backend.Backend/Embedding',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.EmbeddingResult.FromString,
+                )
+        self.GenerateImage = channel.unary_unary(
+                '/backend.Backend/GenerateImage',
+                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.AudioTranscription = channel.unary_unary(
+                '/backend.Backend/AudioTranscription',
+                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
+                response_deserializer=backend__pb2.TranscriptResult.FromString,
+                )
+        self.TTS = channel.unary_unary(
+                '/backend.Backend/TTS',
+                request_serializer=backend__pb2.TTSRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+
+
+class BackendServicer(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def Health(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Predict(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def LoadModel(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def PredictStream(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Embedding(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GenerateImage(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def AudioTranscription(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def TTS(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_BackendServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'Health': grpc.unary_unary_rpc_method_handler(
+                    servicer.Health,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Predict': grpc.unary_unary_rpc_method_handler(
+                    servicer.Predict,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'LoadModel': grpc.unary_unary_rpc_method_handler(
+                    servicer.LoadModel,
+                    request_deserializer=backend__pb2.ModelOptions.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'PredictStream': grpc.unary_stream_rpc_method_handler(
+                    servicer.PredictStream,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Embedding': grpc.unary_unary_rpc_method_handler(
+                    servicer.Embedding,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
+            ),
+            'GenerateImage': grpc.unary_unary_rpc_method_handler(
+                    servicer.GenerateImage,
+                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
+                    servicer.AudioTranscription,
+                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
+                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
+            ),
+            'TTS': grpc.unary_unary_rpc_method_handler(
+                    servicer.TTS,
+                    request_deserializer=backend__pb2.TTSRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'backend.Backend', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+ # This class is part of an EXPERIMENTAL API.
+class Backend(object):
+    """Missing associated documentation comment in .proto file."""
+
+    @staticmethod
+    def Health(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Predict(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def LoadModel(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
+            backend__pb2.ModelOptions.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def PredictStream(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Embedding(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.EmbeddingResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def GenerateImage(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
+            backend__pb2.GenerateImageRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def AudioTranscription(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
+            backend__pb2.TranscriptRequest.SerializeToString,
+            backend__pb2.TranscriptResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TTS(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
+            backend__pb2.TTSRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/bark/ttsbark.py
+++ b/extra/grpc/bark/ttsbark.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+import grpc
+from concurrent import futures
+import time
+import backend_pb2
+import backend_pb2_grpc
+import argparse
+import signal
+import sys
+import os
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+from pathlib import Path
+from bark import SAMPLE_RATE, generate_audio, preload_models
+from scipy.io.wavfile import write as write_wav
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    def Health(self, request, context):
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+    def LoadModel(self, request, context):
+        model_name = request.Model
+        try:
+            print("Preparing models, please wait", file=sys.stderr)
+            # download and load all models
+            preload_models()
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        # Implement your logic here for the LoadModel service
+        # Replace this with your desired response
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def TTS(self, request, context):
+        model = request.model
+        print(request, file=sys.stderr)
+        try:
+            audio_array = None
+            if model != "":
+                audio_array = generate_audio(request.text, history_prompt=model)
+            else:
+                audio_array = generate_audio(request.text)
+            print("saving to", request.dst, file=sys.stderr)
+            # save audio to disk
+            write_wav(request.dst, SAMPLE_RATE, audio_array)
+            print("saved to", request.dst, file=sys.stderr)
+            print("tts for", file=sys.stderr)
+            print(request, file=sys.stderr)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
--- a/extra/grpc/diffusers/backend_diffusers.py
+++ b/extra/grpc/diffusers/backend_diffusers.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+import grpc
+from concurrent import futures
+import time
+import backend_pb2
+import backend_pb2_grpc
+import argparse
+import signal
+import sys
+import os
+
+# import diffusers
+import torch
+from torch import autocast
+from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    def Health(self, request, context):
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+    def LoadModel(self, request, context):
+        try:
+            print(f"Loading model {request.Model}...", file=sys.stderr)
+            print(f"Request {request}", file=sys.stderr)
+            torchType = torch.float32
+            if request.F16Memory:
+                torchType = torch.float16
+
+            if request.PipelineType == "":
+                request.PipelineType == "StableDiffusionPipeline"
+
+            if request.PipelineType == "StableDiffusionPipeline":
+                self.pipe = StableDiffusionPipeline.from_pretrained(request.Model,
+                                                               torch_dtype=torchType)
+
+            if request.PipelineType == "DiffusionPipeline":
+                self.pipe = DiffusionPipeline.from_pretrained(request.Model,
+                                                               torch_dtype=torchType)
+
+            if request.PipelineType == "StableDiffusionXLPipeline":
+                self.pipe = StableDiffusionXLPipeline.from_pretrained(
+                    request.Model, 
+                    torch_dtype=torchType, 
+                    use_safetensors=True, 
+             #       variant="fp16"
+                    )
+          
+            # torch_dtype needs to be customized. float16 for GPU, float32 for CPU
+            # TODO: this needs to be customized
+            if request.SchedulerType == "EulerAncestralDiscreteScheduler":
+                self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
+            if request.SchedulerType == "DPMSolverMultistepScheduler":
+                self.pipe.scheduler = DPMSolverMultistepScheduler.from_config(self.pipe.scheduler.config)
+
+            if request.CUDA:
+                self.pipe.to('cuda')
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        # Implement your logic here for the LoadModel service
+        # Replace this with your desired response
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+    def GenerateImage(self, request, context):
+
+        prompt = request.positive_prompt
+        negative_prompt = request.negative_prompt
+
+        image = self.pipe(
+            prompt, 
+            negative_prompt=negative_prompt, 
+            width=request.width,
+            height=request.height,
+      #      guidance_scale=12,
+            target_size=(request.width,request.height),
+            original_size=(4096,4096),
+            num_inference_steps=request.step
+            ).images[0]
+
+        image.save(request.dst)
+
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
--- a/extra/grpc/diffusers/backend_pb2.py
+++ b/extra/grpc/diffusers/backend_pb2.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: backend.proto
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rbackend.proto\x12\x07\x62\x61\x63kend\"\x0f\n\rHealthMessage\"\x86\x06\n\x0ePredictOptions\x12\x0e\n\x06Prompt\x18\x01 \x01(\t\x12\x0c\n\x04Seed\x18\x02 \x01(\x05\x12\x0f\n\x07Threads\x18\x03 \x01(\x05\x12\x0e\n\x06Tokens\x18\x04 \x01(\x05\x12\x0c\n\x04TopK\x18\x05 \x01(\x05\x12\x0e\n\x06Repeat\x18\x06 \x01(\x05\x12\r\n\x05\x42\x61tch\x18\x07 \x01(\x05\x12\r\n\x05NKeep\x18\x08 \x01(\x05\x12\x13\n\x0bTemperature\x18\t \x01(\x02\x12\x0f\n\x07Penalty\x18\n \x01(\x02\x12\r\n\x05\x46\x31\x36KV\x18\x0b \x01(\x08\x12\x11\n\tDebugMode\x18\x0c \x01(\x08\x12\x13\n\x0bStopPrompts\x18\r \x03(\t\x12\x11\n\tIgnoreEOS\x18\x0e \x01(\x08\x12\x19\n\x11TailFreeSamplingZ\x18\x0f \x01(\x02\x12\x10\n\x08TypicalP\x18\x10 \x01(\x02\x12\x18\n\x10\x46requencyPenalty\x18\x11 \x01(\x02\x12\x17\n\x0fPresencePenalty\x18\x12 \x01(\x02\x12\x10\n\x08Mirostat\x18\x13 \x01(\x05\x12\x13\n\x0bMirostatETA\x18\x14 \x01(\x02\x12\x13\n\x0bMirostatTAU\x18\x15 \x01(\x02\x12\x12\n\nPenalizeNL\x18\x16 \x01(\x08\x12\x11\n\tLogitBias\x18\x17 \x01(\t\x12\r\n\x05MLock\x18\x19 \x01(\x08\x12\x0c\n\x04MMap\x18\x1a \x01(\x08\x12\x16\n\x0ePromptCacheAll\x18\x1b \x01(\x08\x12\x15\n\rPromptCacheRO\x18\x1c \x01(\x08\x12\x0f\n\x07Grammar\x18\x1d \x01(\t\x12\x0f\n\x07MainGPU\x18\x1e \x01(\t\x12\x13\n\x0bTensorSplit\x18\x1f \x01(\t\x12\x0c\n\x04TopP\x18  \x01(\x02\x12\x17\n\x0fPromptCachePath\x18! \x01(\t\x12\r\n\x05\x44\x65\x62ug\x18\" \x01(\x08\x12\x17\n\x0f\x45mbeddingTokens\x18# \x03(\x05\x12\x12\n\nEmbeddings\x18$ \x01(\t\x12\x14\n\x0cRopeFreqBase\x18% \x01(\x02\x12\x15\n\rRopeFreqScale\x18& \x01(\x02\x12\x1b\n\x13NegativePromptScale\x18\' \x01(\x02\x12\x16\n\x0eNegativePrompt\x18( \x01(\t\"\x18\n\x05Reply\x12\x0f\n\x07message\x18\x01 \x01(\x0c\"\x9d\x04\n\x0cModelOptions\x12\r\n\x05Model\x18\x01 \x01(\t\x12\x13\n\x0b\x43ontextSize\x18\x02 \x01(\x05\x12\x0c\n\x04Seed\x18\x03 \x01(\x05\x12\x0e\n\x06NBatch\x18\x04 \x01(\x05\x12\x11\n\tF16Memory\x18\x05 \x01(\x08\x12\r\n\x05MLock\x18\x06 \x01(\x08\x12\x0c\n\x04MMap\x18\x07 \x01(\x08\x12\x11\n\tVocabOnly\x18\x08 \x01(\x08\x12\x0f\n\x07LowVRAM\x18\t \x01(\x08\x12\x12\n\nEmbeddings\x18\n \x01(\x08\x12\x0c\n\x04NUMA\x18\x0b \x01(\x08\x12\x12\n\nNGPULayers\x18\x0c \x01(\x05\x12\x0f\n\x07MainGPU\x18\r \x01(\t\x12\x13\n\x0bTensorSplit\x18\x0e \x01(\t\x12\x0f\n\x07Threads\x18\x0f \x01(\x05\x12\x19\n\x11LibrarySearchPath\x18\x10 \x01(\t\x12\x14\n\x0cRopeFreqBase\x18\x11 \x01(\x02\x12\x15\n\rRopeFreqScale\x18\x12 \x01(\x02\x12\x12\n\nRMSNormEps\x18\x13 \x01(\x02\x12\x0c\n\x04NGQA\x18\x14 \x01(\x05\x12\x11\n\tModelFile\x18\x15 \x01(\t\x12\x0e\n\x06\x44\x65vice\x18\x16 \x01(\t\x12\x11\n\tUseTriton\x18\x17 \x01(\x08\x12\x15\n\rModelBaseName\x18\x18 \x01(\t\x12\x18\n\x10UseFastTokenizer\x18\x19 \x01(\x08\x12\x14\n\x0cPipelineType\x18\x1a \x01(\t\x12\x15\n\rSchedulerType\x18\x1b \x01(\t\x12\x0c\n\x04\x43UDA\x18\x1c \x01(\x08\"*\n\x06Result\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\"%\n\x0f\x45mbeddingResult\x12\x12\n\nembeddings\x18\x01 \x03(\x02\"C\n\x11TranscriptRequest\x12\x0b\n\x03\x64st\x18\x02 \x01(\t\x12\x10\n\x08language\x18\x03 \x01(\t\x12\x0f\n\x07threads\x18\x04 \x01(\r\"N\n\x10TranscriptResult\x12,\n\x08segments\x18\x01 \x03(\x0b\x32\x1a.backend.TranscriptSegment\x12\x0c\n\x04text\x18\x02 \x01(\t\"Y\n\x11TranscriptSegment\x12\n\n\x02id\x18\x01 \x01(\x05\x12\r\n\x05start\x18\x02 \x01(\x03\x12\x0b\n\x03\x65nd\x18\x03 \x01(\x03\x12\x0c\n\x04text\x18\x04 \x01(\t\x12\x0e\n\x06tokens\x18\x05 \x03(\x05\"\x9e\x01\n\x14GenerateImageRequest\x12\x0e\n\x06height\x18\x01 \x01(\x05\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x0c\n\x04mode\x18\x03 \x01(\x05\x12\x0c\n\x04step\x18\x04 \x01(\x05\x12\x0c\n\x04seed\x18\x05 \x01(\x05\x12\x17\n\x0fpositive_prompt\x18\x06 \x01(\t\x12\x17\n\x0fnegative_prompt\x18\x07 \x01(\t\x12\x0b\n\x03\x64st\x18\x08 \x01(\t\"6\n\nTTSRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05model\x18\x02 \x01(\t\x12\x0b\n\x03\x64st\x18\x03 \x01(\t2\xeb\x03\n\x07\x42\x61\x63kend\x12\x32\n\x06Health\x12\x16.backend.HealthMessage\x1a\x0e.backend.Reply\"\x00\x12\x34\n\x07Predict\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x12\x35\n\tLoadModel\x12\x15.backend.ModelOptions\x1a\x0f.backend.Result\"\x00\x12<\n\rPredictStream\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x30\x01\x12@\n\tEmbedding\x12\x17.backend.PredictOptions\x1a\x18.backend.EmbeddingResult\"\x00\x12\x41\n\rGenerateImage\x12\x1d.backend.GenerateImageRequest\x1a\x0f.backend.Result\"\x00\x12M\n\x12\x41udioTranscription\x12\x1a.backend.TranscriptRequest\x1a\x19.backend.TranscriptResult\"\x00\x12-\n\x03TTS\x12\x13.backend.TTSRequest\x1a\x0f.backend.Result\"\x00\x42Z\n\x19io.skynet.localai.backendB\x0eLocalAIBackendP\x01Z+github.com/go-skynet/LocalAI/pkg/grpc/protob\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'backend_pb2', _globals)
+if _descriptor._USE_C_DESCRIPTORS == False:
+
+  DESCRIPTOR._options = None
+  DESCRIPTOR._serialized_options = b'\n\031io.skynet.localai.backendB\016LocalAIBackendP\001Z+github.com/go-skynet/LocalAI/pkg/grpc/proto'
+  _globals['_HEALTHMESSAGE']._serialized_start=26
+  _globals['_HEALTHMESSAGE']._serialized_end=41
+  _globals['_PREDICTOPTIONS']._serialized_start=44
+  _globals['_PREDICTOPTIONS']._serialized_end=818
+  _globals['_REPLY']._serialized_start=820
+  _globals['_REPLY']._serialized_end=844
+  _globals['_MODELOPTIONS']._serialized_start=847
+  _globals['_MODELOPTIONS']._serialized_end=1388
+  _globals['_RESULT']._serialized_start=1390
+  _globals['_RESULT']._serialized_end=1432
+  _globals['_EMBEDDINGRESULT']._serialized_start=1434
+  _globals['_EMBEDDINGRESULT']._serialized_end=1471
+  _globals['_TRANSCRIPTREQUEST']._serialized_start=1473
+  _globals['_TRANSCRIPTREQUEST']._serialized_end=1540
+  _globals['_TRANSCRIPTRESULT']._serialized_start=1542
+  _globals['_TRANSCRIPTRESULT']._serialized_end=1620
+  _globals['_TRANSCRIPTSEGMENT']._serialized_start=1622
+  _globals['_TRANSCRIPTSEGMENT']._serialized_end=1711
+  _globals['_GENERATEIMAGEREQUEST']._serialized_start=1714
+  _globals['_GENERATEIMAGEREQUEST']._serialized_end=1872
+  _globals['_TTSREQUEST']._serialized_start=1874
+  _globals['_TTSREQUEST']._serialized_end=1928
+  _globals['_BACKEND']._serialized_start=1931
+  _globals['_BACKEND']._serialized_end=2422
+# @@protoc_insertion_point(module_scope)
--- a/extra/grpc/diffusers/backend_pb2_grpc.py
+++ b/extra/grpc/diffusers/backend_pb2_grpc.py
@@ -0,0 +1,297 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+
+import backend_pb2 as backend__pb2
+
+
+class BackendStub(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.Health = channel.unary_unary(
+                '/backend.Backend/Health',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Predict = channel.unary_unary(
+                '/backend.Backend/Predict',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.LoadModel = channel.unary_unary(
+                '/backend.Backend/LoadModel',
+                request_serializer=backend__pb2.ModelOptions.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.PredictStream = channel.unary_stream(
+                '/backend.Backend/PredictStream',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Embedding = channel.unary_unary(
+                '/backend.Backend/Embedding',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.EmbeddingResult.FromString,
+                )
+        self.GenerateImage = channel.unary_unary(
+                '/backend.Backend/GenerateImage',
+                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.AudioTranscription = channel.unary_unary(
+                '/backend.Backend/AudioTranscription',
+                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
+                response_deserializer=backend__pb2.TranscriptResult.FromString,
+                )
+        self.TTS = channel.unary_unary(
+                '/backend.Backend/TTS',
+                request_serializer=backend__pb2.TTSRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+
+
+class BackendServicer(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def Health(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Predict(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def LoadModel(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def PredictStream(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Embedding(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GenerateImage(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def AudioTranscription(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def TTS(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_BackendServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'Health': grpc.unary_unary_rpc_method_handler(
+                    servicer.Health,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Predict': grpc.unary_unary_rpc_method_handler(
+                    servicer.Predict,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'LoadModel': grpc.unary_unary_rpc_method_handler(
+                    servicer.LoadModel,
+                    request_deserializer=backend__pb2.ModelOptions.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'PredictStream': grpc.unary_stream_rpc_method_handler(
+                    servicer.PredictStream,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Embedding': grpc.unary_unary_rpc_method_handler(
+                    servicer.Embedding,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
+            ),
+            'GenerateImage': grpc.unary_unary_rpc_method_handler(
+                    servicer.GenerateImage,
+                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
+                    servicer.AudioTranscription,
+                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
+                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
+            ),
+            'TTS': grpc.unary_unary_rpc_method_handler(
+                    servicer.TTS,
+                    request_deserializer=backend__pb2.TTSRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'backend.Backend', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+ # This class is part of an EXPERIMENTAL API.
+class Backend(object):
+    """Missing associated documentation comment in .proto file."""
+
+    @staticmethod
+    def Health(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Predict(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def LoadModel(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
+            backend__pb2.ModelOptions.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def PredictStream(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Embedding(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.EmbeddingResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def GenerateImage(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
+            backend__pb2.GenerateImageRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def AudioTranscription(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
+            backend__pb2.TranscriptRequest.SerializeToString,
+            backend__pb2.TranscriptResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TTS(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
+            backend__pb2.TTSRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/exllama/backend_pb2.py
+++ b/extra/grpc/exllama/backend_pb2.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# source: backend.proto
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rbackend.proto\x12\x07\x62\x61\x63kend\"\x0f\n\rHealthMessage\"\x86\x06\n\x0ePredictOptions\x12\x0e\n\x06Prompt\x18\x01 \x01(\t\x12\x0c\n\x04Seed\x18\x02 \x01(\x05\x12\x0f\n\x07Threads\x18\x03 \x01(\x05\x12\x0e\n\x06Tokens\x18\x04 \x01(\x05\x12\x0c\n\x04TopK\x18\x05 \x01(\x05\x12\x0e\n\x06Repeat\x18\x06 \x01(\x05\x12\r\n\x05\x42\x61tch\x18\x07 \x01(\x05\x12\r\n\x05NKeep\x18\x08 \x01(\x05\x12\x13\n\x0bTemperature\x18\t \x01(\x02\x12\x0f\n\x07Penalty\x18\n \x01(\x02\x12\r\n\x05\x46\x31\x36KV\x18\x0b \x01(\x08\x12\x11\n\tDebugMode\x18\x0c \x01(\x08\x12\x13\n\x0bStopPrompts\x18\r \x03(\t\x12\x11\n\tIgnoreEOS\x18\x0e \x01(\x08\x12\x19\n\x11TailFreeSamplingZ\x18\x0f \x01(\x02\x12\x10\n\x08TypicalP\x18\x10 \x01(\x02\x12\x18\n\x10\x46requencyPenalty\x18\x11 \x01(\x02\x12\x17\n\x0fPresencePenalty\x18\x12 \x01(\x02\x12\x10\n\x08Mirostat\x18\x13 \x01(\x05\x12\x13\n\x0bMirostatETA\x18\x14 \x01(\x02\x12\x13\n\x0bMirostatTAU\x18\x15 \x01(\x02\x12\x12\n\nPenalizeNL\x18\x16 \x01(\x08\x12\x11\n\tLogitBias\x18\x17 \x01(\t\x12\r\n\x05MLock\x18\x19 \x01(\x08\x12\x0c\n\x04MMap\x18\x1a \x01(\x08\x12\x16\n\x0ePromptCacheAll\x18\x1b \x01(\x08\x12\x15\n\rPromptCacheRO\x18\x1c \x01(\x08\x12\x0f\n\x07Grammar\x18\x1d \x01(\t\x12\x0f\n\x07MainGPU\x18\x1e \x01(\t\x12\x13\n\x0bTensorSplit\x18\x1f \x01(\t\x12\x0c\n\x04TopP\x18  \x01(\x02\x12\x17\n\x0fPromptCachePath\x18! \x01(\t\x12\r\n\x05\x44\x65\x62ug\x18\" \x01(\x08\x12\x17\n\x0f\x45mbeddingTokens\x18# \x03(\x05\x12\x12\n\nEmbeddings\x18$ \x01(\t\x12\x14\n\x0cRopeFreqBase\x18% \x01(\x02\x12\x15\n\rRopeFreqScale\x18& \x01(\x02\x12\x1b\n\x13NegativePromptScale\x18\' \x01(\x02\x12\x16\n\x0eNegativePrompt\x18( \x01(\t\"\x18\n\x05Reply\x12\x0f\n\x07message\x18\x01 \x01(\x0c\"\x9d\x04\n\x0cModelOptions\x12\r\n\x05Model\x18\x01 \x01(\t\x12\x13\n\x0b\x43ontextSize\x18\x02 \x01(\x05\x12\x0c\n\x04Seed\x18\x03 \x01(\x05\x12\x0e\n\x06NBatch\x18\x04 \x01(\x05\x12\x11\n\tF16Memory\x18\x05 \x01(\x08\x12\r\n\x05MLock\x18\x06 \x01(\x08\x12\x0c\n\x04MMap\x18\x07 \x01(\x08\x12\x11\n\tVocabOnly\x18\x08 \x01(\x08\x12\x0f\n\x07LowVRAM\x18\t \x01(\x08\x12\x12\n\nEmbeddings\x18\n \x01(\x08\x12\x0c\n\x04NUMA\x18\x0b \x01(\x08\x12\x12\n\nNGPULayers\x18\x0c \x01(\x05\x12\x0f\n\x07MainGPU\x18\r \x01(\t\x12\x13\n\x0bTensorSplit\x18\x0e \x01(\t\x12\x0f\n\x07Threads\x18\x0f \x01(\x05\x12\x19\n\x11LibrarySearchPath\x18\x10 \x01(\t\x12\x14\n\x0cRopeFreqBase\x18\x11 \x01(\x02\x12\x15\n\rRopeFreqScale\x18\x12 \x01(\x02\x12\x12\n\nRMSNormEps\x18\x13 \x01(\x02\x12\x0c\n\x04NGQA\x18\x14 \x01(\x05\x12\x11\n\tModelFile\x18\x15 \x01(\t\x12\x0e\n\x06\x44\x65vice\x18\x16 \x01(\t\x12\x11\n\tUseTriton\x18\x17 \x01(\x08\x12\x15\n\rModelBaseName\x18\x18 \x01(\t\x12\x18\n\x10UseFastTokenizer\x18\x19 \x01(\x08\x12\x14\n\x0cPipelineType\x18\x1a \x01(\t\x12\x15\n\rSchedulerType\x18\x1b \x01(\t\x12\x0c\n\x04\x43UDA\x18\x1c \x01(\x08\"*\n\x06Result\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\"%\n\x0f\x45mbeddingResult\x12\x12\n\nembeddings\x18\x01 \x03(\x02\"C\n\x11TranscriptRequest\x12\x0b\n\x03\x64st\x18\x02 \x01(\t\x12\x10\n\x08language\x18\x03 \x01(\t\x12\x0f\n\x07threads\x18\x04 \x01(\r\"N\n\x10TranscriptResult\x12,\n\x08segments\x18\x01 \x03(\x0b\x32\x1a.backend.TranscriptSegment\x12\x0c\n\x04text\x18\x02 \x01(\t\"Y\n\x11TranscriptSegment\x12\n\n\x02id\x18\x01 \x01(\x05\x12\r\n\x05start\x18\x02 \x01(\x03\x12\x0b\n\x03\x65nd\x18\x03 \x01(\x03\x12\x0c\n\x04text\x18\x04 \x01(\t\x12\x0e\n\x06tokens\x18\x05 \x03(\x05\"\x9e\x01\n\x14GenerateImageRequest\x12\x0e\n\x06height\x18\x01 \x01(\x05\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x0c\n\x04mode\x18\x03 \x01(\x05\x12\x0c\n\x04step\x18\x04 \x01(\x05\x12\x0c\n\x04seed\x18\x05 \x01(\x05\x12\x17\n\x0fpositive_prompt\x18\x06 \x01(\t\x12\x17\n\x0fnegative_prompt\x18\x07 \x01(\t\x12\x0b\n\x03\x64st\x18\x08 \x01(\t\"6\n\nTTSRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05model\x18\x02 \x01(\t\x12\x0b\n\x03\x64st\x18\x03 \x01(\t2\xeb\x03\n\x07\x42\x61\x63kend\x12\x32\n\x06Health\x12\x16.backend.HealthMessage\x1a\x0e.backend.Reply\"\x00\x12\x34\n\x07Predict\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x12\x35\n\tLoadModel\x12\x15.backend.ModelOptions\x1a\x0f.backend.Result\"\x00\x12<\n\rPredictStream\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x30\x01\x12@\n\tEmbedding\x12\x17.backend.PredictOptions\x1a\x18.backend.EmbeddingResult\"\x00\x12\x41\n\rGenerateImage\x12\x1d.backend.GenerateImageRequest\x1a\x0f.backend.Result\"\x00\x12M\n\x12\x41udioTranscription\x12\x1a.backend.TranscriptRequest\x1a\x19.backend.TranscriptResult\"\x00\x12-\n\x03TTS\x12\x13.backend.TTSRequest\x1a\x0f.backend.Result\"\x00\x42Z\n\x19io.skynet.localai.backendB\x0eLocalAIBackendP\x01Z+github.com/go-skynet/LocalAI/pkg/grpc/protob\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'backend_pb2', _globals)
+if _descriptor._USE_C_DESCRIPTORS == False:
+
+  DESCRIPTOR._options = None
+  DESCRIPTOR._serialized_options = b'\n\031io.skynet.localai.backendB\016LocalAIBackendP\001Z+github.com/go-skynet/LocalAI/pkg/grpc/proto'
+  _globals['_HEALTHMESSAGE']._serialized_start=26
+  _globals['_HEALTHMESSAGE']._serialized_end=41
+  _globals['_PREDICTOPTIONS']._serialized_start=44
+  _globals['_PREDICTOPTIONS']._serialized_end=818
+  _globals['_REPLY']._serialized_start=820
+  _globals['_REPLY']._serialized_end=844
+  _globals['_MODELOPTIONS']._serialized_start=847
+  _globals['_MODELOPTIONS']._serialized_end=1388
+  _globals['_RESULT']._serialized_start=1390
+  _globals['_RESULT']._serialized_end=1432
+  _globals['_EMBEDDINGRESULT']._serialized_start=1434
+  _globals['_EMBEDDINGRESULT']._serialized_end=1471
+  _globals['_TRANSCRIPTREQUEST']._serialized_start=1473
+  _globals['_TRANSCRIPTREQUEST']._serialized_end=1540
+  _globals['_TRANSCRIPTRESULT']._serialized_start=1542
+  _globals['_TRANSCRIPTRESULT']._serialized_end=1620
+  _globals['_TRANSCRIPTSEGMENT']._serialized_start=1622
+  _globals['_TRANSCRIPTSEGMENT']._serialized_end=1711
+  _globals['_GENERATEIMAGEREQUEST']._serialized_start=1714
+  _globals['_GENERATEIMAGEREQUEST']._serialized_end=1872
+  _globals['_TTSREQUEST']._serialized_start=1874
+  _globals['_TTSREQUEST']._serialized_end=1928
+  _globals['_BACKEND']._serialized_start=1931
+  _globals['_BACKEND']._serialized_end=2422
+# @@protoc_insertion_point(module_scope)
--- a/extra/grpc/exllama/backend_pb2_grpc.py
+++ b/extra/grpc/exllama/backend_pb2_grpc.py
@@ -0,0 +1,297 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+
+import backend_pb2 as backend__pb2
+
+
+class BackendStub(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def __init__(self, channel):
+        """Constructor.
+
+        Args:
+            channel: A grpc.Channel.
+        """
+        self.Health = channel.unary_unary(
+                '/backend.Backend/Health',
+                request_serializer=backend__pb2.HealthMessage.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Predict = channel.unary_unary(
+                '/backend.Backend/Predict',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.LoadModel = channel.unary_unary(
+                '/backend.Backend/LoadModel',
+                request_serializer=backend__pb2.ModelOptions.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.PredictStream = channel.unary_stream(
+                '/backend.Backend/PredictStream',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.Reply.FromString,
+                )
+        self.Embedding = channel.unary_unary(
+                '/backend.Backend/Embedding',
+                request_serializer=backend__pb2.PredictOptions.SerializeToString,
+                response_deserializer=backend__pb2.EmbeddingResult.FromString,
+                )
+        self.GenerateImage = channel.unary_unary(
+                '/backend.Backend/GenerateImage',
+                request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+        self.AudioTranscription = channel.unary_unary(
+                '/backend.Backend/AudioTranscription',
+                request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
+                response_deserializer=backend__pb2.TranscriptResult.FromString,
+                )
+        self.TTS = channel.unary_unary(
+                '/backend.Backend/TTS',
+                request_serializer=backend__pb2.TTSRequest.SerializeToString,
+                response_deserializer=backend__pb2.Result.FromString,
+                )
+
+
+class BackendServicer(object):
+    """Missing associated documentation comment in .proto file."""
+
+    def Health(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Predict(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def LoadModel(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def PredictStream(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def Embedding(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def GenerateImage(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def AudioTranscription(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+    def TTS(self, request, context):
+        """Missing associated documentation comment in .proto file."""
+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+        context.set_details('Method not implemented!')
+        raise NotImplementedError('Method not implemented!')
+
+
+def add_BackendServicer_to_server(servicer, server):
+    rpc_method_handlers = {
+            'Health': grpc.unary_unary_rpc_method_handler(
+                    servicer.Health,
+                    request_deserializer=backend__pb2.HealthMessage.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Predict': grpc.unary_unary_rpc_method_handler(
+                    servicer.Predict,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'LoadModel': grpc.unary_unary_rpc_method_handler(
+                    servicer.LoadModel,
+                    request_deserializer=backend__pb2.ModelOptions.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'PredictStream': grpc.unary_stream_rpc_method_handler(
+                    servicer.PredictStream,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.Reply.SerializeToString,
+            ),
+            'Embedding': grpc.unary_unary_rpc_method_handler(
+                    servicer.Embedding,
+                    request_deserializer=backend__pb2.PredictOptions.FromString,
+                    response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
+            ),
+            'GenerateImage': grpc.unary_unary_rpc_method_handler(
+                    servicer.GenerateImage,
+                    request_deserializer=backend__pb2.GenerateImageRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+            'AudioTranscription': grpc.unary_unary_rpc_method_handler(
+                    servicer.AudioTranscription,
+                    request_deserializer=backend__pb2.TranscriptRequest.FromString,
+                    response_serializer=backend__pb2.TranscriptResult.SerializeToString,
+            ),
+            'TTS': grpc.unary_unary_rpc_method_handler(
+                    servicer.TTS,
+                    request_deserializer=backend__pb2.TTSRequest.FromString,
+                    response_serializer=backend__pb2.Result.SerializeToString,
+            ),
+    }
+    generic_handler = grpc.method_handlers_generic_handler(
+            'backend.Backend', rpc_method_handlers)
+    server.add_generic_rpc_handlers((generic_handler,))
+
+
+ # This class is part of an EXPERIMENTAL API.
+class Backend(object):
+    """Missing associated documentation comment in .proto file."""
+
+    @staticmethod
+    def Health(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
+            backend__pb2.HealthMessage.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Predict(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def LoadModel(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
+            backend__pb2.ModelOptions.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def PredictStream(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.Reply.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def Embedding(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
+            backend__pb2.PredictOptions.SerializeToString,
+            backend__pb2.EmbeddingResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def GenerateImage(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
+            backend__pb2.GenerateImageRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def AudioTranscription(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
+            backend__pb2.TranscriptRequest.SerializeToString,
+            backend__pb2.TranscriptResult.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
+
+    @staticmethod
+    def TTS(request,
+            target,
+            options=(),
+            channel_credentials=None,
+            call_credentials=None,
+            insecure=False,
+            compression=None,
+            wait_for_ready=None,
+            timeout=None,
+            metadata=None):
+        return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
+            backend__pb2.TTSRequest.SerializeToString,
+            backend__pb2.Result.FromString,
+            options, channel_credentials,
+            insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
--- a/extra/grpc/exllama/exllama.py
+++ b/extra/grpc/exllama/exllama.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+import grpc
+from concurrent import futures
+import time
+import backend_pb2
+import backend_pb2_grpc
+import argparse
+import signal
+import sys
+import os, glob
+
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+from torch import version as torch_version
+from exllama.generator import ExLlamaGenerator
+from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
+from exllama.tokenizer import ExLlamaTokenizer
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    def generate(self,prompt, max_new_tokens):
+        self.generator.end_beam_search()
+
+        # Tokenizing the input
+        ids = self.generator.tokenizer.encode(prompt)
+
+        self.generator.gen_begin_reuse(ids)
+        initial_len = self.generator.sequence[0].shape[0]
+        has_leading_space = False
+        decoded_text = ''
+        for i in range(max_new_tokens):
+            token = self.generator.gen_single_token()
+            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
+                has_leading_space = True
+
+            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
+            if has_leading_space:
+                decoded_text = ' ' + decoded_text
+
+            if token.item() == self.generator.tokenizer.eos_token_id:
+                break
+        return decoded_text
+    def Health(self, request, context):
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+    def LoadModel(self, request, context):
+        try:
+            # https://github.com/turboderp/exllama/blob/master/example_cfg.py
+            model_directory = request.ModelFile
+
+            # Locate files we need within that directory
+            tokenizer_path = os.path.join(model_directory, "tokenizer.model")
+            model_config_path = os.path.join(model_directory, "config.json")
+            st_pattern = os.path.join(model_directory, "*.safetensors")
+            model_path = glob.glob(st_pattern)[0]
+
+            # Create config, model, tokenizer and generator
+
+            config = ExLlamaConfig(model_config_path)               # create config from config.json
+            config.model_path = model_path                          # supply path to model weights file
+
+            model = ExLlama(config)                                 # create ExLlama instance and load the weights
+            tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
+
+            cache = ExLlamaCache(model, batch_size = 2)             # create cache for inference
+            generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator
+
+            self.generator= generator
+            self.model = model
+            self.tokenizer = tokenizer
+            self.cache = cache
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def Predict(self, request, context):
+        penalty = 1.15
+        if request.Penalty != 0.0:
+            penalty = request.Penalty
+        self.generator.settings.token_repetition_penalty_max = penalty
+        self.generator.settings.temperature = request.Temperature
+        self.generator.settings.top_k = request.TopK
+        self.generator.settings.top_p = request.TopP
+
+        tokens = 512
+        if request.Tokens != 0:
+            tokens = request.Tokens
+
+        if self.cache.batch_size == 1:
+            del self.cache
+            self.cache = ExLlamaCache(self.model, batch_size=2)
+            self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
+
+        t = self.generate(request.Prompt, tokens)
+
+        # Remove prompt from response if present
+        if request.Prompt in t:
+            t = t.replace(request.Prompt, "")
+
+        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
+
+    def PredictStream(self, request, context):
+        # Implement PredictStream RPC
+        #for reply in some_data_generator():
+        #    yield reply
+        # Not implemented yet
+        return self.Predict(request, context)
+
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
--- a/extra/grpc/huggingface/backend_pb2.py
+++ b/extra/grpc/huggingface/backend_pb2.py
@@ -13,7 +13,7 @@ _sym_db = _symbol_database.Default()



-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rbackend.proto\x12\x07\x62\x61\x63kend\"\x0f\n\rHealthMessage\"\xa4\x05\n\x0ePredictOptions\x12\x0e\n\x06Prompt\x18\x01 \x01(\t\x12\x0c\n\x04Seed\x18\x02 \x01(\x05\x12\x0f\n\x07Threads\x18\x03 \x01(\x05\x12\x0e\n\x06Tokens\x18\x04 \x01(\x05\x12\x0c\n\x04TopK\x18\x05 \x01(\x05\x12\x0e\n\x06Repeat\x18\x06 \x01(\x05\x12\r\n\x05\x42\x61tch\x18\x07 \x01(\x05\x12\r\n\x05NKeep\x18\x08 \x01(\x05\x12\x13\n\x0bTemperature\x18\t \x01(\x02\x12\x0f\n\x07Penalty\x18\n \x01(\x02\x12\r\n\x05\x46\x31\x36KV\x18\x0b \x01(\x08\x12\x11\n\tDebugMode\x18\x0c \x01(\x08\x12\x13\n\x0bStopPrompts\x18\r \x03(\t\x12\x11\n\tIgnoreEOS\x18\x0e \x01(\x08\x12\x19\n\x11TailFreeSamplingZ\x18\x0f \x01(\x02\x12\x10\n\x08TypicalP\x18\x10 \x01(\x02\x12\x18\n\x10\x46requencyPenalty\x18\x11 \x01(\x02\x12\x17\n\x0fPresencePenalty\x18\x12 \x01(\x02\x12\x10\n\x08Mirostat\x18\x13 \x01(\x05\x12\x13\n\x0bMirostatETA\x18\x14 \x01(\x02\x12\x13\n\x0bMirostatTAU\x18\x15 \x01(\x02\x12\x12\n\nPenalizeNL\x18\x16 \x01(\x08\x12\x11\n\tLogitBias\x18\x17 \x01(\t\x12\r\n\x05MLock\x18\x19 \x01(\x08\x12\x0c\n\x04MMap\x18\x1a \x01(\x08\x12\x16\n\x0ePromptCacheAll\x18\x1b \x01(\x08\x12\x15\n\rPromptCacheRO\x18\x1c \x01(\x08\x12\x0f\n\x07Grammar\x18\x1d \x01(\t\x12\x0f\n\x07MainGPU\x18\x1e \x01(\t\x12\x13\n\x0bTensorSplit\x18\x1f \x01(\t\x12\x0c\n\x04TopP\x18  \x01(\x02\x12\x17\n\x0fPromptCachePath\x18! \x01(\t\x12\r\n\x05\x44\x65\x62ug\x18\" \x01(\x08\x12\x17\n\x0f\x45mbeddingTokens\x18# \x03(\x05\x12\x12\n\nEmbeddings\x18$ \x01(\t\"\x18\n\x05Reply\x12\x0f\n\x07message\x18\x01 \x01(\t\"\xac\x02\n\x0cModelOptions\x12\r\n\x05Model\x18\x01 \x01(\t\x12\x13\n\x0b\x43ontextSize\x18\x02 \x01(\x05\x12\x0c\n\x04Seed\x18\x03 \x01(\x05\x12\x0e\n\x06NBatch\x18\x04 \x01(\x05\x12\x11\n\tF16Memory\x18\x05 \x01(\x08\x12\r\n\x05MLock\x18\x06 \x01(\x08\x12\x0c\n\x04MMap\x18\x07 \x01(\x08\x12\x11\n\tVocabOnly\x18\x08 \x01(\x08\x12\x0f\n\x07LowVRAM\x18\t \x01(\x08\x12\x12\n\nEmbeddings\x18\n \x01(\x08\x12\x0c\n\x04NUMA\x18\x0b \x01(\x08\x12\x12\n\nNGPULayers\x18\x0c \x01(\x05\x12\x0f\n\x07MainGPU\x18\r \x01(\t\x12\x13\n\x0bTensorSplit\x18\x0e \x01(\t\x12\x0f\n\x07Threads\x18\x0f \x01(\x05\x12\x19\n\x11LibrarySearchPath\x18\x10 \x01(\t\"*\n\x06Result\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\"%\n\x0f\x45mbeddingResult\x12\x12\n\nembeddings\x18\x01 \x03(\x02\"C\n\x11TranscriptRequest\x12\x0b\n\x03\x64st\x18\x02 \x01(\t\x12\x10\n\x08language\x18\x03 \x01(\t\x12\x0f\n\x07threads\x18\x04 \x01(\r\"N\n\x10TranscriptResult\x12,\n\x08segments\x18\x01 \x03(\x0b\x32\x1a.backend.TranscriptSegment\x12\x0c\n\x04text\x18\x02 \x01(\t\"Y\n\x11TranscriptSegment\x12\n\n\x02id\x18\x01 \x01(\x05\x12\r\n\x05start\x18\x02 \x01(\x03\x12\x0b\n\x03\x65nd\x18\x03 \x01(\x03\x12\x0c\n\x04text\x18\x04 \x01(\t\x12\x0e\n\x06tokens\x18\x05 \x03(\x05\"\x9e\x01\n\x14GenerateImageRequest\x12\x0e\n\x06height\x18\x01 \x01(\x05\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x0c\n\x04mode\x18\x03 \x01(\x05\x12\x0c\n\x04step\x18\x04 \x01(\x05\x12\x0c\n\x04seed\x18\x05 \x01(\x05\x12\x17\n\x0fpositive_prompt\x18\x06 \x01(\t\x12\x17\n\x0fnegative_prompt\x18\x07 \x01(\t\x12\x0b\n\x03\x64st\x18\x08 \x01(\t\"6\n\nTTSRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05model\x18\x02 \x01(\t\x12\x0b\n\x03\x64st\x18\x03 \x01(\t2\xeb\x03\n\x07\x42\x61\x63kend\x12\x32\n\x06Health\x12\x16.backend.HealthMessage\x1a\x0e.backend.Reply\"\x00\x12\x34\n\x07Predict\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x12\x35\n\tLoadModel\x12\x15.backend.ModelOptions\x1a\x0f.backend.Result\"\x00\x12<\n\rPredictStream\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x30\x01\x12@\n\tEmbedding\x12\x17.backend.PredictOptions\x1a\x18.backend.EmbeddingResult\"\x00\x12\x41\n\rGenerateImage\x12\x1d.backend.GenerateImageRequest\x1a\x0f.backend.Result\"\x00\x12M\n\x12\x41udioTranscription\x12\x1a.backend.TranscriptRequest\x1a\x19.backend.TranscriptResult\"\x00\x12-\n\x03TTS\x12\x13.backend.TTSRequest\x1a\x0f.backend.Result\"\x00\x42Z\n\x19io.skynet.localai.backendB\x0eLocalAIBackendP\x01Z+github.com/go-skynet/LocalAI/pkg/grpc/protob\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rbackend.proto\x12\x07\x62\x61\x63kend\"\x0f\n\rHealthMessage\"\x86\x06\n\x0ePredictOptions\x12\x0e\n\x06Prompt\x18\x01 \x01(\t\x12\x0c\n\x04Seed\x18\x02 \x01(\x05\x12\x0f\n\x07Threads\x18\x03 \x01(\x05\x12\x0e\n\x06Tokens\x18\x04 \x01(\x05\x12\x0c\n\x04TopK\x18\x05 \x01(\x05\x12\x0e\n\x06Repeat\x18\x06 \x01(\x05\x12\r\n\x05\x42\x61tch\x18\x07 \x01(\x05\x12\r\n\x05NKeep\x18\x08 \x01(\x05\x12\x13\n\x0bTemperature\x18\t \x01(\x02\x12\x0f\n\x07Penalty\x18\n \x01(\x02\x12\r\n\x05\x46\x31\x36KV\x18\x0b \x01(\x08\x12\x11\n\tDebugMode\x18\x0c \x01(\x08\x12\x13\n\x0bStopPrompts\x18\r \x03(\t\x12\x11\n\tIgnoreEOS\x18\x0e \x01(\x08\x12\x19\n\x11TailFreeSamplingZ\x18\x0f \x01(\x02\x12\x10\n\x08TypicalP\x18\x10 \x01(\x02\x12\x18\n\x10\x46requencyPenalty\x18\x11 \x01(\x02\x12\x17\n\x0fPresencePenalty\x18\x12 \x01(\x02\x12\x10\n\x08Mirostat\x18\x13 \x01(\x05\x12\x13\n\x0bMirostatETA\x18\x14 \x01(\x02\x12\x13\n\x0bMirostatTAU\x18\x15 \x01(\x02\x12\x12\n\nPenalizeNL\x18\x16 \x01(\x08\x12\x11\n\tLogitBias\x18\x17 \x01(\t\x12\r\n\x05MLock\x18\x19 \x01(\x08\x12\x0c\n\x04MMap\x18\x1a \x01(\x08\x12\x16\n\x0ePromptCacheAll\x18\x1b \x01(\x08\x12\x15\n\rPromptCacheRO\x18\x1c \x01(\x08\x12\x0f\n\x07Grammar\x18\x1d \x01(\t\x12\x0f\n\x07MainGPU\x18\x1e \x01(\t\x12\x13\n\x0bTensorSplit\x18\x1f \x01(\t\x12\x0c\n\x04TopP\x18  \x01(\x02\x12\x17\n\x0fPromptCachePath\x18! \x01(\t\x12\r\n\x05\x44\x65\x62ug\x18\" \x01(\x08\x12\x17\n\x0f\x45mbeddingTokens\x18# \x03(\x05\x12\x12\n\nEmbeddings\x18$ \x01(\t\x12\x14\n\x0cRopeFreqBase\x18% \x01(\x02\x12\x15\n\rRopeFreqScale\x18& \x01(\x02\x12\x1b\n\x13NegativePromptScale\x18\' \x01(\x02\x12\x16\n\x0eNegativePrompt\x18( \x01(\t\"\x18\n\x05Reply\x12\x0f\n\x07message\x18\x01 \x01(\x0c\"\x9d\x04\n\x0cModelOptions\x12\r\n\x05Model\x18\x01 \x01(\t\x12\x13\n\x0b\x43ontextSize\x18\x02 \x01(\x05\x12\x0c\n\x04Seed\x18\x03 \x01(\x05\x12\x0e\n\x06NBatch\x18\x04 \x01(\x05\x12\x11\n\tF16Memory\x18\x05 \x01(\x08\x12\r\n\x05MLock\x18\x06 \x01(\x08\x12\x0c\n\x04MMap\x18\x07 \x01(\x08\x12\x11\n\tVocabOnly\x18\x08 \x01(\x08\x12\x0f\n\x07LowVRAM\x18\t \x01(\x08\x12\x12\n\nEmbeddings\x18\n \x01(\x08\x12\x0c\n\x04NUMA\x18\x0b \x01(\x08\x12\x12\n\nNGPULayers\x18\x0c \x01(\x05\x12\x0f\n\x07MainGPU\x18\r \x01(\t\x12\x13\n\x0bTensorSplit\x18\x0e \x01(\t\x12\x0f\n\x07Threads\x18\x0f \x01(\x05\x12\x19\n\x11LibrarySearchPath\x18\x10 \x01(\t\x12\x14\n\x0cRopeFreqBase\x18\x11 \x01(\x02\x12\x15\n\rRopeFreqScale\x18\x12 \x01(\x02\x12\x12\n\nRMSNormEps\x18\x13 \x01(\x02\x12\x0c\n\x04NGQA\x18\x14 \x01(\x05\x12\x11\n\tModelFile\x18\x15 \x01(\t\x12\x0e\n\x06\x44\x65vice\x18\x16 \x01(\t\x12\x11\n\tUseTriton\x18\x17 \x01(\x08\x12\x15\n\rModelBaseName\x18\x18 \x01(\t\x12\x18\n\x10UseFastTokenizer\x18\x19 \x01(\x08\x12\x14\n\x0cPipelineType\x18\x1a \x01(\t\x12\x15\n\rSchedulerType\x18\x1b \x01(\t\x12\x0c\n\x04\x43UDA\x18\x1c \x01(\x08\"*\n\x06Result\x12\x0f\n\x07message\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\"%\n\x0f\x45mbeddingResult\x12\x12\n\nembeddings\x18\x01 \x03(\x02\"C\n\x11TranscriptRequest\x12\x0b\n\x03\x64st\x18\x02 \x01(\t\x12\x10\n\x08language\x18\x03 \x01(\t\x12\x0f\n\x07threads\x18\x04 \x01(\r\"N\n\x10TranscriptResult\x12,\n\x08segments\x18\x01 \x03(\x0b\x32\x1a.backend.TranscriptSegment\x12\x0c\n\x04text\x18\x02 \x01(\t\"Y\n\x11TranscriptSegment\x12\n\n\x02id\x18\x01 \x01(\x05\x12\r\n\x05start\x18\x02 \x01(\x03\x12\x0b\n\x03\x65nd\x18\x03 \x01(\x03\x12\x0c\n\x04text\x18\x04 \x01(\t\x12\x0e\n\x06tokens\x18\x05 \x03(\x05\"\x9e\x01\n\x14GenerateImageRequest\x12\x0e\n\x06height\x18\x01 \x01(\x05\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x0c\n\x04mode\x18\x03 \x01(\x05\x12\x0c\n\x04step\x18\x04 \x01(\x05\x12\x0c\n\x04seed\x18\x05 \x01(\x05\x12\x17\n\x0fpositive_prompt\x18\x06 \x01(\t\x12\x17\n\x0fnegative_prompt\x18\x07 \x01(\t\x12\x0b\n\x03\x64st\x18\x08 \x01(\t\"6\n\nTTSRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\r\n\x05model\x18\x02 \x01(\t\x12\x0b\n\x03\x64st\x18\x03 \x01(\t2\xeb\x03\n\x07\x42\x61\x63kend\x12\x32\n\x06Health\x12\x16.backend.HealthMessage\x1a\x0e.backend.Reply\"\x00\x12\x34\n\x07Predict\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x12\x35\n\tLoadModel\x12\x15.backend.ModelOptions\x1a\x0f.backend.Result\"\x00\x12<\n\rPredictStream\x12\x17.backend.PredictOptions\x1a\x0e.backend.Reply\"\x00\x30\x01\x12@\n\tEmbedding\x12\x17.backend.PredictOptions\x1a\x18.backend.EmbeddingResult\"\x00\x12\x41\n\rGenerateImage\x12\x1d.backend.GenerateImageRequest\x1a\x0f.backend.Result\"\x00\x12M\n\x12\x41udioTranscription\x12\x1a.backend.TranscriptRequest\x1a\x19.backend.TranscriptResult\"\x00\x12-\n\x03TTS\x12\x13.backend.TTSRequest\x1a\x0f.backend.Result\"\x00\x42Z\n\x19io.skynet.localai.backendB\x0eLocalAIBackendP\x01Z+github.com/go-skynet/LocalAI/pkg/grpc/protob\x06proto3')

 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -25,25 +25,25 @@ if _descriptor._USE_C_DESCRIPTORS == False:
  _globals['_HEALTHMESSAGE']._serialized_start=26
  _globals['_HEALTHMESSAGE']._serialized_end=41
  _globals['_PREDICTOPTIONS']._serialized_start=44
-  _globals['_PREDICTOPTIONS']._serialized_end=720
-  _globals['_REPLY']._serialized_start=722
-  _globals['_REPLY']._serialized_end=746
-  _globals['_MODELOPTIONS']._serialized_start=749
-  _globals['_MODELOPTIONS']._serialized_end=1049
-  _globals['_RESULT']._serialized_start=1051
-  _globals['_RESULT']._serialized_end=1093
-  _globals['_EMBEDDINGRESULT']._serialized_start=1095
-  _globals['_EMBEDDINGRESULT']._serialized_end=1132
-  _globals['_TRANSCRIPTREQUEST']._serialized_start=1134
-  _globals['_TRANSCRIPTREQUEST']._serialized_end=1201
-  _globals['_TRANSCRIPTRESULT']._serialized_start=1203
-  _globals['_TRANSCRIPTRESULT']._serialized_end=1281
-  _globals['_TRANSCRIPTSEGMENT']._serialized_start=1283
-  _globals['_TRANSCRIPTSEGMENT']._serialized_end=1372
-  _globals['_GENERATEIMAGEREQUEST']._serialized_start=1375
-  _globals['_GENERATEIMAGEREQUEST']._serialized_end=1533
-  _globals['_TTSREQUEST']._serialized_start=1535
-  _globals['_TTSREQUEST']._serialized_end=1589
-  _globals['_BACKEND']._serialized_start=1592
-  _globals['_BACKEND']._serialized_end=2083
+  _globals['_PREDICTOPTIONS']._serialized_end=818
+  _globals['_REPLY']._serialized_start=820
+  _globals['_REPLY']._serialized_end=844
+  _globals['_MODELOPTIONS']._serialized_start=847
+  _globals['_MODELOPTIONS']._serialized_end=1388
+  _globals['_RESULT']._serialized_start=1390
+  _globals['_RESULT']._serialized_end=1432
+  _globals['_EMBEDDINGRESULT']._serialized_start=1434
+  _globals['_EMBEDDINGRESULT']._serialized_end=1471
+  _globals['_TRANSCRIPTREQUEST']._serialized_start=1473
+  _globals['_TRANSCRIPTREQUEST']._serialized_end=1540
+  _globals['_TRANSCRIPTRESULT']._serialized_start=1542
+  _globals['_TRANSCRIPTRESULT']._serialized_end=1620
+  _globals['_TRANSCRIPTSEGMENT']._serialized_start=1622
+  _globals['_TRANSCRIPTSEGMENT']._serialized_end=1711
+  _globals['_GENERATEIMAGEREQUEST']._serialized_start=1714
+  _globals['_GENERATEIMAGEREQUEST']._serialized_end=1872
+  _globals['_TTSREQUEST']._serialized_start=1874
+  _globals['_TTSREQUEST']._serialized_end=1928
+  _globals['_BACKEND']._serialized_start=1931
+  _globals['_BACKEND']._serialized_end=2422
 # @@protoc_insertion_point(module_scope)
--- a/extra/grpc/huggingface/huggingface.py
+++ b/extra/grpc/huggingface/huggingface.py
@@ -15,10 +15,9 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    def Health(self, request, context):
-        return backend_pb2.Reply(message="OK")
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        model_name = request.Model
-        model_name = os.path.basename(model_name)
        try:
            self.model = SentenceTransformer(model_name)
        except Exception as err:
--- a/go.mod
+++ b/go.mod
@@ -3,13 +3,13 @@ module github.com/go-skynet/LocalAI
 go 1.20

 require (
-	github.com/donomii/go-rwkv.cpp v0.0.0-20230619005719-f5a8c4539674
+	github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df
 	github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230628193450-85ed71aaec8e
 	github.com/go-audio/wav v1.1.0
 	github.com/go-skynet/bloomz.cpp v0.0.0-20230529155654-1834e77b83fa
 	github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1
 	github.com/go-skynet/go-ggml-transformers.cpp v0.0.0-20230714203132-ffb09d7dd71e
-	github.com/go-skynet/go-llama.cpp v0.0.0-20230709163512-6c97625cca76
+	github.com/go-skynet/go-llama.cpp v0.0.0-20230802220037-50cee7712066
 	github.com/gofiber/fiber/v2 v2.48.0
 	github.com/google/uuid v1.3.0
 	github.com/hashicorp/go-multierror v1.1.1
@@ -20,17 +20,17 @@ require (
 	github.com/mudler/go-ggllm.cpp v0.0.0-20230709223052-862477d16eef
 	github.com/mudler/go-processmanager v0.0.0-20220724164624-c45b5c61312d
 	github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af
-	github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230714185456-cfd70b69fcf5
+	github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230811181453-4d855afe973a
 	github.com/onsi/ginkgo/v2 v2.11.0
-	github.com/onsi/gomega v1.27.8
+	github.com/onsi/gomega v1.27.10
 	github.com/otiai10/openaigo v1.5.2
 	github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
-	github.com/rs/zerolog v1.29.1
-	github.com/sashabaranov/go-openai v1.14.0
-	github.com/tmc/langchaingo v0.0.0-20230713201705-dcf7ecdc8ac8
+	github.com/rs/zerolog v1.30.0
+	github.com/sashabaranov/go-openai v1.14.1
+	github.com/tmc/langchaingo v0.0.0-20230811231558-fd8b7f099537
 	github.com/urfave/cli/v2 v2.25.7
 	github.com/valyala/fasthttp v1.48.0
-	google.golang.org/grpc v1.56.2
+	google.golang.org/grpc v1.57.0
 	google.golang.org/protobuf v1.31.0
 	gopkg.in/yaml.v2 v2.4.0
 	gopkg.in/yaml.v3 v3.0.1
@@ -39,7 +39,6 @@ require (
 require (
 	github.com/dlclark/regexp2 v1.8.1 // indirect
 	github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect
-	github.com/go-skynet/go-llama.cpp-grammar v0.0.0-20230703203849-ffa57fbc3a12 // indirect
 	github.com/golang/protobuf v1.5.3 // indirect
 	github.com/golang/snappy v0.0.2 // indirect
 	github.com/klauspost/pgzip v1.2.5 // indirect
@@ -51,7 +50,8 @@ require (
 	github.com/pkoukk/tiktoken-go v0.1.2 // indirect
 	github.com/ulikunitz/xz v0.5.9 // indirect
 	github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
-	google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect
+	google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 // indirect
 	gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
 	gopkg.in/fsnotify.v1 v1.4.7 // indirect
 	gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
@@ -72,17 +72,13 @@ require (
 	github.com/mattn/go-isatty v0.0.19 // indirect
 	github.com/mattn/go-runewidth v0.0.14 // indirect
 	github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760
-	github.com/philhofer/fwd v1.1.2 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
-	github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 // indirect
-	github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee // indirect
-	github.com/tinylib/msgp v1.1.8 // indirect
 	github.com/valyala/bytebufferpool v1.0.0 // indirect
 	github.com/valyala/tcplisten v1.0.0 // indirect
 	github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
-	golang.org/x/net v0.10.0 // indirect
+	golang.org/x/net v0.12.0 // indirect
 	golang.org/x/sys v0.10.0 // indirect
-	golang.org/x/text v0.9.0 // indirect
+	golang.org/x/text v0.11.0 // indirect
 	golang.org/x/tools v0.9.3 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -15,6 +15,8 @@ github.com/dlclark/regexp2 v1.8.1 h1:6Lcdwya6GjPUNsBct8Lg/yRPwMhABj269AAzdGSiR+0
 github.com/dlclark/regexp2 v1.8.1/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
 github.com/donomii/go-rwkv.cpp v0.0.0-20230619005719-f5a8c4539674 h1:G70Yf/QOCEL1v24idWnGd6rJsbqiGkJAJnMaWaolzEg=
 github.com/donomii/go-rwkv.cpp v0.0.0-20230619005719-f5a8c4539674/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df h1:qVcBEZlvp5A1gGWNJj02xyDtbsUI2hohlQMSB1fgER4=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM=
 github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L7HYpRu/0lE3e0BaElwnNO1qkNQxBY=
 github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s=
 github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
@@ -33,24 +35,24 @@ github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
 github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-skynet/bloomz.cpp v0.0.0-20230529155654-1834e77b83fa h1:gxr68r/6EWroay4iI81jxqGCDbKotY4+CiwdUkBz2NQ=
 github.com/go-skynet/bloomz.cpp v0.0.0-20230529155654-1834e77b83fa/go.mod h1:wc0fJ9V04yiYTfgKvE5RUUSRQ5Kzi0Bo4I+U3nNOUuA=
-github.com/go-skynet/go-bert.cpp v0.0.0-20230607105116-6069103f54b9 h1:wRGbDwNwPmSzoXVw/HLzXY4blpRvPWg7QW2OA0WKezA=
-github.com/go-skynet/go-bert.cpp v0.0.0-20230607105116-6069103f54b9/go.mod h1:pXKCpYYXujMeAvgJHU6WoMfvYbr84563+J8+Ebkyr5U=
 github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1 h1:yXvc7QfGtoZ51tUW/YVjoTwAfh8HG88XU7UOrbNlz5Y=
 github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1/go.mod h1:fYjkCDRzC+oRLHSjQoajmYK6AmeJnmEanV27CClAcDc=
-github.com/go-skynet/go-ggml-transformers.cpp v0.0.0-20230630204211-3fec197a1dc4 h1:LScGc8yWTS9wbS2RTOq6s+waeHElLIQDJg2SUCwrO3E=
-github.com/go-skynet/go-ggml-transformers.cpp v0.0.0-20230630204211-3fec197a1dc4/go.mod h1:31j1odgFXP8hDSUVfH0zErKI5aYVP18ddYnPkwCso2A=
 github.com/go-skynet/go-ggml-transformers.cpp v0.0.0-20230714203132-ffb09d7dd71e h1:4reMY29i1eOZaRaSTMPNyXI7X8RMNxCTfDDBXYzrbr0=
 github.com/go-skynet/go-ggml-transformers.cpp v0.0.0-20230714203132-ffb09d7dd71e/go.mod h1:31j1odgFXP8hDSUVfH0zErKI5aYVP18ddYnPkwCso2A=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230703203849-ffa57fbc3a12 h1:cfGZiZana0gPD0i8nmyOGTUQGb4N8PYqaBqhhukREPc=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230703203849-ffa57fbc3a12/go.mod h1:tzi97YvT1bVQ+iTG39LvpDkKG1WbizgtljC+orSoM40=
 github.com/go-skynet/go-llama.cpp v0.0.0-20230709163512-6c97625cca76 h1:NRdxo2MKi8qhWZXxu6CIZOkdH+LBERFz1kk22U1FD3k=
 github.com/go-skynet/go-llama.cpp v0.0.0-20230709163512-6c97625cca76/go.mod h1:tzi97YvT1bVQ+iTG39LvpDkKG1WbizgtljC+orSoM40=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230724222459-562d2b5a7119 h1:FeUSk5yMHT7J7jeCQKAOs4x5LRNSYH0SR6djM/i1jcc=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230724222459-562d2b5a7119/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230727163958-6ba16de8e965 h1:2MO/rABKpkXnnKQ3Ar90aqhnlMEejE9gnKG6bafv+ow=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230727163958-6ba16de8e965/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230729200103-8c51308e42d7 h1:1uBwholTaJ8Lva8ySJjT4jNaCDAh+MJXtsbZBbQq9lA=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230729200103-8c51308e42d7/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230802220037-50cee7712066 h1:v4Js+yEdgY9IV7n35M+5MELLxlOMp3qC5whZm5YTLjI=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230802220037-50cee7712066/go.mod h1:fiJBto+Le1XLtD/cID5SAKs8cKE7wFXJKfTT3wvPQRA=
 github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
-github.com/gofiber/fiber/v2 v2.47.0 h1:EN5lHVCc+Pyqh5OEsk8fzRiifgwpbrP0rulQ4iNf3fs=
-github.com/gofiber/fiber/v2 v2.47.0/go.mod h1:mbFMVN1lQuzziTkkakgtKKdjfsXSw9BKR5lmcNksUoU=
 github.com/gofiber/fiber/v2 v2.48.0 h1:cRVMCb9aUJDsyHxGFLwz/sGzDggdailZZyptU9F9cU0=
 github.com/gofiber/fiber/v2 v2.48.0/go.mod h1:xqJgfqrc23FJuqGOW6DVgi3HyZEm2Mn9pRqUb2kHSX8=
 github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
@@ -116,8 +118,6 @@ github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OH
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
-github.com/mudler/go-ggllm.cpp v0.0.0-20230708215552-a6504d5bc137 h1:d+XGcCrw65q6KDUbF2wZBPVZ7i7kU6I7fKSX+UwzP7w=
-github.com/mudler/go-ggllm.cpp v0.0.0-20230708215552-a6504d5bc137/go.mod h1:00giAi/vwF8LX29JBjkPQhtASsivPnGNzB6sdmk8JGE=
 github.com/mudler/go-ggllm.cpp v0.0.0-20230709223052-862477d16eef h1:OJZtJ5vYhlkTJI0RHIl62kOkhiINQEhZgsXlwmmNDhM=
 github.com/mudler/go-ggllm.cpp v0.0.0-20230709223052-862477d16eef/go.mod h1:00giAi/vwF8LX29JBjkPQhtASsivPnGNzB6sdmk8JGE=
 github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760 h1:OFVkSxR7CRSRSNm5dvpMRZwmSwWa8EMMnHbc84fW5tU=
@@ -126,10 +126,20 @@ github.com/mudler/go-processmanager v0.0.0-20220724164624-c45b5c61312d h1:/lAg9v
 github.com/mudler/go-processmanager v0.0.0-20220724164624-c45b5c61312d/go.mod h1:HGGAOJhipApckwNV8ZTliRJqxctUv3xRY+zbQEwuytc=
 github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af h1:XFq6OUqsWQam0OrEr05okXsJK/TQur3zoZTHbiZD3Ks=
 github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af/go.mod h1:8ufRkpz/S/9ahkaxzZ5i4WMgO9w4InEhuRoT7vK5Rnw=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230708212935-d611d107479f h1:FtXRIjsBvoBQ5xmA26QbzyG4RjV2U5lOpUgP4npITOM=
-github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230708212935-d611d107479f/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
 github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230714185456-cfd70b69fcf5 h1:bmQnxyKiqCu8i2y/N/Sf0coWoG2/Ed12YGQeb7lTnjo=
 github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230714185456-cfd70b69fcf5/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230725212419-9100b2ef6fb9 h1:/oRwZhulKTU8LpPD2fXi2o2kdlTutQjYWDVMkrv14po=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230725212419-9100b2ef6fb9/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230727161923-39acbc837816 h1:hRi7hpDUuaO0dB4NZ8eyaeD2fRar6CPyNAARsO5DhzA=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230727161923-39acbc837816/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230731161838-cbdcde8b7586 h1:WVEMSZMyHFe68PN204c3Fdk5g2lZouPvbU9/2zkPpWc=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230731161838-cbdcde8b7586/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230802145814-c449b71b56de h1:E5EGczxEAcbaO8yqj074MQxU609QbtB6in3qTOW1EFo=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230802145814-c449b71b56de/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230807175413-0f2bb506a8ee h1:Y/j+GNytyncmDnAEuDZwzkYC9nzUPvXJPF+nntQG0VU=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230807175413-0f2bb506a8ee/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230811181453-4d855afe973a h1:bX26Zfwh72ug2aZTEwFISTMEJ56Wa/4KqboidD+g92A=
+github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20230811181453-4d855afe973a/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
 github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ=
 github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
 github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
@@ -146,14 +156,13 @@ github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1y
 github.com/onsi/gomega v1.16.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY=
 github.com/onsi/gomega v1.27.8 h1:gegWiwZjBsf2DgiSbf5hpokZ98JVDMcWkUiigk6/KXc=
 github.com/onsi/gomega v1.27.8/go.mod h1:2J8vzI/s+2shY9XHRApDkdgPo1TKT7P2u6fXeJKFnNQ=
+github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI=
+github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M=
 github.com/otiai10/mint v1.6.1 h1:kgbTJmOpp/0ce7hk3H8jiSuR0MXmpwWRfqUdKww17qg=
 github.com/otiai10/openaigo v1.5.2 h1:YnNDisZmA4syArF3IxMCIrfgZOq30PLV219gPY7n2z8=
 github.com/otiai10/openaigo v1.5.2/go.mod h1:kIaXc3V+Xy5JLplcBxehVyGYDtufHp3PFPy04jOwOAI=
 github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI=
 github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5/go.mod h1:iIss55rKnNBTvrwdmkUpLnDpZoAHvWaiq5+iMmen4AE=
-github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
-github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw=
-github.com/philhofer/fwd v1.1.2/go.mod h1:qkPdfjR2SIEbspLqpe1tO4n5yICnr2DY7mqEx2tUTP0=
 github.com/pierrec/lz4/v4 v4.1.2 h1:qvY3YFXRQE/XB8MlLzJH7mSzBs74eA2gg52YTk6jUPM=
 github.com/pierrec/lz4/v4 v4.1.2/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
@@ -164,31 +173,34 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN
 github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
 github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
+github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
 github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
 github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
+github.com/rs/zerolog v1.30.0 h1:SymVODrcRsaRaSInD9yQtKbtWqwsfoPcRff/oRXLj4c=
+github.com/rs/zerolog v1.30.0/go.mod h1:/tk+P47gFdPXq4QYjvCmT5/Gsug2nagsFWBWhAiSi1w=
 github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/sashabaranov/go-openai v1.13.0 h1:EAusFfnhaMaaUspUZ2+MbB/ZcVeD4epJmTOlZ+8AcAE=
-github.com/sashabaranov/go-openai v1.13.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/sashabaranov/go-openai v1.14.0 h1:D1yAB+DHElgbJFdYyjxfTWMFzhddn+PwZmkQ039L7mQ=
 github.com/sashabaranov/go-openai v1.14.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
-github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 h1:rmMl4fXJhKMNWl+K+r/fq4FbbKI+Ia2m9hYBLm2h4G4=
-github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94/go.mod h1:90zrgN3D/WJsDd1iXHT96alCoN2KJo6/4x1DZC3wZs8=
-github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d/go.mod h1:Gy+0tqhJvgGlqnTF8CVGP0AaGRjwBtXs/a5PA0Y3+A4=
-github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee h1:8Iv5m6xEo1NR1AvpV+7XmhI4r39LGNzwUL4YpMuL5vk=
-github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee/go.mod h1:qwtSXrKuJh/zsFQ12yEE89xfCrGKK63Rr7ctU/uCo4g=
+github.com/sashabaranov/go-openai v1.14.1 h1:jqfkdj8XHnBF84oi2aNtT8Ktp3EJ0MfuVjvcMkfI0LA=
+github.com/sashabaranov/go-openai v1.14.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
-github.com/tinylib/msgp v1.1.6/go.mod h1:75BAfg2hauQhs3qedfdDZmWAPcFMAvJE5b9rGOMufyw=
-github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0=
-github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw=
-github.com/tmc/langchaingo v0.0.0-20230709010448-a875e6bc0c54 h1:MZSC3/pdBzkoPG49uTRvtEepOQKdbdgaT1aLtaEwxx4=
-github.com/tmc/langchaingo v0.0.0-20230709010448-a875e6bc0c54/go.mod h1:RsMJqgUynOtr2jWNhUF41R3j6SDkKq9c8UfE0nJYBb4=
 github.com/tmc/langchaingo v0.0.0-20230713201705-dcf7ecdc8ac8 h1:wdJigYmmIRCuXhCkADDr53Oa1fp/WlxCPoVXR2r7GrU=
 github.com/tmc/langchaingo v0.0.0-20230713201705-dcf7ecdc8ac8/go.mod h1:mTzgQfAGwmBz2hhQELZfu2bwsbHwyKHA6IHOa+9LDFg=
+github.com/tmc/langchaingo v0.0.0-20230726025230-7d5f9fd5e90a h1:I/2JSuYXkWaVVLSZmrPfrgbvvvPR0IaulZcB0Iu8oVI=
+github.com/tmc/langchaingo v0.0.0-20230726025230-7d5f9fd5e90a/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
+github.com/tmc/langchaingo v0.0.0-20230729232647-7df4fe5fb8fe h1:+XVrCjh3rPibfISkUFG2Ck5NLKODQ9cFdmraFye1bGA=
+github.com/tmc/langchaingo v0.0.0-20230729232647-7df4fe5fb8fe/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
+github.com/tmc/langchaingo v0.0.0-20230731024823-8f101609f600 h1:SABuIthjhIXEsxnokuA16CZOxxdW9XohIHQqd/go8Nc=
+github.com/tmc/langchaingo v0.0.0-20230731024823-8f101609f600/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
+github.com/tmc/langchaingo v0.0.0-20230802030916-271e9bd7e7c5 h1:js7vYDJGzUGVSt0YlIusUc5BXYVECu3LUI/asby5Ggo=
+github.com/tmc/langchaingo v0.0.0-20230802030916-271e9bd7e7c5/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
+github.com/tmc/langchaingo v0.0.0-20230811231558-fd8b7f099537 h1:vkeNjlW+0Xiw2XizMHoQuLG8pg6AN1hU8zJuMV9GQBc=
+github.com/tmc/langchaingo v0.0.0-20230811231558-fd8b7f099537/go.mod h1:8T+nNIGBr3nYQEYFmF/YaT8t8YTKLvFYZBuVZOAYn5E=
 github.com/ulikunitz/xz v0.5.8/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
 github.com/ulikunitz/xz v0.5.9 h1:RsKRIA2MO8x56wkkcd3LbtcE/uMszhb6DpRf+3uwa3I=
 github.com/ulikunitz/xz v0.5.9/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
@@ -205,31 +217,24 @@ github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMx
 github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU=
 github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
-github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
-golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
-golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
 golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk=
 golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
 golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk=
-golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
-golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
 golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
 golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50=
+golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -242,34 +247,23 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
-golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA=
 golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
-golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
 golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.11.0 h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4=
+golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
-golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
 golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
-golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
-golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ=
 golang.org/x/tools v0.9.3 h1:Gn1I8+64MsuTb/HpH+LmQtNas23LhUVr3rYZ0eKuaMM=
 golang.org/x/tools v0.9.3/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
@@ -278,8 +272,14 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A=
 google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU=
+google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54 h1:9NWlQfY2ePejTmfwUH1OWwmznFa+0kKcHGPDvcPza9M=
+google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54/go.mod h1:zqTuNwFlFRsw5zIts5VnzLQxSRqh+CGOTVMlYbY0Eyk=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19 h1:0nDDozoAU19Qb2HwhXadU8OcsiO/09cnTqhUtq2MEOM=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA=
 google.golang.org/grpc v1.56.2 h1:fVRFRnXvU+x6C4IlHZewvJOVHoOv1TUuQyoRsYnB4bI=
 google.golang.org/grpc v1.56.2/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s=
+google.golang.org/grpc v1.57.0 h1:kfzNeI/klCGD2YPMUlaGNT3pxvYfga7smW3Vth8Zsiw=
+google.golang.org/grpc v1.57.0/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo=
 google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
 google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
 google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
@@ -288,8 +288,6 @@ google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzi
 google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
-google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
-google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
 google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
 google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
--- a/main.go
+++ b/main.go
@@ -130,6 +130,11 @@ func main() {
 				EnvVars: []string{"UPLOAD_LIMIT"},
 				Value:   15,
 			},
+			&cli.StringSliceFlag{
+				Name:    "api-keys",
+				Usage:   "List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys.",
+				EnvVars: []string{"API_KEY"},
+			},
 		},
 		Description: `
 LocalAI is a drop-in replacement OpenAI API which runs inference locally.
@@ -167,6 +172,7 @@ For a list of compatible model, check out: https://localai.io/model-compatibilit
 				options.WithBackendAssets(backendAssets),
 				options.WithBackendAssetsOutput(ctx.String("backend-assets-path")),
 				options.WithUploadLimitMB(ctx.Int("upload-limit")),
+				options.WithApiKeys(ctx.StringSlice("api-keys")),
 			}

 			externalgRPC := ctx.StringSlice("external-grpc-backends")
--- a/pkg/gallery/gallery.go
+++ b/pkg/gallery/gallery.go
@@ -19,6 +19,8 @@ type Gallery struct {
 // Installs a model from the gallery (galleryname@modelname)
 func InstallModelFromGallery(galleries []Gallery, name string, basePath string, req GalleryModel, downloadStatus func(string, string, string, float64)) error {
 	applyModel := func(model *GalleryModel) error {
+		name = strings.ReplaceAll(name, string(os.PathSeparator), "__")
+
 		config, err := GetGalleryConfigFromURL(model.URL)
 		if err != nil {
 			return err
@@ -51,7 +53,11 @@ func InstallModelFromGallery(galleries []Gallery, name string, basePath string,

 	model, err := FindGallery(models, name)
 	if err != nil {
-		return err
+		var err2 error
+		model, err2 = FindGallery(models, strings.ToLower(name))
+		if err2 != nil {
+			return err
+		}
 	}

 	return applyModel(model)
@@ -79,7 +85,7 @@ func InstallModelFromGalleryByName(galleries []Gallery, name string, basePath st
 	name = strings.ReplaceAll(name, string(os.PathSeparator), "__")
 	var model *GalleryModel
 	for _, m := range models {
-		if name == m.Name {
+		if name == m.Name || m.Name == strings.ToLower(name) {
 			model = m
 		}
 	}
--- a/pkg/gallery/models_test.go
+++ b/pkg/gallery/models_test.go
@@ -1,7 +1,6 @@
 package gallery_test

 import (
-	"io/ioutil"
 	"os"
 	"path/filepath"

@@ -50,7 +49,7 @@ var _ = Describe("Model test", func() {
 			}}
 			out, err := yaml.Marshal(gallery)
 			Expect(err).ToNot(HaveOccurred())
-			err = ioutil.WriteFile(filepath.Join(tempdir, "gallery_simple.yaml"), out, 0644)
+			err = os.WriteFile(filepath.Join(tempdir, "gallery_simple.yaml"), out, 0644)
 			Expect(err).ToNot(HaveOccurred())

 			galleries := []Gallery{
--- a/pkg/grammar/json_schema.go
+++ b/pkg/grammar/json_schema.go
@@ -15,10 +15,13 @@ var (

 	PRIMITIVE_RULES = map[string]string{
 		"boolean": `("true" | "false") space`,
-		"number":  `[0-9]+ space`,                    // TODO complete
-		"integer": `[0-9]+ space`,                    // TODO complete
-		"string":  `"\"" [ \t!#-\[\]-~]* "\"" space`, // TODO complete
-		"null":    `"null" space`,
+		"number":  `("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space`,
+		"integer": `("-"? ([0-9] | [1-9] [0-9]*)) space`,
+		"string": `"\"" (
+			[^"\\] |
+			"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+		  )* "\"" space`,
+		"null": `"null" space`,
 	}

 	INVALID_RULE_CHARS_RE     = regexp.MustCompile(`[^a-zA-Z0-9-]+`)
@@ -176,6 +179,9 @@ func (sc *JSONSchemaConverter) visit(schema map[string]interface{}, name string,
 		if !exists {
 			panic(fmt.Sprintf("Unrecognized schema: %v", schema))
 		}
+		if ruleName == "root" {
+			schemaType = "root"
+		}
 		return sc.addRule(schemaType, primitiveRule)
 	}
 }
--- a/pkg/grammar/json_schema_test.go
+++ b/pkg/grammar/json_schema_test.go
@@ -48,7 +48,10 @@ root ::= root-0 | root-1
 space ::= " "?
 root-0-arguments ::= "{" space "\"date\"" space ":" space string "," space "\"time\"" space ":" space string "," space "\"title\"" space ":" space string "}" space
 root-1 ::= "{" space "\"arguments\"" space ":" space root-1-arguments "," space "\"function\"" space ":" space root-1-function "}" space
-string ::= "\"" [ \t!#-\[\]-~]* "\"" space
+string ::= "\"" (
+	[^"\\] |
+	"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+)* "\"" space
 root-1-function ::= "\"search\""`
 )

--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@@ -42,7 +42,7 @@ func (c *Client) HealthCheck(ctx context.Context) bool {
 		return false
 	}

-	if res.Message == "OK" {
+	if string(res.Message) == "OK" {
 		return true
 	}
 	return false
@@ -80,7 +80,7 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp
 	return client.LoadModel(ctx, in, opts...)
 }

-func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s string), opts ...grpc.CallOption) error {
+func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s []byte), opts ...grpc.CallOption) error {
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
 		return err
--- a/pkg/grpc/image/stablediffusion.go
+++ b/pkg/grpc/image/stablediffusion.go
@@ -16,7 +16,7 @@ type StableDiffusion struct {
 func (sd *StableDiffusion) Load(opts *pb.ModelOptions) error {
 	var err error
 	// Note: the Model here is a path to a directory containing the model files
-	sd.stablediffusion, err = stablediffusion.New(opts.Model)
+	sd.stablediffusion, err = stablediffusion.New(opts.ModelFile)
 	return err
 }

--- a/pkg/grpc/interface.go
+++ b/pkg/grpc/interface.go
@@ -14,3 +14,7 @@ type LLM interface {
 	AudioTranscription(*pb.TranscriptRequest) (api.Result, error)
 	TTS(*pb.TTSRequest) error
 }
+
+func newReply(s string) *pb.Reply {
+	return &pb.Reply{Message: []byte(s)}
+}
--- a/pkg/grpc/llm/bert/bert.go
+++ b/pkg/grpc/llm/bert/bert.go
@@ -15,7 +15,7 @@ type Embeddings struct {
 }

 func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
-	model, err := bert.New(opts.Model)
+	model, err := bert.New(opts.ModelFile)
 	llm.bert = model
 	return err
 }
--- a/pkg/grpc/llm/bloomz/bloomz.go
+++ b/pkg/grpc/llm/bloomz/bloomz.go
@@ -18,7 +18,7 @@ type LLM struct {
 }

 func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	model, err := bloomz.New(opts.Model)
+	model, err := bloomz.New(opts.ModelFile)
 	llm.bloomz = model
 	return err
 }
--- a/pkg/grpc/llm/falcon/falcon.go
+++ b/pkg/grpc/llm/falcon/falcon.go
@@ -40,7 +40,7 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 		ggllmOpts = append(ggllmOpts, ggllm.SetNBatch(512))
 	}

-	model, err := ggllm.New(opts.Model, ggllmOpts...)
+	model, err := ggllm.New(opts.ModelFile, ggllmOpts...)
 	llm.falcon = model
 	return err
 }
--- a/pkg/grpc/llm/gpt4all/gpt4all.go
+++ b/pkg/grpc/llm/gpt4all/gpt4all.go
@@ -17,7 +17,7 @@ type LLM struct {
 }

 func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	model, err := gpt4all.New(opts.Model,
+	model, err := gpt4all.New(opts.ModelFile,
 		gpt4all.SetThreads(int(opts.Threads)),
 		gpt4all.SetLibrarySearchPath(opts.LibrarySearchPath))
 	llm.gpt4all = model
--- a/pkg/grpc/llm/llama-grammar/llama.go
+++ b/pkg/grpc/llm/llama-grammar/llama.go
@@ -1,170 +0,0 @@
-package llama
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/go-llama.cpp-grammar"
-)
-
-type LLM struct {
-	base.Base
-
-	llama *llama.LLama
-}
-
-func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	llamaOpts := []llama.ModelOption{}
-
-	if opts.ContextSize != 0 {
-		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
-	}
-	if opts.F16Memory {
-		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
-	}
-	if opts.Embeddings {
-		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
-	}
-	if opts.NGPULayers != 0 {
-		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
-	}
-
-	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
-	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
-	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
-	if opts.NBatch != 0 {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
-	} else {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
-	}
-
-	if opts.NUMA {
-		llamaOpts = append(llamaOpts, llama.EnableNUMA)
-	}
-
-	if opts.LowVRAM {
-		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
-	}
-
-	model, err := llama.New(opts.Model, llamaOpts...)
-	llm.llama = model
-	return err
-}
-
-func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
-	predictOptions := []llama.PredictOption{
-		llama.SetTemperature(float64(opts.Temperature)),
-		llama.SetTopP(float64(opts.TopP)),
-		llama.SetTopK(int(opts.TopK)),
-		llama.SetTokens(int(opts.Tokens)),
-		llama.SetThreads(int(opts.Threads)),
-	}
-
-	if opts.PromptCacheAll {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
-	}
-
-	if opts.PromptCacheRO {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
-	}
-
-	predictOptions = append(predictOptions, llama.WithGrammar(opts.Grammar))
-
-	// Expected absolute path
-	if opts.PromptCachePath != "" {
-		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
-	}
-
-	if opts.Mirostat != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
-	}
-
-	if opts.MirostatETA != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatETA(float64(opts.MirostatETA)))
-	}
-
-	if opts.MirostatTAU != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatTAU(float64(opts.MirostatTAU)))
-	}
-
-	if opts.Debug {
-		predictOptions = append(predictOptions, llama.Debug)
-	}
-
-	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
-
-	if opts.PresencePenalty != 0 {
-		predictOptions = append(predictOptions, llama.SetPenalty(float64(opts.PresencePenalty)))
-	}
-
-	if opts.NKeep != 0 {
-		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
-	}
-
-	if opts.Batch != 0 {
-		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
-	}
-
-	if opts.F16KV {
-		predictOptions = append(predictOptions, llama.EnableF16KV)
-	}
-
-	if opts.IgnoreEOS {
-		predictOptions = append(predictOptions, llama.IgnoreEOS)
-	}
-
-	if opts.Seed != 0 {
-		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
-	}
-
-	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
-
-	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
-	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
-	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
-	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
-	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
-	predictOptions = append(predictOptions, llama.SetTypicalP(float64(opts.TypicalP)))
-	return predictOptions
-}
-
-func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	predictOptions := buildPredictOptions(opts)
-
-	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
-		results <- token
-		return true
-	}))
-
-	go func() {
-		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		close(results)
-	}()
-
-	return nil
-}
-
-func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	predictOptions := buildPredictOptions(opts)
-
-	if len(opts.EmbeddingTokens) > 0 {
-		tokens := []int{}
-		for _, t := range opts.EmbeddingTokens {
-			tokens = append(tokens, int(t))
-		}
-		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
-	}
-
-	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
-}
--- a/pkg/grpc/llm/llama/llama.go
+++ b/pkg/grpc/llm/llama/llama.go
@@ -17,7 +17,29 @@ type LLM struct {
 }

 func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	llamaOpts := []llama.ModelOption{}
+
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+
+	llamaOpts := []llama.ModelOption{
+		llama.WithRopeFreqBase(ropeFreqBase),
+		llama.WithRopeFreqScale(ropeFreqScale),
+	}
+
+	if opts.NGQA != 0 {
+		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
+	}
+
+	if opts.RMSNormEps != 0 {
+		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
+	}

 	if opts.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
@@ -49,18 +71,32 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
 	}

-	model, err := llama.New(opts.Model, llamaOpts...)
+	model, err := llama.New(opts.ModelFile, llamaOpts...)
 	llm.llama = model
 	return err
 }

 func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
 	predictOptions := []llama.PredictOption{
-		llama.SetTemperature(float64(opts.Temperature)),
-		llama.SetTopP(float64(opts.TopP)),
+		llama.SetTemperature(opts.Temperature),
+		llama.SetTopP(opts.TopP),
 		llama.SetTopK(int(opts.TopK)),
 		llama.SetTokens(int(opts.Tokens)),
 		llama.SetThreads(int(opts.Threads)),
+		llama.WithGrammar(opts.Grammar),
+		llama.SetRopeFreqBase(ropeFreqBase),
+		llama.SetRopeFreqScale(ropeFreqScale),
+		llama.SetNegativePromptScale(opts.NegativePromptScale),
+		llama.SetNegativePrompt(opts.NegativePrompt),
 	}

 	if opts.PromptCacheAll {
@@ -81,11 +117,11 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
 	}

 	if opts.MirostatETA != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatETA(float64(opts.MirostatETA)))
+		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
 	}

 	if opts.MirostatTAU != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatTAU(float64(opts.MirostatTAU)))
+		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
 	}

 	if opts.Debug {
@@ -95,7 +131,7 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
 	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))

 	if opts.PresencePenalty != 0 {
-		predictOptions = append(predictOptions, llama.SetPenalty(float64(opts.PresencePenalty)))
+		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
 	}

 	if opts.NKeep != 0 {
@@ -120,13 +156,13 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {

 	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))

-	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
+	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
 	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
 	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
 	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
 	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
-	predictOptions = append(predictOptions, llama.SetTypicalP(float64(opts.TypicalP)))
+	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
+	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
 	return predictOptions
 }

--- a/pkg/grpc/llm/rwkv/rwkv.go
+++ b/pkg/grpc/llm/rwkv/rwkv.go
@@ -20,9 +20,9 @@ type LLM struct {
 }

 func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	modelPath := filepath.Dir(opts.Model)
-	modelFile := filepath.Base(opts.Model)
-	model := rwkv.LoadFiles(opts.Model, filepath.Join(modelPath, modelFile+tokenizerSuffix), uint32(opts.GetThreads()))
+	modelPath := filepath.Dir(opts.ModelFile)
+	modelFile := filepath.Base(opts.ModelFile)
+	model := rwkv.LoadFiles(opts.ModelFile, filepath.Join(modelPath, modelFile+tokenizerSuffix), uint32(opts.GetThreads()))

 	if model == nil {
 		return fmt.Errorf("could not load model")
--- a/pkg/grpc/llm/transformers/dolly.go
+++ b/pkg/grpc/llm/transformers/dolly.go
@@ -18,7 +18,7 @@ type Dolly struct {
 }

 func (llm *Dolly) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewDolly(opts.Model)
+	model, err := transformers.NewDolly(opts.ModelFile)
 	llm.dolly = model
 	return err
 }
--- a/pkg/grpc/llm/transformers/falcon.go
+++ b/pkg/grpc/llm/transformers/falcon.go
@@ -18,7 +18,7 @@ type Falcon struct {
 }

 func (llm *Falcon) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewFalcon(opts.Model)
+	model, err := transformers.NewFalcon(opts.ModelFile)
 	llm.falcon = model
 	return err
 }
--- a/pkg/grpc/llm/transformers/gpt2.go
+++ b/pkg/grpc/llm/transformers/gpt2.go
@@ -18,7 +18,7 @@ type GPT2 struct {
 }

 func (llm *GPT2) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.New(opts.Model)
+	model, err := transformers.New(opts.ModelFile)
 	llm.gpt2 = model
 	return err
 }
--- a/pkg/grpc/llm/transformers/gptj.go
+++ b/pkg/grpc/llm/transformers/gptj.go
@@ -18,7 +18,7 @@ type GPTJ struct {
 }

 func (llm *GPTJ) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewGPTJ(opts.Model)
+	model, err := transformers.NewGPTJ(opts.ModelFile)
 	llm.gptj = model
 	return err
 }
--- a/pkg/grpc/llm/transformers/gptneox.go
+++ b/pkg/grpc/llm/transformers/gptneox.go
@@ -18,7 +18,7 @@ type GPTNeoX struct {
 }

 func (llm *GPTNeoX) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewGPTNeoX(opts.Model)
+	model, err := transformers.NewGPTNeoX(opts.ModelFile)
 	llm.gptneox = model
 	return err
 }
--- a/pkg/grpc/llm/transformers/mpt.go
+++ b/pkg/grpc/llm/transformers/mpt.go
@@ -18,7 +18,7 @@ type MPT struct {
 }

 func (llm *MPT) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewMPT(opts.Model)
+	model, err := transformers.NewMPT(opts.ModelFile)
 	llm.mpt = model
 	return err
 }
--- a/pkg/grpc/llm/transformers/replit.go
+++ b/pkg/grpc/llm/transformers/replit.go
@@ -18,7 +18,7 @@ type Replit struct {
 }

 func (llm *Replit) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewReplit(opts.Model)
+	model, err := transformers.NewReplit(opts.ModelFile)
 	llm.replit = model
 	return err
 }
--- a/pkg/grpc/llm/transformers/starcoder.go
+++ b/pkg/grpc/llm/transformers/starcoder.go
@@ -18,7 +18,7 @@ type Starcoder struct {
 }

 func (llm *Starcoder) Load(opts *pb.ModelOptions) error {
-	model, err := transformers.NewStarcoder(opts.Model)
+	model, err := transformers.NewStarcoder(opts.ModelFile)
 	llm.starcoder = model
 	return err
 }
--- a/pkg/grpc/proto/backend.pb.go
+++ b/pkg/grpc/proto/backend.pb.go
@@ -64,41 +64,45 @@ type PredictOptions struct {
 	sizeCache     protoimpl.SizeCache
 	unknownFields protoimpl.UnknownFields

-	Prompt            string   `protobuf:"bytes,1,opt,name=Prompt,proto3" json:"Prompt,omitempty"`
-	Seed              int32    `protobuf:"varint,2,opt,name=Seed,proto3" json:"Seed,omitempty"`
-	Threads           int32    `protobuf:"varint,3,opt,name=Threads,proto3" json:"Threads,omitempty"`
-	Tokens            int32    `protobuf:"varint,4,opt,name=Tokens,proto3" json:"Tokens,omitempty"`
-	TopK              int32    `protobuf:"varint,5,opt,name=TopK,proto3" json:"TopK,omitempty"`
-	Repeat            int32    `protobuf:"varint,6,opt,name=Repeat,proto3" json:"Repeat,omitempty"`
-	Batch             int32    `protobuf:"varint,7,opt,name=Batch,proto3" json:"Batch,omitempty"`
-	NKeep             int32    `protobuf:"varint,8,opt,name=NKeep,proto3" json:"NKeep,omitempty"`
-	Temperature       float32  `protobuf:"fixed32,9,opt,name=Temperature,proto3" json:"Temperature,omitempty"`
-	Penalty           float32  `protobuf:"fixed32,10,opt,name=Penalty,proto3" json:"Penalty,omitempty"`
-	F16KV             bool     `protobuf:"varint,11,opt,name=F16KV,proto3" json:"F16KV,omitempty"`
-	DebugMode         bool     `protobuf:"varint,12,opt,name=DebugMode,proto3" json:"DebugMode,omitempty"`
-	StopPrompts       []string `protobuf:"bytes,13,rep,name=StopPrompts,proto3" json:"StopPrompts,omitempty"`
-	IgnoreEOS         bool     `protobuf:"varint,14,opt,name=IgnoreEOS,proto3" json:"IgnoreEOS,omitempty"`
-	TailFreeSamplingZ float32  `protobuf:"fixed32,15,opt,name=TailFreeSamplingZ,proto3" json:"TailFreeSamplingZ,omitempty"`
-	TypicalP          float32  `protobuf:"fixed32,16,opt,name=TypicalP,proto3" json:"TypicalP,omitempty"`
-	FrequencyPenalty  float32  `protobuf:"fixed32,17,opt,name=FrequencyPenalty,proto3" json:"FrequencyPenalty,omitempty"`
-	PresencePenalty   float32  `protobuf:"fixed32,18,opt,name=PresencePenalty,proto3" json:"PresencePenalty,omitempty"`
-	Mirostat          int32    `protobuf:"varint,19,opt,name=Mirostat,proto3" json:"Mirostat,omitempty"`
-	MirostatETA       float32  `protobuf:"fixed32,20,opt,name=MirostatETA,proto3" json:"MirostatETA,omitempty"`
-	MirostatTAU       float32  `protobuf:"fixed32,21,opt,name=MirostatTAU,proto3" json:"MirostatTAU,omitempty"`
-	PenalizeNL        bool     `protobuf:"varint,22,opt,name=PenalizeNL,proto3" json:"PenalizeNL,omitempty"`
-	LogitBias         string   `protobuf:"bytes,23,opt,name=LogitBias,proto3" json:"LogitBias,omitempty"`
-	MLock             bool     `protobuf:"varint,25,opt,name=MLock,proto3" json:"MLock,omitempty"`
-	MMap              bool     `protobuf:"varint,26,opt,name=MMap,proto3" json:"MMap,omitempty"`
-	PromptCacheAll    bool     `protobuf:"varint,27,opt,name=PromptCacheAll,proto3" json:"PromptCacheAll,omitempty"`
-	PromptCacheRO     bool     `protobuf:"varint,28,opt,name=PromptCacheRO,proto3" json:"PromptCacheRO,omitempty"`
-	Grammar           string   `protobuf:"bytes,29,opt,name=Grammar,proto3" json:"Grammar,omitempty"`
-	MainGPU           string   `protobuf:"bytes,30,opt,name=MainGPU,proto3" json:"MainGPU,omitempty"`
-	TensorSplit       string   `protobuf:"bytes,31,opt,name=TensorSplit,proto3" json:"TensorSplit,omitempty"`
-	TopP              float32  `protobuf:"fixed32,32,opt,name=TopP,proto3" json:"TopP,omitempty"`
-	PromptCachePath   string   `protobuf:"bytes,33,opt,name=PromptCachePath,proto3" json:"PromptCachePath,omitempty"`
-	Debug             bool     `protobuf:"varint,34,opt,name=Debug,proto3" json:"Debug,omitempty"`
-	EmbeddingTokens   []int32  `protobuf:"varint,35,rep,packed,name=EmbeddingTokens,proto3" json:"EmbeddingTokens,omitempty"`
-	Embeddings        string   `protobuf:"bytes,36,opt,name=Embeddings,proto3" json:"Embeddings,omitempty"`
+	Prompt              string   `protobuf:"bytes,1,opt,name=Prompt,proto3" json:"Prompt,omitempty"`
+	Seed                int32    `protobuf:"varint,2,opt,name=Seed,proto3" json:"Seed,omitempty"`
+	Threads             int32    `protobuf:"varint,3,opt,name=Threads,proto3" json:"Threads,omitempty"`
+	Tokens              int32    `protobuf:"varint,4,opt,name=Tokens,proto3" json:"Tokens,omitempty"`
+	TopK                int32    `protobuf:"varint,5,opt,name=TopK,proto3" json:"TopK,omitempty"`
+	Repeat              int32    `protobuf:"varint,6,opt,name=Repeat,proto3" json:"Repeat,omitempty"`
+	Batch               int32    `protobuf:"varint,7,opt,name=Batch,proto3" json:"Batch,omitempty"`
+	NKeep               int32    `protobuf:"varint,8,opt,name=NKeep,proto3" json:"NKeep,omitempty"`
+	Temperature         float32  `protobuf:"fixed32,9,opt,name=Temperature,proto3" json:"Temperature,omitempty"`
+	Penalty             float32  `protobuf:"fixed32,10,opt,name=Penalty,proto3" json:"Penalty,omitempty"`
+	F16KV               bool     `protobuf:"varint,11,opt,name=F16KV,proto3" json:"F16KV,omitempty"`
+	DebugMode           bool     `protobuf:"varint,12,opt,name=DebugMode,proto3" json:"DebugMode,omitempty"`
+	StopPrompts         []string `protobuf:"bytes,13,rep,name=StopPrompts,proto3" json:"StopPrompts,omitempty"`
+	IgnoreEOS           bool     `protobuf:"varint,14,opt,name=IgnoreEOS,proto3" json:"IgnoreEOS,omitempty"`
+	TailFreeSamplingZ   float32  `protobuf:"fixed32,15,opt,name=TailFreeSamplingZ,proto3" json:"TailFreeSamplingZ,omitempty"`
+	TypicalP            float32  `protobuf:"fixed32,16,opt,name=TypicalP,proto3" json:"TypicalP,omitempty"`
+	FrequencyPenalty    float32  `protobuf:"fixed32,17,opt,name=FrequencyPenalty,proto3" json:"FrequencyPenalty,omitempty"`
+	PresencePenalty     float32  `protobuf:"fixed32,18,opt,name=PresencePenalty,proto3" json:"PresencePenalty,omitempty"`
+	Mirostat            int32    `protobuf:"varint,19,opt,name=Mirostat,proto3" json:"Mirostat,omitempty"`
+	MirostatETA         float32  `protobuf:"fixed32,20,opt,name=MirostatETA,proto3" json:"MirostatETA,omitempty"`
+	MirostatTAU         float32  `protobuf:"fixed32,21,opt,name=MirostatTAU,proto3" json:"MirostatTAU,omitempty"`
+	PenalizeNL          bool     `protobuf:"varint,22,opt,name=PenalizeNL,proto3" json:"PenalizeNL,omitempty"`
+	LogitBias           string   `protobuf:"bytes,23,opt,name=LogitBias,proto3" json:"LogitBias,omitempty"`
+	MLock               bool     `protobuf:"varint,25,opt,name=MLock,proto3" json:"MLock,omitempty"`
+	MMap                bool     `protobuf:"varint,26,opt,name=MMap,proto3" json:"MMap,omitempty"`
+	PromptCacheAll      bool     `protobuf:"varint,27,opt,name=PromptCacheAll,proto3" json:"PromptCacheAll,omitempty"`
+	PromptCacheRO       bool     `protobuf:"varint,28,opt,name=PromptCacheRO,proto3" json:"PromptCacheRO,omitempty"`
+	Grammar             string   `protobuf:"bytes,29,opt,name=Grammar,proto3" json:"Grammar,omitempty"`
+	MainGPU             string   `protobuf:"bytes,30,opt,name=MainGPU,proto3" json:"MainGPU,omitempty"`
+	TensorSplit         string   `protobuf:"bytes,31,opt,name=TensorSplit,proto3" json:"TensorSplit,omitempty"`
+	TopP                float32  `protobuf:"fixed32,32,opt,name=TopP,proto3" json:"TopP,omitempty"`
+	PromptCachePath     string   `protobuf:"bytes,33,opt,name=PromptCachePath,proto3" json:"PromptCachePath,omitempty"`
+	Debug               bool     `protobuf:"varint,34,opt,name=Debug,proto3" json:"Debug,omitempty"`
+	EmbeddingTokens     []int32  `protobuf:"varint,35,rep,packed,name=EmbeddingTokens,proto3" json:"EmbeddingTokens,omitempty"`
+	Embeddings          string   `protobuf:"bytes,36,opt,name=Embeddings,proto3" json:"Embeddings,omitempty"`
+	RopeFreqBase        float32  `protobuf:"fixed32,37,opt,name=RopeFreqBase,proto3" json:"RopeFreqBase,omitempty"`
+	RopeFreqScale       float32  `protobuf:"fixed32,38,opt,name=RopeFreqScale,proto3" json:"RopeFreqScale,omitempty"`
+	NegativePromptScale float32  `protobuf:"fixed32,39,opt,name=NegativePromptScale,proto3" json:"NegativePromptScale,omitempty"`
+	NegativePrompt      string   `protobuf:"bytes,40,opt,name=NegativePrompt,proto3" json:"NegativePrompt,omitempty"`
 }

 func (x *PredictOptions) Reset() {
@@ -378,13 +382,41 @@ func (x *PredictOptions) GetEmbeddings() string {
 	return ""
 }

+func (x *PredictOptions) GetRopeFreqBase() float32 {
+	if x != nil {
+		return x.RopeFreqBase
+	}
+	return 0
+}
+
+func (x *PredictOptions) GetRopeFreqScale() float32 {
+	if x != nil {
+		return x.RopeFreqScale
+	}
+	return 0
+}
+
+func (x *PredictOptions) GetNegativePromptScale() float32 {
+	if x != nil {
+		return x.NegativePromptScale
+	}
+	return 0
+}
+
+func (x *PredictOptions) GetNegativePrompt() string {
+	if x != nil {
+		return x.NegativePrompt
+	}
+	return ""
+}
+
 // The response message containing the result
 type Reply struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
 	unknownFields protoimpl.UnknownFields

-	Message string `protobuf:"bytes,1,opt,name=message,proto3" json:"message,omitempty"`
+	Message []byte `protobuf:"bytes,1,opt,name=message,proto3" json:"message,omitempty"`
 }

 func (x *Reply) Reset() {
@@ -419,11 +451,11 @@ func (*Reply) Descriptor() ([]byte, []int) {
 	return file_pkg_grpc_proto_backend_proto_rawDescGZIP(), []int{2}
 }

-func (x *Reply) GetMessage() string {
+func (x *Reply) GetMessage() []byte {
 	if x != nil {
 		return x.Message
 	}
-	return ""
+	return nil
 }

 type ModelOptions struct {
@@ -431,22 +463,36 @@ type ModelOptions struct {
 	sizeCache     protoimpl.SizeCache
 	unknownFields protoimpl.UnknownFields

-	Model             string `protobuf:"bytes,1,opt,name=Model,proto3" json:"Model,omitempty"`
-	ContextSize       int32  `protobuf:"varint,2,opt,name=ContextSize,proto3" json:"ContextSize,omitempty"`
-	Seed              int32  `protobuf:"varint,3,opt,name=Seed,proto3" json:"Seed,omitempty"`
-	NBatch            int32  `protobuf:"varint,4,opt,name=NBatch,proto3" json:"NBatch,omitempty"`
-	F16Memory         bool   `protobuf:"varint,5,opt,name=F16Memory,proto3" json:"F16Memory,omitempty"`
-	MLock             bool   `protobuf:"varint,6,opt,name=MLock,proto3" json:"MLock,omitempty"`
-	MMap              bool   `protobuf:"varint,7,opt,name=MMap,proto3" json:"MMap,omitempty"`
-	VocabOnly         bool   `protobuf:"varint,8,opt,name=VocabOnly,proto3" json:"VocabOnly,omitempty"`
-	LowVRAM           bool   `protobuf:"varint,9,opt,name=LowVRAM,proto3" json:"LowVRAM,omitempty"`
-	Embeddings        bool   `protobuf:"varint,10,opt,name=Embeddings,proto3" json:"Embeddings,omitempty"`
-	NUMA              bool   `protobuf:"varint,11,opt,name=NUMA,proto3" json:"NUMA,omitempty"`
-	NGPULayers        int32  `protobuf:"varint,12,opt,name=NGPULayers,proto3" json:"NGPULayers,omitempty"`
-	MainGPU           string `protobuf:"bytes,13,opt,name=MainGPU,proto3" json:"MainGPU,omitempty"`
-	TensorSplit       string `protobuf:"bytes,14,opt,name=TensorSplit,proto3" json:"TensorSplit,omitempty"`
-	Threads           int32  `protobuf:"varint,15,opt,name=Threads,proto3" json:"Threads,omitempty"`
-	LibrarySearchPath string `protobuf:"bytes,16,opt,name=LibrarySearchPath,proto3" json:"LibrarySearchPath,omitempty"`
+	Model             string  `protobuf:"bytes,1,opt,name=Model,proto3" json:"Model,omitempty"`
+	ContextSize       int32   `protobuf:"varint,2,opt,name=ContextSize,proto3" json:"ContextSize,omitempty"`
+	Seed              int32   `protobuf:"varint,3,opt,name=Seed,proto3" json:"Seed,omitempty"`
+	NBatch            int32   `protobuf:"varint,4,opt,name=NBatch,proto3" json:"NBatch,omitempty"`
+	F16Memory         bool    `protobuf:"varint,5,opt,name=F16Memory,proto3" json:"F16Memory,omitempty"`
+	MLock             bool    `protobuf:"varint,6,opt,name=MLock,proto3" json:"MLock,omitempty"`
+	MMap              bool    `protobuf:"varint,7,opt,name=MMap,proto3" json:"MMap,omitempty"`
+	VocabOnly         bool    `protobuf:"varint,8,opt,name=VocabOnly,proto3" json:"VocabOnly,omitempty"`
+	LowVRAM           bool    `protobuf:"varint,9,opt,name=LowVRAM,proto3" json:"LowVRAM,omitempty"`
+	Embeddings        bool    `protobuf:"varint,10,opt,name=Embeddings,proto3" json:"Embeddings,omitempty"`
+	NUMA              bool    `protobuf:"varint,11,opt,name=NUMA,proto3" json:"NUMA,omitempty"`
+	NGPULayers        int32   `protobuf:"varint,12,opt,name=NGPULayers,proto3" json:"NGPULayers,omitempty"`
+	MainGPU           string  `protobuf:"bytes,13,opt,name=MainGPU,proto3" json:"MainGPU,omitempty"`
+	TensorSplit       string  `protobuf:"bytes,14,opt,name=TensorSplit,proto3" json:"TensorSplit,omitempty"`
+	Threads           int32   `protobuf:"varint,15,opt,name=Threads,proto3" json:"Threads,omitempty"`
+	LibrarySearchPath string  `protobuf:"bytes,16,opt,name=LibrarySearchPath,proto3" json:"LibrarySearchPath,omitempty"`
+	RopeFreqBase      float32 `protobuf:"fixed32,17,opt,name=RopeFreqBase,proto3" json:"RopeFreqBase,omitempty"`
+	RopeFreqScale     float32 `protobuf:"fixed32,18,opt,name=RopeFreqScale,proto3" json:"RopeFreqScale,omitempty"`
+	RMSNormEps        float32 `protobuf:"fixed32,19,opt,name=RMSNormEps,proto3" json:"RMSNormEps,omitempty"`
+	NGQA              int32   `protobuf:"varint,20,opt,name=NGQA,proto3" json:"NGQA,omitempty"`
+	ModelFile         string  `protobuf:"bytes,21,opt,name=ModelFile,proto3" json:"ModelFile,omitempty"`
+	// AutoGPTQ
+	Device           string `protobuf:"bytes,22,opt,name=Device,proto3" json:"Device,omitempty"`
+	UseTriton        bool   `protobuf:"varint,23,opt,name=UseTriton,proto3" json:"UseTriton,omitempty"`
+	ModelBaseName    string `protobuf:"bytes,24,opt,name=ModelBaseName,proto3" json:"ModelBaseName,omitempty"`
+	UseFastTokenizer bool   `protobuf:"varint,25,opt,name=UseFastTokenizer,proto3" json:"UseFastTokenizer,omitempty"`
+	// Diffusers
+	PipelineType  string `protobuf:"bytes,26,opt,name=PipelineType,proto3" json:"PipelineType,omitempty"`
+	SchedulerType string `protobuf:"bytes,27,opt,name=SchedulerType,proto3" json:"SchedulerType,omitempty"`
+	CUDA          bool   `protobuf:"varint,28,opt,name=CUDA,proto3" json:"CUDA,omitempty"`
 }

 func (x *ModelOptions) Reset() {
@@ -593,6 +639,90 @@ func (x *ModelOptions) GetLibrarySearchPath() string {
 	return ""
 }

+func (x *ModelOptions) GetRopeFreqBase() float32 {
+	if x != nil {
+		return x.RopeFreqBase
+	}
+	return 0
+}
+
+func (x *ModelOptions) GetRopeFreqScale() float32 {
+	if x != nil {
+		return x.RopeFreqScale
+	}
+	return 0
+}
+
+func (x *ModelOptions) GetRMSNormEps() float32 {
+	if x != nil {
+		return x.RMSNormEps
+	}
+	return 0
+}
+
+func (x *ModelOptions) GetNGQA() int32 {
+	if x != nil {
+		return x.NGQA
+	}
+	return 0
+}
+
+func (x *ModelOptions) GetModelFile() string {
+	if x != nil {
+		return x.ModelFile
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetDevice() string {
+	if x != nil {
+		return x.Device
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetUseTriton() bool {
+	if x != nil {
+		return x.UseTriton
+	}
+	return false
+}
+
+func (x *ModelOptions) GetModelBaseName() string {
+	if x != nil {
+		return x.ModelBaseName
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetUseFastTokenizer() bool {
+	if x != nil {
+		return x.UseFastTokenizer
+	}
+	return false
+}
+
+func (x *ModelOptions) GetPipelineType() string {
+	if x != nil {
+		return x.PipelineType
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetSchedulerType() string {
+	if x != nil {
+		return x.SchedulerType
+	}
+	return ""
+}
+
+func (x *ModelOptions) GetCUDA() bool {
+	if x != nil {
+		return x.CUDA
+	}
+	return false
+}
+
 type Result struct {
 	state         protoimpl.MessageState
 	sizeCache     protoimpl.SizeCache
@@ -1064,7 +1194,7 @@ var file_pkg_grpc_proto_backend_proto_rawDesc = []byte{
 	0x0a, 0x1c, 0x70, 0x6b, 0x67, 0x2f, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f,
 	0x2f, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x07,
 	0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x22, 0x0f, 0x0a, 0x0d, 0x48, 0x65, 0x61, 0x6c, 0x74,
-	0x68, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0xa0, 0x08, 0x0a, 0x0e, 0x50, 0x72, 0x65,
+	0x68, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0xc4, 0x09, 0x0a, 0x0e, 0x50, 0x72, 0x65,
 	0x64, 0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x50,
 	0x72, 0x6f, 0x6d, 0x70, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x50, 0x72, 0x6f,
 	0x6d, 0x70, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x53, 0x65, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28,
@@ -1130,121 +1260,155 @@ var file_pkg_grpc_proto_backend_proto_rawDesc = []byte{
 	0x6b, 0x65, 0x6e, 0x73, 0x18, 0x23, 0x20, 0x03, 0x28, 0x05, 0x52, 0x0f, 0x45, 0x6d, 0x62, 0x65,
 	0x64, 0x64, 0x69, 0x6e, 0x67, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x12, 0x1e, 0x0a, 0x0a, 0x45,
 	0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x18, 0x24, 0x20, 0x01, 0x28, 0x09, 0x52,
-	0x0a, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x21, 0x0a, 0x05, 0x52,
-	0x65, 0x70, 0x6c, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18,
-	0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0xca,
-	0x03, 0x0a, 0x0c, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12,
-	0x14, 0x0a, 0x05, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05,
-	0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x20, 0x0a, 0x0b, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74,
-	0x53, 0x69, 0x7a, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b, 0x43, 0x6f, 0x6e, 0x74,
-	0x65, 0x78, 0x74, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x53, 0x65, 0x65, 0x64, 0x18,
-	0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x53, 0x65, 0x65, 0x64, 0x12, 0x16, 0x0a, 0x06, 0x4e,
-	0x42, 0x61, 0x74, 0x63, 0x68, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x4e, 0x42, 0x61,
-	0x74, 0x63, 0x68, 0x12, 0x1c, 0x0a, 0x09, 0x46, 0x31, 0x36, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79,
-	0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x46, 0x31, 0x36, 0x4d, 0x65, 0x6d, 0x6f, 0x72,
-	0x79, 0x12, 0x14, 0x0a, 0x05, 0x4d, 0x4c, 0x6f, 0x63, 0x6b, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08,
-	0x52, 0x05, 0x4d, 0x4c, 0x6f, 0x63, 0x6b, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x4d, 0x61, 0x70, 0x18,
-	0x07, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, 0x4d, 0x4d, 0x61, 0x70, 0x12, 0x1c, 0x0a, 0x09, 0x56,
-	0x6f, 0x63, 0x61, 0x62, 0x4f, 0x6e, 0x6c, 0x79, 0x18, 0x08, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09,
-	0x56, 0x6f, 0x63, 0x61, 0x62, 0x4f, 0x6e, 0x6c, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x4c, 0x6f, 0x77,
-	0x56, 0x52, 0x41, 0x4d, 0x18, 0x09, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x4c, 0x6f, 0x77, 0x56,
-	0x52, 0x41, 0x4d, 0x12, 0x1e, 0x0a, 0x0a, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67,
-	0x73, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69,
-	0x6e, 0x67, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x4e, 0x55, 0x4d, 0x41, 0x18, 0x0b, 0x20, 0x01, 0x28,
-	0x08, 0x52, 0x04, 0x4e, 0x55, 0x4d, 0x41, 0x12, 0x1e, 0x0a, 0x0a, 0x4e, 0x47, 0x50, 0x55, 0x4c,
-	0x61, 0x79, 0x65, 0x72, 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x4e, 0x47, 0x50,
-	0x55, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x4d, 0x61, 0x69, 0x6e, 0x47,
-	0x50, 0x55, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x4d, 0x61, 0x69, 0x6e, 0x47, 0x50,
-	0x55, 0x12, 0x20, 0x0a, 0x0b, 0x54, 0x65, 0x6e, 0x73, 0x6f, 0x72, 0x53, 0x70, 0x6c, 0x69, 0x74,
-	0x18, 0x0e, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x54, 0x65, 0x6e, 0x73, 0x6f, 0x72, 0x53, 0x70,
-	0x6c, 0x69, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x18, 0x0f,
-	0x20, 0x01, 0x28, 0x05, 0x52, 0x07, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x12, 0x2c, 0x0a,
-	0x11, 0x4c, 0x69, 0x62, 0x72, 0x61, 0x72, 0x79, 0x53, 0x65, 0x61, 0x72, 0x63, 0x68, 0x50, 0x61,
-	0x74, 0x68, 0x18, 0x10, 0x20, 0x01, 0x28, 0x09, 0x52, 0x11, 0x4c, 0x69, 0x62, 0x72, 0x61, 0x72,
-	0x79, 0x53, 0x65, 0x61, 0x72, 0x63, 0x68, 0x50, 0x61, 0x74, 0x68, 0x22, 0x3c, 0x0a, 0x06, 0x52,
-	0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
-	0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x12,
-	0x18, 0x0a, 0x07, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08,
-	0x52, 0x07, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x22, 0x31, 0x0a, 0x0f, 0x45, 0x6d, 0x62,
-	0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x1e, 0x0a, 0x0a,
-	0x65, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x02,
-	0x52, 0x0a, 0x65, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x5b, 0x0a, 0x11,
-	0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73,
-	0x74, 0x12, 0x10, 0x0a, 0x03, 0x64, 0x73, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03,
-	0x64, 0x73, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x6c, 0x61, 0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, 0x18,
-	0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x6c, 0x61, 0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, 0x12,
-	0x18, 0x0a, 0x07, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d,
-	0x52, 0x07, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x22, 0x5e, 0x0a, 0x10, 0x54, 0x72, 0x61,
-	0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x36, 0x0a,
-	0x08, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32,
+	0x0a, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x12, 0x22, 0x0a, 0x0c, 0x52,
+	0x6f, 0x70, 0x65, 0x46, 0x72, 0x65, 0x71, 0x42, 0x61, 0x73, 0x65, 0x18, 0x25, 0x20, 0x01, 0x28,
+	0x02, 0x52, 0x0c, 0x52, 0x6f, 0x70, 0x65, 0x46, 0x72, 0x65, 0x71, 0x42, 0x61, 0x73, 0x65, 0x12,
+	0x24, 0x0a, 0x0d, 0x52, 0x6f, 0x70, 0x65, 0x46, 0x72, 0x65, 0x71, 0x53, 0x63, 0x61, 0x6c, 0x65,
+	0x18, 0x26, 0x20, 0x01, 0x28, 0x02, 0x52, 0x0d, 0x52, 0x6f, 0x70, 0x65, 0x46, 0x72, 0x65, 0x71,
+	0x53, 0x63, 0x61, 0x6c, 0x65, 0x12, 0x30, 0x0a, 0x13, 0x4e, 0x65, 0x67, 0x61, 0x74, 0x69, 0x76,
+	0x65, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x53, 0x63, 0x61, 0x6c, 0x65, 0x18, 0x27, 0x20, 0x01,
+	0x28, 0x02, 0x52, 0x13, 0x4e, 0x65, 0x67, 0x61, 0x74, 0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x6d,
+	0x70, 0x74, 0x53, 0x63, 0x61, 0x6c, 0x65, 0x12, 0x26, 0x0a, 0x0e, 0x4e, 0x65, 0x67, 0x61, 0x74,
+	0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x18, 0x28, 0x20, 0x01, 0x28, 0x09, 0x52,
+	0x0e, 0x4e, 0x65, 0x67, 0x61, 0x74, 0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22,
+	0x21, 0x0a, 0x05, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73,
+	0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61,
+	0x67, 0x65, 0x22, 0xcc, 0x06, 0x0a, 0x0c, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x4f, 0x70, 0x74, 0x69,
+	0x6f, 0x6e, 0x73, 0x12, 0x14, 0x0a, 0x05, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x18, 0x01, 0x20, 0x01,
+	0x28, 0x09, 0x52, 0x05, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x20, 0x0a, 0x0b, 0x43, 0x6f, 0x6e,
+	0x74, 0x65, 0x78, 0x74, 0x53, 0x69, 0x7a, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b,
+	0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x53,
+	0x65, 0x65, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x53, 0x65, 0x65, 0x64, 0x12,
+	0x16, 0x0a, 0x06, 0x4e, 0x42, 0x61, 0x74, 0x63, 0x68, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52,
+	0x06, 0x4e, 0x42, 0x61, 0x74, 0x63, 0x68, 0x12, 0x1c, 0x0a, 0x09, 0x46, 0x31, 0x36, 0x4d, 0x65,
+	0x6d, 0x6f, 0x72, 0x79, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x46, 0x31, 0x36, 0x4d,
+	0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x4d, 0x4c, 0x6f, 0x63, 0x6b, 0x18, 0x06,
+	0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x4d, 0x4c, 0x6f, 0x63, 0x6b, 0x12, 0x12, 0x0a, 0x04, 0x4d,
+	0x4d, 0x61, 0x70, 0x18, 0x07, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, 0x4d, 0x4d, 0x61, 0x70, 0x12,
+	0x1c, 0x0a, 0x09, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x4f, 0x6e, 0x6c, 0x79, 0x18, 0x08, 0x20, 0x01,
+	0x28, 0x08, 0x52, 0x09, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x4f, 0x6e, 0x6c, 0x79, 0x12, 0x18, 0x0a,
+	0x07, 0x4c, 0x6f, 0x77, 0x56, 0x52, 0x41, 0x4d, 0x18, 0x09, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07,
+	0x4c, 0x6f, 0x77, 0x56, 0x52, 0x41, 0x4d, 0x12, 0x1e, 0x0a, 0x0a, 0x45, 0x6d, 0x62, 0x65, 0x64,
+	0x64, 0x69, 0x6e, 0x67, 0x73, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, 0x45, 0x6d, 0x62,
+	0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x4e, 0x55, 0x4d, 0x41, 0x18,
+	0x0b, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, 0x4e, 0x55, 0x4d, 0x41, 0x12, 0x1e, 0x0a, 0x0a, 0x4e,
+	0x47, 0x50, 0x55, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x05, 0x52,
+	0x0a, 0x4e, 0x47, 0x50, 0x55, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x4d,
+	0x61, 0x69, 0x6e, 0x47, 0x50, 0x55, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x4d, 0x61,
+	0x69, 0x6e, 0x47, 0x50, 0x55, 0x12, 0x20, 0x0a, 0x0b, 0x54, 0x65, 0x6e, 0x73, 0x6f, 0x72, 0x53,
+	0x70, 0x6c, 0x69, 0x74, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x54, 0x65, 0x6e, 0x73,
+	0x6f, 0x72, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x54, 0x68, 0x72, 0x65, 0x61,
+	0x64, 0x73, 0x18, 0x0f, 0x20, 0x01, 0x28, 0x05, 0x52, 0x07, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64,
+	0x73, 0x12, 0x2c, 0x0a, 0x11, 0x4c, 0x69, 0x62, 0x72, 0x61, 0x72, 0x79, 0x53, 0x65, 0x61, 0x72,
+	0x63, 0x68, 0x50, 0x61, 0x74, 0x68, 0x18, 0x10, 0x20, 0x01, 0x28, 0x09, 0x52, 0x11, 0x4c, 0x69,
+	0x62, 0x72, 0x61, 0x72, 0x79, 0x53, 0x65, 0x61, 0x72, 0x63, 0x68, 0x50, 0x61, 0x74, 0x68, 0x12,
+	0x22, 0x0a, 0x0c, 0x52, 0x6f, 0x70, 0x65, 0x46, 0x72, 0x65, 0x71, 0x42, 0x61, 0x73, 0x65, 0x18,
+	0x11, 0x20, 0x01, 0x28, 0x02, 0x52, 0x0c, 0x52, 0x6f, 0x70, 0x65, 0x46, 0x72, 0x65, 0x71, 0x42,
+	0x61, 0x73, 0x65, 0x12, 0x24, 0x0a, 0x0d, 0x52, 0x6f, 0x70, 0x65, 0x46, 0x72, 0x65, 0x71, 0x53,
+	0x63, 0x61, 0x6c, 0x65, 0x18, 0x12, 0x20, 0x01, 0x28, 0x02, 0x52, 0x0d, 0x52, 0x6f, 0x70, 0x65,
+	0x46, 0x72, 0x65, 0x71, 0x53, 0x63, 0x61, 0x6c, 0x65, 0x12, 0x1e, 0x0a, 0x0a, 0x52, 0x4d, 0x53,
+	0x4e, 0x6f, 0x72, 0x6d, 0x45, 0x70, 0x73, 0x18, 0x13, 0x20, 0x01, 0x28, 0x02, 0x52, 0x0a, 0x52,
+	0x4d, 0x53, 0x4e, 0x6f, 0x72, 0x6d, 0x45, 0x70, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x4e, 0x47, 0x51,
+	0x41, 0x18, 0x14, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x4e, 0x47, 0x51, 0x41, 0x12, 0x1c, 0x0a,
+	0x09, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x46, 0x69, 0x6c, 0x65, 0x18, 0x15, 0x20, 0x01, 0x28, 0x09,
+	0x52, 0x09, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x44,
+	0x65, 0x76, 0x69, 0x63, 0x65, 0x18, 0x16, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x44, 0x65, 0x76,
+	0x69, 0x63, 0x65, 0x12, 0x1c, 0x0a, 0x09, 0x55, 0x73, 0x65, 0x54, 0x72, 0x69, 0x74, 0x6f, 0x6e,
+	0x18, 0x17, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x55, 0x73, 0x65, 0x54, 0x72, 0x69, 0x74, 0x6f,
+	0x6e, 0x12, 0x24, 0x0a, 0x0d, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x42, 0x61, 0x73, 0x65, 0x4e, 0x61,
+	0x6d, 0x65, 0x18, 0x18, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x42,
+	0x61, 0x73, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x2a, 0x0a, 0x10, 0x55, 0x73, 0x65, 0x46, 0x61,
+	0x73, 0x74, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x65, 0x72, 0x18, 0x19, 0x20, 0x01, 0x28,
+	0x08, 0x52, 0x10, 0x55, 0x73, 0x65, 0x46, 0x61, 0x73, 0x74, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x69,
+	0x7a, 0x65, 0x72, 0x12, 0x22, 0x0a, 0x0c, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x54,
+	0x79, 0x70, 0x65, 0x18, 0x1a, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x50, 0x69, 0x70, 0x65, 0x6c,
+	0x69, 0x6e, 0x65, 0x54, 0x79, 0x70, 0x65, 0x12, 0x24, 0x0a, 0x0d, 0x53, 0x63, 0x68, 0x65, 0x64,
+	0x75, 0x6c, 0x65, 0x72, 0x54, 0x79, 0x70, 0x65, 0x18, 0x1b, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d,
+	0x53, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x54, 0x79, 0x70, 0x65, 0x12, 0x12, 0x0a,
+	0x04, 0x43, 0x55, 0x44, 0x41, 0x18, 0x1c, 0x20, 0x01, 0x28, 0x08, 0x52, 0x04, 0x43, 0x55, 0x44,
+	0x41, 0x22, 0x3c, 0x0a, 0x06, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x6d,
+	0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65,
+	0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73,
+	0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x22,
+	0x31, 0x0a, 0x0f, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x75,
+	0x6c, 0x74, 0x12, 0x1e, 0x0a, 0x0a, 0x65, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73,
+	0x18, 0x01, 0x20, 0x03, 0x28, 0x02, 0x52, 0x0a, 0x65, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e,
+	0x67, 0x73, 0x22, 0x5b, 0x0a, 0x11, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74,
+	0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x64, 0x73, 0x74, 0x18, 0x02,
+	0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x64, 0x73, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x6c, 0x61, 0x6e,
+	0x67, 0x75, 0x61, 0x67, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x6c, 0x61, 0x6e,
+	0x67, 0x75, 0x61, 0x67, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73,
+	0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x74, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x22,
+	0x5e, 0x0a, 0x10, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x52, 0x65, 0x73,
+	0x75, 0x6c, 0x74, 0x12, 0x36, 0x0a, 0x08, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x18,
+	0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e,
+	0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e,
+	0x74, 0x52, 0x08, 0x73, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x74,
+	0x65, 0x78, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78, 0x74, 0x22,
+	0x77, 0x0a, 0x11, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x53, 0x65, 0x67,
+	0x6d, 0x65, 0x6e, 0x74, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05,
+	0x52, 0x02, 0x69, 0x64, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x72, 0x74, 0x18, 0x02, 0x20,
+	0x01, 0x28, 0x03, 0x52, 0x05, 0x73, 0x74, 0x61, 0x72, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x65, 0x6e,
+	0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x03, 0x52, 0x03, 0x65, 0x6e, 0x64, 0x12, 0x12, 0x0a, 0x04,
+	0x74, 0x65, 0x78, 0x74, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78, 0x74,
+	0x12, 0x16, 0x0a, 0x06, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x05,
+	0x52, 0x06, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x22, 0xe4, 0x01, 0x0a, 0x14, 0x47, 0x65, 0x6e,
+	0x65, 0x72, 0x61, 0x74, 0x65, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73,
+	0x74, 0x12, 0x16, 0x0a, 0x06, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28,
+	0x05, 0x52, 0x06, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x77, 0x69, 0x64,
+	0x74, 0x68, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x77, 0x69, 0x64, 0x74, 0x68, 0x12,
+	0x12, 0x0a, 0x04, 0x6d, 0x6f, 0x64, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x6d,
+	0x6f, 0x64, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x73, 0x74, 0x65, 0x70, 0x18, 0x04, 0x20, 0x01, 0x28,
+	0x05, 0x52, 0x04, 0x73, 0x74, 0x65, 0x70, 0x12, 0x12, 0x0a, 0x04, 0x73, 0x65, 0x65, 0x64, 0x18,
+	0x05, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x73, 0x65, 0x65, 0x64, 0x12, 0x27, 0x0a, 0x0f, 0x70,
+	0x6f, 0x73, 0x69, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x18, 0x06,
+	0x20, 0x01, 0x28, 0x09, 0x52, 0x0e, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x76, 0x65, 0x50, 0x72,
+	0x6f, 0x6d, 0x70, 0x74, 0x12, 0x27, 0x0a, 0x0f, 0x6e, 0x65, 0x67, 0x61, 0x74, 0x69, 0x76, 0x65,
+	0x5f, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0e, 0x6e,
+	0x65, 0x67, 0x61, 0x74, 0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x12, 0x10, 0x0a,
+	0x03, 0x64, 0x73, 0x74, 0x18, 0x08, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x64, 0x73, 0x74, 0x22,
+	0x48, 0x0a, 0x0a, 0x54, 0x54, 0x53, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x12, 0x0a,
+	0x04, 0x74, 0x65, 0x78, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78,
+	0x74, 0x12, 0x14, 0x0a, 0x05, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09,
+	0x52, 0x05, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x10, 0x0a, 0x03, 0x64, 0x73, 0x74, 0x18, 0x03,
+	0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x64, 0x73, 0x74, 0x32, 0xeb, 0x03, 0x0a, 0x07, 0x42, 0x61,
+	0x63, 0x6b, 0x65, 0x6e, 0x64, 0x12, 0x32, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12,
+	0x16, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68,
+	0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x1a, 0x0e, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e,
+	0x64, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x34, 0x0a, 0x07, 0x50, 0x72, 0x65,
+	0x64, 0x69, 0x63, 0x74, 0x12, 0x17, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x50,
+	0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x0e, 0x2e,
+	0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12,
+	0x35, 0x0a, 0x09, 0x4c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x15, 0x2e, 0x62,
+	0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x4f, 0x70, 0x74, 0x69,
+	0x6f, 0x6e, 0x73, 0x1a, 0x0f, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52, 0x65,
+	0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, 0x3c, 0x0a, 0x0d, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63,
+	0x74, 0x53, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x12, 0x17, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e,
+	0x64, 0x2e, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73,
+	0x1a, 0x0e, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x79,
+	0x22, 0x00, 0x30, 0x01, 0x12, 0x40, 0x0a, 0x09, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e,
+	0x67, 0x12, 0x17, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x50, 0x72, 0x65, 0x64,
+	0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x18, 0x2e, 0x62, 0x61, 0x63,
+	0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x52, 0x65,
+	0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, 0x41, 0x0a, 0x0d, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61,
+	0x74, 0x65, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x12, 0x1d, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e,
+	0x64, 0x2e, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x52,
+	0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x0f, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64,
+	0x2e, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, 0x4d, 0x0a, 0x12, 0x41, 0x75, 0x64,
+	0x69, 0x6f, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x12,
 	0x1a, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63,
-	0x72, 0x69, 0x70, 0x74, 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x08, 0x73, 0x65, 0x67,
-	0x6d, 0x65, 0x6e, 0x74, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x74, 0x65, 0x78, 0x74, 0x18, 0x02, 0x20,
-	0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78, 0x74, 0x22, 0x77, 0x0a, 0x11, 0x54, 0x72, 0x61,
-	0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x53, 0x65, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x0e,
-	0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x69, 0x64, 0x12, 0x14,
-	0x0a, 0x05, 0x73, 0x74, 0x61, 0x72, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, 0x05, 0x73,
-	0x74, 0x61, 0x72, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x65, 0x6e, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28,
-	0x03, 0x52, 0x03, 0x65, 0x6e, 0x64, 0x12, 0x12, 0x0a, 0x04, 0x74, 0x65, 0x78, 0x74, 0x18, 0x04,
-	0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x74, 0x6f,
-	0x6b, 0x65, 0x6e, 0x73, 0x18, 0x05, 0x20, 0x03, 0x28, 0x05, 0x52, 0x06, 0x74, 0x6f, 0x6b, 0x65,
-	0x6e, 0x73, 0x22, 0xe4, 0x01, 0x0a, 0x14, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x49,
-	0x6d, 0x61, 0x67, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x16, 0x0a, 0x06, 0x68,
-	0x65, 0x69, 0x67, 0x68, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x68, 0x65, 0x69,
-	0x67, 0x68, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x77, 0x69, 0x64, 0x74, 0x68, 0x18, 0x02, 0x20, 0x01,
-	0x28, 0x05, 0x52, 0x05, 0x77, 0x69, 0x64, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x6d, 0x6f, 0x64,
-	0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x6d, 0x6f, 0x64, 0x65, 0x12, 0x12, 0x0a,
-	0x04, 0x73, 0x74, 0x65, 0x70, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x73, 0x74, 0x65,
-	0x70, 0x12, 0x12, 0x0a, 0x04, 0x73, 0x65, 0x65, 0x64, 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52,
-	0x04, 0x73, 0x65, 0x65, 0x64, 0x12, 0x27, 0x0a, 0x0f, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x76,
-	0x65, 0x5f, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0e,
-	0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x12, 0x27,
-	0x0a, 0x0f, 0x6e, 0x65, 0x67, 0x61, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-	0x74, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0e, 0x6e, 0x65, 0x67, 0x61, 0x74, 0x69, 0x76,
-	0x65, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x64, 0x73, 0x74, 0x18, 0x08,
-	0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x64, 0x73, 0x74, 0x22, 0x48, 0x0a, 0x0a, 0x54, 0x54, 0x53,
-	0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x74, 0x65, 0x78, 0x74, 0x18,
-	0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x74, 0x65, 0x78, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x6d,
-	0x6f, 0x64, 0x65, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x6d, 0x6f, 0x64, 0x65,
-	0x6c, 0x12, 0x10, 0x0a, 0x03, 0x64, 0x73, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03,
-	0x64, 0x73, 0x74, 0x32, 0xeb, 0x03, 0x0a, 0x07, 0x42, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x12,
-	0x32, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x16, 0x2e, 0x62, 0x61, 0x63, 0x6b,
-	0x65, 0x6e, 0x64, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67,
-	0x65, 0x1a, 0x0e, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52, 0x65, 0x70, 0x6c,
-	0x79, 0x22, 0x00, 0x12, 0x34, 0x0a, 0x07, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x12, 0x17,
-	0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74,
-	0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x0e, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e,
-	0x64, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x35, 0x0a, 0x09, 0x4c, 0x6f, 0x61,
-	0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x15, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64,
-	0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x0f, 0x2e,
-	0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00,
-	0x12, 0x3c, 0x0a, 0x0d, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x53, 0x74, 0x72, 0x65, 0x61,
-	0x6d, 0x12, 0x17, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x50, 0x72, 0x65, 0x64,
-	0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x0e, 0x2e, 0x62, 0x61, 0x63,
-	0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x30, 0x01, 0x12, 0x40,
-	0x0a, 0x09, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x12, 0x17, 0x2e, 0x62, 0x61,
-	0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x4f, 0x70, 0x74,
-	0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x18, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x45,
-	0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00,
-	0x12, 0x41, 0x0a, 0x0d, 0x47, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x49, 0x6d, 0x61, 0x67,
-	0x65, 0x12, 0x1d, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x47, 0x65, 0x6e, 0x65,
-	0x72, 0x61, 0x74, 0x65, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74,
-	0x1a, 0x0f, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52, 0x65, 0x73, 0x75, 0x6c,
-	0x74, 0x22, 0x00, 0x12, 0x4d, 0x0a, 0x12, 0x41, 0x75, 0x64, 0x69, 0x6f, 0x54, 0x72, 0x61, 0x6e,
-	0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x1a, 0x2e, 0x62, 0x61, 0x63, 0x6b,
-	0x65, 0x6e, 0x64, 0x2e, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x52, 0x65,
-	0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x19, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e,
-	0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74,
-	0x22, 0x00, 0x12, 0x2d, 0x0a, 0x03, 0x54, 0x54, 0x53, 0x12, 0x13, 0x2e, 0x62, 0x61, 0x63, 0x6b,
-	0x65, 0x6e, 0x64, 0x2e, 0x54, 0x54, 0x53, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x0f,
-	0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22,
-	0x00, 0x42, 0x5a, 0x0a, 0x19, 0x69, 0x6f, 0x2e, 0x73, 0x6b, 0x79, 0x6e, 0x65, 0x74, 0x2e, 0x6c,
-	0x6f, 0x63, 0x61, 0x6c, 0x61, 0x69, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x42, 0x0e,
-	0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x41, 0x49, 0x42, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x50, 0x01,
-	0x5a, 0x2b, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x6f, 0x2d,
-	0x73, 0x6b, 0x79, 0x6e, 0x65, 0x74, 0x2f, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x41, 0x49, 0x2f, 0x70,
-	0x6b, 0x67, 0x2f, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x06, 0x70,
-	0x72, 0x6f, 0x74, 0x6f, 0x33,
+	0x72, 0x69, 0x70, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x19, 0x2e, 0x62, 0x61,
+	0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74,
+	0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, 0x2d, 0x0a, 0x03, 0x54, 0x54, 0x53, 0x12,
+	0x13, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x54, 0x54, 0x53, 0x52, 0x65, 0x71,
+	0x75, 0x65, 0x73, 0x74, 0x1a, 0x0f, 0x2e, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x52,
+	0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x42, 0x5a, 0x0a, 0x19, 0x69, 0x6f, 0x2e, 0x73, 0x6b,
+	0x79, 0x6e, 0x65, 0x74, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x61, 0x69, 0x2e, 0x62, 0x61, 0x63,
+	0x6b, 0x65, 0x6e, 0x64, 0x42, 0x0e, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x41, 0x49, 0x42, 0x61, 0x63,
+	0x6b, 0x65, 0x6e, 0x64, 0x50, 0x01, 0x5a, 0x2b, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63,
+	0x6f, 0x6d, 0x2f, 0x67, 0x6f, 0x2d, 0x73, 0x6b, 0x79, 0x6e, 0x65, 0x74, 0x2f, 0x4c, 0x6f, 0x63,
+	0x61, 0x6c, 0x41, 0x49, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x70, 0x72,
+	0x6f, 0x74, 0x6f, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
 }

 var (
--- a/pkg/grpc/proto/backend.proto
+++ b/pkg/grpc/proto/backend.proto
@@ -57,11 +57,15 @@ message PredictOptions {
  bool Debug = 34;
  repeated int32 EmbeddingTokens = 35;
  string Embeddings = 36;
+  float RopeFreqBase = 37;
+  float RopeFreqScale = 38;
+  float NegativePromptScale = 39;
+  string NegativePrompt = 40;
 }

 // The response message containing the result
 message Reply {
-  string message = 1;
+  bytes message = 1;
 }

 message ModelOptions {
@@ -81,6 +85,22 @@ message ModelOptions {
  string TensorSplit = 14;
  int32 Threads = 15;
  string LibrarySearchPath = 16;
+  float RopeFreqBase = 17;
+  float RopeFreqScale = 18;
+  float RMSNormEps = 19;
+  int32 NGQA = 20;
+  string ModelFile = 21;
+
+  // AutoGPTQ
+  string Device = 22;
+  bool UseTriton = 23;
+  string ModelBaseName = 24;
+  bool UseFastTokenizer = 25;
+
+  // Diffusers
+  string PipelineType = 26;
+  string SchedulerType = 27;
+  bool CUDA = 28;
 }

 message Result {
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@@ -26,7 +26,7 @@ type server struct {
 }

 func (s *server) Health(ctx context.Context, in *pb.HealthMessage) (*pb.Reply, error) {
-	return &pb.Reply{Message: "OK"}, nil
+	return newReply("OK"), nil
 }

 func (s *server) Embedding(ctx context.Context, in *pb.PredictOptions) (*pb.EmbeddingResult, error) {
@@ -48,7 +48,7 @@ func (s *server) LoadModel(ctx context.Context, in *pb.ModelOptions) (*pb.Result

 func (s *server) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.Reply, error) {
 	result, err := s.llm.Predict(in)
-	return &pb.Reply{Message: result}, err
+	return newReply(result), err
 }

 func (s *server) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest) (*pb.Result, error) {
@@ -99,7 +99,7 @@ func (s *server) PredictStream(in *pb.PredictOptions, stream pb.Backend_PredictS
 	done := make(chan bool)
 	go func() {
 		for result := range resultChan {
-			stream.Send(&pb.Reply{Message: result})
+			stream.Send(newReply(result))
 		}
 		done <- true
 	}()
--- a/pkg/grpc/transcribe/whisper.go
+++ b/pkg/grpc/transcribe/whisper.go
@@ -17,7 +17,7 @@ type Whisper struct {

 func (sd *Whisper) Load(opts *pb.ModelOptions) error {
 	// Note: the Model here is a path to a directory containing the model files
-	w, err := whisper.New(opts.Model)
+	w, err := whisper.New(opts.ModelFile)
 	sd.whisper = w
 	return err
 }
--- a/pkg/grpc/tts/piper.go
+++ b/pkg/grpc/tts/piper.go
@@ -18,8 +18,8 @@ type Piper struct {
 }

 func (sd *Piper) Load(opts *pb.ModelOptions) error {
-	if filepath.Ext(opts.Model) != ".onnx" {
-		return fmt.Errorf("unsupported model type %s (should end with .onnx)", opts.Model)
+	if filepath.Ext(opts.ModelFile) != ".onnx" {
+		return fmt.Errorf("unsupported model type %s (should end with .onnx)", opts.ModelFile)
 	}
 	var err error
 	// Note: the Model here is a path to a directory containing the model files
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -35,7 +35,6 @@ const (
 	Gpt4All             = "gpt4all"
 	FalconBackend       = "falcon"
 	FalconGGMLBackend   = "falcon-ggml"
-	LlamaGrammarBackend = "llama-grammar"

 	BertEmbeddingsBackend  = "bert-embeddings"
 	RwkvBackend            = "rwkv"
@@ -51,7 +50,6 @@ var AutoLoadBackends []string = []string{
 	FalconBackend,
 	GPTNeoXBackend,
 	BertEmbeddingsBackend,
-	LlamaGrammarBackend,
 	FalconGGMLBackend,
 	GPTJBackend,
 	Gpt2Backend,
@@ -78,14 +76,16 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 		return err
 	}

-	log.Debug().Msgf("Loading GRPC Process", grpcProcess)
+	log.Debug().Msgf("Loading GRPC Process: %s", grpcProcess)

 	log.Debug().Msgf("GRPC Service for %s will be running at: '%s'", id, serverAddress)

 	grpcControlProcess := process.New(
 		process.WithTemporaryStateDir(),
 		process.WithName(grpcProcess),
-		process.WithArgs("--addr", serverAddress))
+		process.WithArgs("--addr", serverAddress),
+		process.WithEnvironment(os.Environ()...),
+	)

 	ml.grpcProcesses[id] = grpcControlProcess

@@ -126,8 +126,8 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string

 // starts the grpcModelProcess for the backend, and returns a grpc client
 // It also loads the model
-func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (*grpc.Client, error) {
-	return func(s string) (*grpc.Client, error) {
+func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (*grpc.Client, error) {
+	return func(modelName, modelFile string) (*grpc.Client, error) {
 		log.Debug().Msgf("Loading GRPC Model %s: %+v", backend, *o)

 		var client *grpc.Client
@@ -150,7 +150,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (*grpc
 					return nil, fmt.Errorf("failed allocating free ports: %s", err.Error())
 				}
 				// Make sure the process is executable
-				if err := ml.startProcess(uri, o.modelFile, serverAddress); err != nil {
+				if err := ml.startProcess(uri, o.model, serverAddress); err != nil {
 					return nil, err
 				}

@@ -174,7 +174,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (*grpc
 			}

 			// Make sure the process is executable
-			if err := ml.startProcess(grpcProcess, o.modelFile, serverAddress); err != nil {
+			if err := ml.startProcess(grpcProcess, o.model, serverAddress); err != nil {
 				return nil, err
 			}

@@ -200,7 +200,8 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (*grpc
 		}

 		options := *o.gRPCOptions
-		options.Model = s
+		options.Model = modelName
+		options.ModelFile = modelFile

 		log.Debug().Msgf("GRPC: Loading model with options: %+v", options)

@@ -219,28 +220,28 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (*grpc
 func (ml *ModelLoader) BackendLoader(opts ...Option) (model *grpc.Client, err error) {
 	o := NewOptions(opts...)

-	log.Debug().Msgf("Loading model %s from %s", o.backendString, o.modelFile)
+	log.Debug().Msgf("Loading model %s from %s", o.backendString, o.model)

 	backend := strings.ToLower(o.backendString)

 	// if an external backend is provided, use it
 	_, externalBackendExists := o.externalBackends[backend]
 	if externalBackendExists {
-		return ml.LoadModel(o.modelFile, ml.grpcModel(backend, o))
+		return ml.LoadModel(o.model, ml.grpcModel(backend, o))
 	}

 	switch backend {
-	case LlamaBackend, LlamaGrammarBackend, GPTJBackend, DollyBackend,
+	case LlamaBackend, GPTJBackend, DollyBackend,
 		MPTBackend, Gpt2Backend, FalconBackend,
 		GPTNeoXBackend, ReplitBackend, StarcoderBackend, BloomzBackend,
 		RwkvBackend, LCHuggingFaceBackend, BertEmbeddingsBackend, FalconGGMLBackend, StableDiffusionBackend, WhisperBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(backend, o))
+		return ml.LoadModel(o.model, ml.grpcModel(backend, o))
 	case Gpt4AllLlamaBackend, Gpt4AllMptBackend, Gpt4AllJBackend, Gpt4All:
 		o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "gpt4all")
-		return ml.LoadModel(o.modelFile, ml.grpcModel(Gpt4All, o))
+		return ml.LoadModel(o.model, ml.grpcModel(Gpt4All, o))
 	case PiperBackend:
 		o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "espeak-ng-data")
-		return ml.LoadModel(o.modelFile, ml.grpcModel(PiperBackend, o))
+		return ml.LoadModel(o.model, ml.grpcModel(PiperBackend, o))
 	default:
 		return nil, fmt.Errorf("backend unsupported: %s", o.backendString)
 	}
@@ -251,8 +252,8 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {

 	// Is this really needed? BackendLoader already does this
 	ml.mu.Lock()
-	if m := ml.checkIsLoaded(o.modelFile); m != nil {
-		log.Debug().Msgf("Model '%s' already loaded", o.modelFile)
+	if m := ml.checkIsLoaded(o.model); m != nil {
+		log.Debug().Msgf("Model '%s' already loaded", o.model)
 		ml.mu.Unlock()
 		return m, nil
 	}
@@ -265,14 +266,14 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
 	for _, b := range o.externalBackends {
 		allBackendsToAutoLoad = append(allBackendsToAutoLoad, b)
 	}
-	log.Debug().Msgf("Loading model '%s' greedly from all the available backends: %s", o.modelFile, strings.Join(allBackendsToAutoLoad, ", "))
+	log.Debug().Msgf("Loading model '%s' greedly from all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", "))

 	for _, b := range allBackendsToAutoLoad {
 		log.Debug().Msgf("[%s] Attempting to load", b)
 		options := []Option{
 			WithBackendString(b),
-			WithModelFile(o.modelFile),
-			WithLoadGRPCLLMModelOpts(o.gRPCOptions),
+			WithModel(o.model),
+			WithLoadGRPCLoadModelOpts(o.gRPCOptions),
 			WithThreads(o.threads),
 			WithAssetDir(o.assetDir),
 		}
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -20,10 +20,12 @@ import (
 // These are the definitions of all possible variables LocalAI will currently populate for use in a prompt template file
 // Please note: Not all of these are populated on every endpoint - your template should either be tested for each endpoint you map it to, or tolerant of zero values.
 type PromptTemplateData struct {
-	Input        string
-	Instruction  string
-	Functions    []grammar.Function
-	MessageIndex int
+	SystemPrompt         string
+	SuppressSystemPrompt bool // used by chat specifically to indicate that SystemPrompt above should be _ignored_
+	Input                string
+	Instruction          string
+	Functions            []grammar.Function
+	MessageIndex         int
 }

 // TODO: Ask mudler about FunctionCall stuff being useful at the message level?
@@ -85,8 +87,8 @@ func (ml *ModelLoader) ListModels() ([]string, error) {

 	models := []string{}
 	for _, file := range files {
-		// Skip templates, YAML and .keep files
-		if strings.HasSuffix(file.Name(), ".tmpl") || strings.HasSuffix(file.Name(), ".keep") || strings.HasSuffix(file.Name(), ".yaml") || strings.HasSuffix(file.Name(), ".yml") {
+		// Skip templates, YAML, .keep, .json, and .DS_Store files - TODO: as this list grows, is there a more efficient method?
+		if strings.HasSuffix(file.Name(), ".tmpl") || strings.HasSuffix(file.Name(), ".keep") || strings.HasSuffix(file.Name(), ".yaml") || strings.HasSuffix(file.Name(), ".yml") || strings.HasSuffix(file.Name(), ".json") || strings.HasSuffix(file.Name(), ".DS_Store") {
 			continue
 		}

@@ -96,13 +98,12 @@ func (ml *ModelLoader) ListModels() ([]string, error) {
 	return models, nil
 }

-func (ml *ModelLoader) LoadModel(modelName string, loader func(string) (*grpc.Client, error)) (*grpc.Client, error) {
+func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (*grpc.Client, error)) (*grpc.Client, error) {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()

 	// Check if we already have a loaded model
 	if model := ml.checkIsLoaded(modelName); model != nil {
-		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
 		return model, nil
 	}

@@ -110,7 +111,7 @@ func (ml *ModelLoader) LoadModel(modelName string, loader func(string) (*grpc.Cl
 	modelFile := filepath.Join(ml.ModelPath, modelName)
 	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)

-	model, err := loader(modelFile)
+	model, err := loader(modelName, modelFile)
 	if err != nil {
 		return nil, err
 	}
--- a/pkg/model/options.go
+++ b/pkg/model/options.go
@@ -8,7 +8,7 @@ import (

 type Options struct {
 	backendString string
-	modelFile     string
+	model         string
 	threads       uint32
 	assetDir      string
 	context       context.Context
@@ -35,13 +35,13 @@ func WithBackendString(backend string) Option {
 	}
 }

-func WithModelFile(modelFile string) Option {
+func WithModel(modelFile string) Option {
 	return func(o *Options) {
-		o.modelFile = modelFile
+		o.model = modelFile
 	}
 }

-func WithLoadGRPCLLMModelOpts(opts *pb.ModelOptions) Option {
+func WithLoadGRPCLoadModelOpts(opts *pb.ModelOptions) Option {
 	return func(o *Options) {
 		o.gRPCOptions = opts
 	}
--- a/pkg/utils/json.go
+++ b/pkg/utils/json.go
@@ -0,0 +1,13 @@
+package utils
+
+import "regexp"
+
+var matchNewlines = regexp.MustCompile(`[\r\n]`)
+
+const doubleQuote = `"[^"\\]*(?:\\[\s\S][^"\\]*)*"`
+
+func EscapeNewLines(s string) string {
+	return regexp.MustCompile(doubleQuote).ReplaceAllStringFunc(s, func(s string) string {
+		return matchNewlines.ReplaceAllString(s, "\\n")
+	})
+}
--- a/pkg/utils/path.go
+++ b/pkg/utils/path.go
@@ -18,5 +18,5 @@ func inTrustedRoot(path string, trustedRoot string) error {
 // VerifyPath verifies that path is based in basePath.
 func VerifyPath(path, basePath string) error {
 	c := filepath.Clean(filepath.Join(basePath, path))
-	return inTrustedRoot(c, basePath)
+	return inTrustedRoot(c, filepath.Clean(basePath))
 }
--- a/pkg/utils/uri.go
+++ b/pkg/utils/uri.go
@@ -2,8 +2,9 @@ package utils

 import (
 	"fmt"
-	"io/ioutil"
+	"io"
 	"net/http"
+	"os"
 	"strings"
 )

@@ -32,7 +33,7 @@ func GetURI(url string, f func(url string, i []byte) error) error {
 	if strings.HasPrefix(url, "file://") {
 		rawURL := strings.TrimPrefix(url, "file://")
 		// Read the response body
-		body, err := ioutil.ReadFile(rawURL)
+		body, err := os.ReadFile(rawURL)
 		if err != nil {
 			return err
 		}
@@ -49,7 +50,7 @@ func GetURI(url string, f func(url string, i []byte) error) error {
 	defer response.Body.Close()

 	// Read the response body
-	body, err := ioutil.ReadAll(response.Body)
+	body, err := io.ReadAll(response.Body)
 	if err != nil {
 		return err
 	}
Author	SHA1	Message	Date
Ettore Di Giacinto	a6c621ef7f	feat: pre-configure LocalAI galleries (#886 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-12 11:25:17 +02:00
renovate[bot]	328289099a	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to 4d855af (#875 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-12 08:58:55 +02:00
renovate[bot]	22ffd5f490	fix(deps): update github.com/tmc/langchaingo digest to fd8b7f0 (#882 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-12 08:56:15 +02:00
Ettore Di Giacinto	81708bb1e6	fix: workaround exllama import error (#885 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-12 08:56:01 +02:00
Ettore Di Giacinto	c81e9d8d1f	fix: add exllama to protogen	2023-08-11 01:02:31 +02:00
Ettore Di Giacinto	ff3ab5fcca	feat: Add exllama (#881 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-11 00:49:40 +02:00
Michael Nesbitt	1d1cae8e4d	feat: add API_KEY list support (#877 ) Co-authored-by: Harold Sun <sunhua@amazon.com>	2023-08-10 00:06:21 +02:00
Ettore Di Giacinto	8c781a6a44	feat: Add Diffusers (#874 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-09 08:38:51 +02:00
Ettore Di Giacinto	93a4bec06b	fix: upgrade pip (#872 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-08 23:20:03 +02:00
renovate[bot]	c93f57efd6	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to 0f2bb50 (#869 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-08 21:57:16 +02:00
ci-robbot [bot]	0e4f93c5cf	⬆️ Update nomic-ai/gpt4all (#870 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-08-08 21:57:01 +02:00
Ettore Di Giacinto	5b3fedebfe	feat: add bark and AutoGPTQ (#871 )	2023-08-08 20:41:49 +02:00
Ettore Di Giacinto	219751bb21	fix: cut prompt from AutoGPTQ answers Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-08 01:27:38 +02:00
Ettore Di Giacinto	bb7772a364	fix: byte utf-8 encode results from autogptq Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-08 01:20:07 +02:00
Ettore Di Giacinto	3c8fc37c56	feat: Add UseFastTokenizer Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-08 01:10:05 +02:00
Ettore Di Giacinto	39805b09e5	fix: pass by env in managed services Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-08 00:58:38 +02:00
Ettore Di Giacinto	63b01199fe	fix: match lowercase of the input, not of the model	2023-08-08 00:46:22 +02:00
Ettore Di Giacinto	b09bae3443	fix: autogptq requirements Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-08 00:22:15 +02:00
Ettore Di Giacinto	de6fb98bed	feat: register autogptq and bark in the container image	2023-08-07 22:53:28 +02:00
Ettore Di Giacinto	433605e282	feat: add initial Bark backend implementation Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-07 22:53:28 +02:00
Ettore Di Giacinto	a843e64fc2	feat: add initial AutoGPTQ backend implementation	2023-08-07 22:53:28 +02:00
scott4290	71611d2dec	docs: base-Update comments in .env for cublas, openblas, clblas (#867 )	2023-08-07 08:22:42 +00:00
Ettore Di Giacinto	abf48e8a5d	readme: link to hot topics in the website	2023-08-07 00:31:46 +02:00
Ettore Di Giacinto	ac5ea0cd4d	readme: link usage to docs	2023-08-07 00:04:28 +02:00
Ettore Di Giacinto	a46fcacedd	readme: simplify, remove dups with website	2023-08-07 00:01:01 +02:00
Ettore Di Giacinto	df947fc933	examples: Update README	2023-08-06 23:07:06 +02:00
Ettore Di Giacinto	91d49cfe9f	Update README.md	2023-08-06 11:57:28 +02:00
Ettore Di Giacinto	19d15f83db	Update README.md	2023-08-06 00:04:06 +02:00
Ettore Di Giacinto	cde61cc518	Update README.md	2023-08-05 23:14:09 +02:00
Ettore Di Giacinto	acd829a7a0	fix: do not break on newlines on function returns (#864 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-04 21:46:36 +02:00
Ettore Di Giacinto	4aa5dac768	feat: update integer, number and string rules - allow primitives as root types (#862 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-03 23:32:30 +02:00
renovate[bot]	08b59b5cc5	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 50cee77 (#861 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-03 19:08:04 +02:00
ci-robbot [bot]	6b900e28cd	⬆️ Update nomic-ai/gpt4all (#859 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-08-03 19:07:53 +02:00
Ettore Di Giacinto	5ca21ee398	feat: add ngqa and RMSNormEps parameters (#860 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-03 00:51:08 +02:00
renovate[bot]	953e30814a	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to c449b71 (#858 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-03 00:20:45 +02:00
renovate[bot]	a65344cf25	fix(deps): update github.com/tmc/langchaingo digest to 271e9bd (#857 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-03 00:20:22 +02:00
Dave	7fb8b4191f	feat: "simple" chat/edit/completion template system prompt from config (#856 )	2023-08-03 00:19:55 +02:00
Tyler Harpool	fc8aec7324	Update to working k8sgpt + localai example in documentation (#852 )	2023-08-01 22:31:36 +02:00
Ettore Di Giacinto	c309aac8f5	fix(gallery): use inline YAML (#851 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-08-01 19:09:32 +02:00
Ettore Di Giacinto	1e37ec727d	Revert "⬆️ Update go-skynet/go-llama.cpp" (#850 )	2023-08-01 19:09:18 +02:00
ci-robbot [bot]	ae36bae59d	⬆️ Update nomic-ai/gpt4all (#847 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-08-01 00:48:10 +02:00
renovate[bot]	e663beebf0	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to cbdcde8 (#833 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-08-01 00:47:06 +02:00
Ettore Di Giacinto	9d0292e9e1	Update README.md	2023-08-01 00:42:56 +02:00
Ettore Di Giacinto	fe27bb7982	feat: Update logo (#849 )	2023-08-01 00:31:40 +02:00
Ettore Di Giacinto	d603a9cbb5	fix(gallery): preload from file should by in YAML format (#846 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-31 21:13:16 +02:00
Ettore Di Giacinto	c1fc22e746	fix(examples): use pinned versions in the k8sgpt example (#845 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-31 19:15:57 +02:00
renovate[bot]	85d3710924	fix(deps): update github.com/tmc/langchaingo digest to 8f10160 (#843 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-31 19:15:13 +02:00
ci-robbot [bot]	a0324245f1	⬆️ Update nomic-ai/gpt4all (#841 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-07-31 19:14:56 +02:00
Dave	ce8e9dc690	feature: model list :: filter query string parameter (#830 )	2023-07-31 19:14:32 +02:00
Andrew Zigler	32ca7efbeb	✨📝 Add OpenOps to README's project list (#832 ) Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-07-30 15:30:14 +02:00
longy2k	27520eb169	Added "BMO Chatbot" to "Projects already using LocalAI to run local models" section." (#828 )	2023-07-30 15:29:23 +02:00
Andrew	9843adb4f1	Create .gitattributes to force git clone to keep the LF line endings on .sh files (#838 )	2023-07-30 15:27:43 +02:00
Dave	8e8d474ae8	refactor: Remove remaining uses of depreciated package `io/ioutil` (#837 )	2023-07-30 11:23:43 +00:00
renovate[bot]	6151ea1c4d	fix(deps): update github.com/tmc/langchaingo digest to 7df4fe5 (#826 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-30 09:49:12 +02:00
renovate[bot]	d969025f87	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 8c51308 (#822 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-30 09:48:52 +02:00
ci-robbot [bot]	18e1cb9c92	⬆️ Update nomic-ai/gpt4all (#825 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-07-30 09:48:30 +02:00
ci-robbot [bot]	e7ceb9e8f5	⬆️ Update go-skynet/go-llama.cpp (#824 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-07-30 09:48:10 +02:00
renovate[bot]	3a4675c8c3	fix(deps): update module github.com/rs/zerolog to v1.30.0 (#836 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-30 09:47:49 +02:00
Dave	5ce0f216cf	Fix: Model Gallery Downloads (#835 )	2023-07-30 09:47:22 +02:00
Ettore Di Giacinto	688f150463	fix: symlink libphonemize in the container (#831 )	2023-07-29 12:47:34 +02:00
Ettore Di Giacinto	00ccb8d4f1	fix: set default rope freq base to 10000 during model load Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-29 10:40:56 +02:00
Ettore Di Giacinto	e70b91aaef	tests: set a small context_size Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-29 10:29:47 +02:00
Dave	8b90ac2b1a	1000 -> 10,000 for ropeFreqBase? the error message talks about a default of 10k, so setting this to 10k instead of 1k experimentally.	2023-07-29 02:37:24 -04:00
Ettore Di Giacinto	f085baa77d	fix: set default rope if not specified Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-29 01:07:16 +02:00
Ettore Di Giacinto	fa4de05c14	fix: symlink libphonemize in the container Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-28 19:40:21 +02:00
Ettore Di Giacinto	dde12b492b	fix: select function calls if 'name' is set in the request (#827 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-28 01:17:11 +02:00
Ettore Di Giacinto	096d98c3d9	fix: add rope settings during model load, fix CUDA (#821 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-27 21:56:05 +02:00
renovate[bot]	147cae9ed8	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to 39acbc8 (#817 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-27 18:56:59 +02:00
renovate[bot]	c63709014b	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 6ba16de (#820 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-27 18:56:39 +02:00
Wendy Liga	9b307799ce	fix missing openai_api_base on langchain-chroma example (#818 )	2023-07-27 18:41:53 +02:00
renovate[bot]	78e36779cf	fix(deps): update module google.golang.org/grpc to v1.57.0 (#815 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-27 18:41:29 +02:00
ci-robbot [bot]	90ae35e2e4	⬆️ Update nomic-ai/gpt4all (#814 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-07-27 18:41:15 +02:00
Ettore Di Giacinto	b96e30e66c	fix: use bytes in gRPC proto instead of strings (#813 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-27 18:41:04 +02:00
renovate[bot]	0af0df7423	fix(deps): update module github.com/sashabaranov/go-openai to v1.14.1 (#783 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-27 18:40:50 +02:00
renovate[bot]	0883d324d9	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 562d2b5 (#766 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-26 22:06:05 +02:00
renovate[bot]	77597e6a16	fix(deps): update github.com/nomic-ai/gpt4all/gpt4all-bindings/golang digest to 9100b2e (#753 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-26 22:05:55 +02:00
renovate[bot]	eae6b36d03	fix(deps): update github.com/donomii/go-rwkv.cpp digest to c898cd0 (#748 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-26 22:05:42 +02:00
renovate[bot]	c4bc7c41b1	fix(deps): update github.com/tmc/langchaingo digest to 7d5f9fd (#768 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-07-26 22:05:32 +02:00
ci-robbot [bot]	c79ddd6fc4	⬆️ Update nomic-ai/gpt4all (#807 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-07-25 23:03:02 +02:00
Dave	ae58fb8821	fix: update gitignore and make clean (#798 ) Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-07-25 23:02:46 +02:00
Ettore Di Giacinto	569c1d1163	feat: add rope settings and negative prompt, drop grammar backend (#797 ) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2023-07-25 19:05:27 +02:00
Aman Gupta Karmani	12fe0932c4	feat: cancel stream generation if client disappears (#792 )	2023-07-24 23:10:54 +02:00
finger42	72e3e236de	Added CPU information to entrypoint.sh (#794 )	2023-07-23 19:27:55 +00:00
Ettore Di Giacinto	ab59b238b3	fix: update README	2023-07-23 18:58:24 +02:00