Update docs (#163 )

feat: support slices or strings in the prompt completion endpoint (#162 )
Signed-off-by: mudler <mudler@mocaccino.org>
2026-02-03 03:02:38 -05:00 · 2023-05-03 15:51:54 +02:00 · 2023-05-03 13:13:31 +02:00 · 2023-05-03 11:48:06 +02:00 · 2023-05-03 11:47:54 +02:00 · 2023-05-03 11:46:29 +02:00
45 changed files with 3414 additions and 521 deletions
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -54,8 +54,8 @@ jobs:
        uses: docker/login-action@v2
        with:
          registry: quay.io
-          username: ${{ secrets.QUAY_USERNAME }}
-          password: ${{ secrets.QUAY_PASSWORD }}
+          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
      - name: Build
        if: github.event_name != 'pull_request'
        uses: docker/build-push-action@v4
--- a/11
+++ b/11
@@ -1,14 +1,9 @@
 ARG GO_VERSION=1.20
-ARG DEBIAN_VERSION=11
 ARG BUILD_TYPE=
-
-FROM golang:$GO_VERSION as builder
+FROM golang:$GO_VERSION
 WORKDIR /build
 RUN apt-get update && apt-get install -y cmake
 COPY . .
-RUN make build
-
-FROM debian:$DEBIAN_VERSION
-COPY --from=builder /build/local-ai /usr/bin/local-ai
+RUN make prepare-sources
 EXPOSE 8080
-ENTRYPOINT [ "/usr/bin/local-ai" ]
+ENTRYPOINT [ "/build/entrypoint.sh" ]
--- a/Dockerfile.dev
+++ b/Dockerfile.dev
@@ -0,0 +1,14 @@
+ARG GO_VERSION=1.20
+ARG DEBIAN_VERSION=11
+ARG BUILD_TYPE=
+
+FROM golang:$GO_VERSION as builder
+WORKDIR /build
+RUN apt-get update && apt-get install -y cmake
+COPY . .
+RUN make build
+
+FROM debian:$DEBIAN_VERSION
+COPY --from=builder /build/local-ai /usr/bin/local-ai
+EXPOSE 8080
+ENTRYPOINT [ "/usr/bin/local-ai" ]
--- a/67
+++ b/67
@@ -3,20 +3,23 @@ GOTEST=$(GOCMD) test
 GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
 # renovate: datasource=github-tags depName=go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=llama.cpp-7f15c5c
+GOLLAMA_VERSION?=llama.cpp-f4cef87
 # renovate: datasource=git-refs packageNameTemplate=https://github.com/go-skynet/go-gpt4all-j.cpp currentValueTemplate=master depNameTemplate=go-gpt4all-j.cpp
 GOGPT4ALLJ_VERSION?=1f7bff57f66cb7062e40d0ac3abd2217815e5109
 # renovate: datasource=git-refs packageNameTemplate=https://github.com/go-skynet/go-gpt2.cpp currentValueTemplate=master depNameTemplate=go-gpt2.cpp
 GOGPT2_VERSION?=245a5bfe6708ab80dc5c733dcdbfbe3cfd2acdaa

+RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
+RWKV_VERSION?=af62fcc432be2847acb6e0688b2c2491d6588d58
+
 GREEN  := $(shell tput -Txterm setaf 2)
 YELLOW := $(shell tput -Txterm setaf 3)
 WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

-C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2
-LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2
+C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv
+LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv

 # Use this if you want to set the default behavior
 ifndef BUILD_TYPE
@@ -33,20 +36,10 @@ endif

 all: help

-## Build:
-
-build: prepare ## Build the project
-	$(info ${GREEN}I local-ai build info:${RESET})
-	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
-	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) build -o $(BINARY_NAME) ./
-
-generic-build: ## Build the project using generic
-	BUILD_TYPE="generic" $(MAKE) build
-
 ## GPT4ALL-J
 go-gpt4all-j:
 	git clone --recurse-submodules https://github.com/go-skynet/go-gpt4all-j.cpp go-gpt4all-j
-	cd go-gpt4all-j && git checkout -b build $(GOGPT4ALLJ_VERSION)
+	cd go-gpt4all-j && git checkout -b build $(GOGPT4ALLJ_VERSION) && git submodule update --init --recursive --depth 1
 	# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
 	@find ./go-gpt4all-j -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
 	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@@ -57,13 +50,21 @@ go-gpt4all-j:
 	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/void replace/void json_gptj_replace/g' {} +
 	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/::replace/::json_gptj_replace/g' {} +

+## RWKV
+go-rwkv:
+	git clone --recurse-submodules $(RWKV_REPO) go-rwkv
+	cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+
+go-rwkv/librwkv.a: go-rwkv
+	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a .. && cp ggml/src/libggml.a ..
+
 go-gpt4all-j/libgptj.a: go-gpt4all-j
 	$(MAKE) -C go-gpt4all-j $(GENERIC_PREFIX)libgptj.a

-# CEREBRAS GPT
-go-gpt2:
+## CEREBRAS GPT
+go-gpt2: 
 	git clone --recurse-submodules https://github.com/go-skynet/go-gpt2.cpp go-gpt2
-	cd go-gpt2 && git checkout -b build $(GOGPT2_VERSION)
+	cd go-gpt2 && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
 	# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
 	@find ./go-gpt2 -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
 	@find ./go-gpt2 -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@@ -74,29 +75,51 @@ go-gpt2:

 go-gpt2/libgpt2.a: go-gpt2
 	$(MAKE) -C go-gpt2 $(GENERIC_PREFIX)libgpt2.a
-	

 go-llama:
 	git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama

-go-llama/libbinding.a: go-llama
+go-llama/libbinding.a: go-llama 
 	$(MAKE) -C go-llama $(GENERIC_PREFIX)libbinding.a

 replace:
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt2.cpp=$(shell pwd)/go-gpt2
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv

-prepare: go-llama/libbinding.a go-gpt4all-j/libgptj.a go-gpt2/libgpt2.a replace
+prepare-sources: go-llama go-gpt2 go-gpt4all-j go-rwkv
+	$(GOCMD) mod download
+
+## GENERIC
+rebuild: ## Rebuilds the project
+	$(MAKE) -C go-llama clean
+	$(MAKE) -C go-gpt4all-j clean
+	$(MAKE) -C go-gpt2 clean
+	$(MAKE) -C go-rwkv clean
+	$(MAKE) build
+
+prepare: prepare-sources go-llama/libbinding.a go-gpt4all-j/libgptj.a go-gpt2/libgpt2.a go-rwkv/librwkv.a replace ## Prepares for building

 clean: ## Remove build related file
 	rm -fr ./go-llama
 	rm -rf ./go-gpt4all-j
 	rm -rf ./go-gpt2
+	rm -rf ./go-rwkv
 	rm -rf $(BINARY_NAME)

-## Run:
-run: prepare
+## Build:
+
+build: prepare ## Build the project
+	$(info ${GREEN}I local-ai build info:${RESET})
+	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
+	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) build -o $(BINARY_NAME) ./
+
+generic-build: ## Build the project using generic
+	BUILD_TYPE="generic" $(MAKE) build
+
+## Run
+run: prepare ## run local-ai
 	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) run ./main.go

 test-models/testmodel:
--- a/README.md
+++ b/README.md
@@ -9,17 +9,23 @@

 [![](https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted)](https://discord.gg/uJAeKSAGDy) 

-**LocalAI** is a straightforward, drop-in replacement API compatible with OpenAI for local CPU inferencing, based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all) and [ggml](https://github.com/ggerganov/ggml), including support GPT4ALL-J which is licensed under Apache 2.0.
+**LocalAI** is a drop-in replacement REST API compatible with OpenAI for local CPU inferencing. It allows you to run models locally or on-prem with consumer-grade hardware. It is based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all), [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp) and [ggml](https://github.com/ggerganov/ggml), including support for GPT4ALL-J, which is licensed under Apache 2.0.

 - OpenAI compatible API
 - Supports multiple-models
 - Once loaded the first time, it keep models loaded in memory for faster inference
 - Support for prompt templates
- Doesn't shell-out, but uses C bindings for a faster inference and better performance. Uses [go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) and [go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp).
+- Doesn't shell out, but uses C bindings for faster inference and better performance.

 LocalAI is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! It was initially created by [mudler](https://github.com/mudler/) at the [SpectroCloud OSS Office](https://github.com/spectrocloud).

+### News
+
+- 02-05-2023: Support for `rwkv.cpp` models ( https://github.com/go-skynet/LocalAI/pull/158 ) and for `/edits` endpoint
+- 01-05-2023: Support for SSE stream of tokens in `llama.cpp` backends ( https://github.com/go-skynet/LocalAI/pull/152 )
+
 ### Socials and community chatter
+
 - Follow [@LocalAI_API](https://twitter.com/LocalAI_API) on twitter.

 - [Reddit post](https://www.reddit.com/r/selfhosted/comments/12w4p2f/localai_openai_compatible_api_to_run_llm_models/) about LocalAI.
@@ -39,11 +45,26 @@ Tested with:
 - [GPT4ALL-J](https://gpt4all.io/models/ggml-gpt4all-j.bin)
 - Koala
 - [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml)
+- [RWKV](https://github.com/BlinkDL/RWKV-LM) models with [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp)

 It should also be compatible with StableLM and GPTNeoX ggml models (untested)

 Note: You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.

+### RWKV
+
+<details>
+
+For `rwkv` models, you also need to put the associated tokenizer alongside the ggml model:
+
+```
+ls models
+36464540 -rw-r--r--  1 mudler mudler 1.2G May  3 10:51 rwkv_small
+36464543 -rw-r--r--  1 mudler mudler 2.4M May  3 10:51 rwkv_small.tokenizer.json
+```
+
+</details>
+
 ## Usage

 > `LocalAI` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).
@@ -120,184 +141,61 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso

 To build locally, run `make build` (see below).

-## Other examples
+### Other examples

 ![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)

 To see other examples on how to integrate with other projects for instance chatbot-ui, see: [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/).

-## Prompt templates 

-The API doesn't inject a default prompt for talking to the model. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.
-
-<details>
-You can use a default template for every model present in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl` which will be used as a default prompt and can be used with alpaca:
-
-```
-The below instruction describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-{{.Input}}
-
-### Response:
-```
-
-See the [prompt-templates](https://github.com/go-skynet/LocalAI/tree/master/prompt-templates) directory in this repository for templates for some of the most popular models.
-
-</details>
-
-## Installation
-
-Currently LocalAI comes as container images and can be used with docker or a containre engine of choice. 
-
-### Run LocalAI in Kubernetes
-
-LocalAI can be installed inside Kubernetes with helm.
-
-<details>
-The local-ai Helm chart supports two options for the LocalAI server's models directory:
-1. Basic deployment with no persistent volume. You must manually update the Deployment to configure your own models directory.
-
-    Install the chart with `.Values.deployment.volumes.enabled == false` and `.Values.dataVolume.enabled == false`.
-
-2. Advanced, two-phase deployment to provision the models directory using a DataVolume. Requires [Containerized Data Importer CDI](https://github.com/kubevirt/containerized-data-importer) to be pre-installed in your cluster.
-
-    First, install the chart with `.Values.deployment.volumes.enabled == false` and `.Values.dataVolume.enabled == true`:
-    ```bash
-    helm install local-ai charts/local-ai -n local-ai --create-namespace
-    ```
-    Wait for CDI to create an importer Pod for the DataVolume and for the importer pod to finish provisioning the model archive inside the PV.
-
-    Once the PV is provisioned and the importer Pod removed, set `.Values.deployment.volumes.enabled == true` and `.Values.dataVolume.enabled == false` and upgrade the chart:
-    ```bash
-    helm upgrade local-ai -n local-ai charts/local-ai
-    ```
-    This will update the local-ai Deployment to mount the PV that was provisioned by the DataVolume.
-
-</details>
-
-## API
-
-`LocalAI` provides an API for running text generation as a service, that follows the OpenAI reference and can be used as a drop-in. The models once loaded the first time will be kept in memory.
-
-<details>
-Example of starting the API with `docker`:
-
-```bash
-docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:latest --models-path /path/to/models --context-size 700 --threads 4
-```
-
-You should see:
-```
-┌───────────────────────────────────────────────────┐ 
-│                   Fiber v2.42.0                   │ 
-│               http://127.0.0.1:8080               │ 
-│       (bound on host 0.0.0.0 and port 8080)       │ 
-│                                                   │ 
-│ Handlers ............. 1  Processes ........... 1 │ 
-│ Prefork ....... Disabled  PID ................. 1 │ 
-└───────────────────────────────────────────────────┘ 
-```
-
-You can control the API server options with command line arguments:
-
-```
-local-api --models-path <model_path> [--address <address>] [--threads <num_threads>]
-```
-
-The API takes takes the following parameters:
-
-| Parameter    | Environment Variable | Default Value | Description                            |
-| ------------ | -------------------- | ------------- | -------------------------------------- |
-| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
-| threads      | THREADS              | Number of Physical cores     | The number of threads to use for text generation. |
-| address      | ADDRESS              | :8080         | The address and port to listen on. |
-| context-size | CONTEXT_SIZE         | 512           | Default token context size. |
-| debug | DEBUG         | false           | Enable debug mode. |
-| config-file | CONFIG_FILE         | empty           | Path to a LocalAI config file. |
-
-Once the server is running, you can start making requests to it using HTTP, using the OpenAI API. 
-
-</details>
-
-### Supported OpenAI API endpoints
-
-You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create). 
-
-Following the list of endpoints/parameters supported. 
-
-Note:
-
- You can also specify the model as part of the OpenAI token.
- If only one model is available, the API will use it for all the requests.
-
-#### Chat completions
-
-<details>
-For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
-
-```
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-koala-7b-model-q4_0-r2.bin",
-     "messages": [{"role": "user", "content": "Say this is a test!"}],
-     "temperature": 0.7
-   }'
-```
-
-Available additional parameters: `top_p`, `top_k`, `max_tokens`
-</details>
-
-#### Completions
-
-<details>
-
-To generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as per the request body:
-
-```
-curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-koala-7b-model-q4_0-r2.bin",
-     "prompt": "A long time ago in a galaxy far, far away",
-     "temperature": 0.7
-   }'
-```
-
-Available additional parameters: `top_p`, `top_k`, `max_tokens`
-
-</details>
-
-#### List models
-
-<details>
-You can list all the models available with:
-
-```
-curl http://localhost:8080/v1/models
-```
-
-</details>
-
-## Advanced configuration
+### Advanced configuration

 LocalAI can be configured to serve user-defined models with a set of default parameters and templates.

 <details>
-You can create multiple `yaml` files in the models path or either specify a single YAML configuration file.

-For instance, a configuration file (`gpt-3.5-turbo.yaml`) can be declaring the "gpt-3.5-turbo" model but backed by the "testmodel" model file:
+You can either create multiple `yaml` files in the models path or specify a single YAML configuration file.
+Consider the following `models` folder in `examples/chatbot-ui`:
+
+```
+base ❯ ls -liah examples/chatbot-ui/models 
+36487587 drwxr-xr-x 2 mudler mudler 4.0K May  3 12:27 .
+36487586 drwxr-xr-x 3 mudler mudler 4.0K May  3 10:42 ..
+36465214 -rw-r--r-- 1 mudler mudler   10 Apr 27 07:46 completion.tmpl
+36464855 -rw-r--r-- 1 mudler mudler 3.6G Apr 27 00:08 ggml-gpt4all-j
+36464537 -rw-r--r-- 1 mudler mudler  245 May  3 10:42 gpt-3.5-turbo.yaml
+36467388 -rw-r--r-- 1 mudler mudler  180 Apr 27 07:46 gpt4all.tmpl
+```
+
+The `gpt-3.5-turbo.yaml` file defines the `gpt-3.5-turbo` model, which is an alias for `gpt4all-j` with pre-defined options.
+
+For instance, consider the following that declares `gpt-3.5-turbo` backed by the `ggml-gpt4all-j` model:

 ```yaml
 name: gpt-3.5-turbo
+# Default model parameters
 parameters:
-  model: testmodel
+  # Relative to the models path
+  model: ggml-gpt4all-j
+  # temperature
+  temperature: 0.3
+  # all the OpenAI request options here..
+
+# Default context size
 context_size: 512
 threads: 10
+# Define a backend (optional). By default it will try to guess the backend the first time the model is interacted with.
+backend: gptj # available: llama, stablelm, gpt2, gptj, rwkv
+# stopwords (if supported by the backend)
 stopwords:
 - "HUMAN:"
 - "### Response:"
+# define chat roles
 roles:
  user: "HUMAN:"
  system: "GPT:"
 template:
+  # template file ".tmpl" with the prompt template to use by default on the endpoint call. Note there is no extension in the files
  completion: completion
  chat: ggml-gpt4all-j
 ```
@@ -332,20 +230,101 @@ Specifying a `config-file` via CLI allows to declare models in a single file as
    system: "GPT:"
  template:
    completion: completion
-    chat: ggml-gpt4all-j
+   chat: ggml-gpt4all-j
 ```

 See also [chatbot-ui](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) as an example on how to use config files.

 </details>

-## Windows compatibility
+### Prompt templates 

-It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/LocalAI/issues/2
+The API doesn't inject a default prompt for talking to the model. You have to use a prompt similar to what's described in the stanford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.

-## Build locally
+<details>
+You can use a default template for every model present in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl` which will be used as a default prompt and can be used with alpaca:

-Pre-built images might fit well for most of the modern hardware, however you can and might need to build the images manually.
+```
+The below instruction describes a task. Write a response that appropriately completes the request.
+
+### Instruction:
+{{.Input}}
+
+### Response:
+```
+
+See the [prompt-templates](https://github.com/go-skynet/LocalAI/tree/master/prompt-templates) directory in this repository for templates for some of the most popular models.
+
+
+For the edit endpoint, an example template for alpaca-based models can be:
+
+```yaml
+Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{{.Instruction}}
+
+### Input:
+{{.Input}}
+
+### Response:
+```
+
+</details>
+
+### CLI
+
+You can control LocalAI with command line arguments, to specify a binding address, or the number of threads.
+
+<details>
+
+Usage:
+
+```
+local-ai --models-path <model_path> [--address <address>] [--threads <num_threads>]
+```
+
+| Parameter    | Environment Variable | Default Value | Description                            |
+| ------------ | -------------------- | ------------- | -------------------------------------- |
+| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
+| threads      | THREADS              | Number of Physical cores     | The number of threads to use for text generation. |
+| address      | ADDRESS              | :8080         | The address and port to listen on. |
+| context-size | CONTEXT_SIZE         | 512           | Default token context size. |
+| debug | DEBUG         | false           | Enable debug mode. |
+| config-file | CONFIG_FILE         | empty           | Path to a LocalAI config file. |
+
+</details>
+
+## Setup
+
+Currently LocalAI comes as a container image and can be used with docker or a container engine of choice. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).
+
+### Docker
+
+<details>
+Example of starting the API with `docker`:
+
+```bash
+docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:latest --models-path /path/to/models --context-size 700 --threads 4
+```
+
+You should see:
+```
+┌───────────────────────────────────────────────────┐ 
+│                   Fiber v2.42.0                   │ 
+│               http://127.0.0.1:8080               │ 
+│       (bound on host 0.0.0.0 and port 8080)       │ 
+│                                                   │ 
+│ Handlers ............. 1  Processes ........... 1 │ 
+│ Prefork ....... Disabled  PID ................. 1 │ 
+└───────────────────────────────────────────────────┘ 
+```
+
+</details>
+
+### Build locally
+
+<details>

 In order to build the `LocalAI` container image locally you can use `docker`:

@@ -355,12 +334,182 @@ docker build -t LocalAI .
 docker run LocalAI
 ```

-Or build the binary with `make`:
+Or you can build the binary with `make`:

 ```
 make build
 ```

+</details>
+
+### Build on mac
+
+Building on Mac (M1 or M2) works, but you may need to install some prerequisites using `brew`. 
+
+<details>
+
+The below has been tested by one mac user and found to work. Note that this doesn't use docker to run the server:
+
+```
+# install build dependencies
+brew install cmake
+brew install go
+
+# clone the repo
+git clone https://github.com/go-skynet/LocalAI.git
+
+cd LocalAI
+
+# build the binary
+make build
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Use a template from the examples
+cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
+
+# Run LocalAI
+./local-ai --models-path ./models/ --debug
+
+# Now API is accessible at localhost:8080
+curl http://localhost:8080/v1/models
+
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-gpt4all-j",
+     "messages": [{"role": "user", "content": "How are you?"}],
+     "temperature": 0.9 
+   }'
+```
+
+</details>
+
+### Windows compatibility
+
+It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/LocalAI/issues/2
+
+### Run LocalAI in Kubernetes
+
+LocalAI can be installed inside Kubernetes with helm.
+
+<details>
+
+1. Add the helm repo
+    ```bash
+    helm repo add go-skynet https://go-skynet.github.io/helm-charts/
+    ```
+2. Create a values file with your settings:
+```bash
+cat <<EOF > values.yaml
+deployment:
+  image: quay.io/go-skynet/local-ai:latest
+  env:
+    threads: 4
+    contextSize: 1024
+    modelsPath: "/models"
+# Optionally create a PVC, mount the PV to the LocalAI Deployment,
+# and download a model to prepopulate the models directory
+modelsVolume:
+  enabled: true
+  url: "https://gpt4all.io/models/ggml-gpt4all-j.bin"
+  pvc:
+    size: 6Gi
+    accessModes:
+    - ReadWriteOnce
+  auth:
+    # Optional value for HTTP basic access authentication header
+    basic: "" # 'username:password' base64 encoded
+service:
+  type: ClusterIP
+  annotations: {}
+  # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
+  # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
+EOF
+```
+3. Install the helm chart:
+```bash
+helm repo update
+helm install local-ai go-skynet/local-ai -f values.yaml
+```
+
+Also check out the [helm chart repository on GitHub](https://github.com/go-skynet/helm-charts).
+
+</details>
+
+## Supported OpenAI API endpoints
+
+You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create). 
+
+The following is the list of supported endpoints/parameters.
+
+Note:
+
+- You can also specify the model as part of the OpenAI token.
+- If only one model is available, the API will use it for all the requests.
+
+### Chat completions
+
+<details>
+For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
+
+```
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "messages": [{"role": "user", "content": "Say this is a test!"}],
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`
+</details>
+
+### Edit completions
+
+<details>
+To generate an edit completion you can send a POST request to the `/v1/edits` endpoint with the instruction as the request body:
+
+```
+curl http://localhost:8080/v1/edits -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "instruction": "rephrase",
+     "input": "Black cat jumped out of the window",
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`.
+
+</details>
+
+### Completions
+
+<details>
+
+To generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as per the request body:
+
+```
+curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "prompt": "A long time ago in a galaxy far, far away",
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`
+
+</details>
+
+### List models
+
+<details>
+You can list all the models available with:
+
+```
+curl http://localhost:8080/v1/models
+```
+
+</details>
+
 ## Frequently asked questions

 Here are answers to some of the most common questions.
@@ -448,6 +597,13 @@ LocalAI is a community-driven project. It was initially created by [mudler](http

 MIT

+## Golang bindings used
+
+- [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- [go-skynet/go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp)
+- [go-skynet/go-gpt2.cpp](https://github.com/go-skynet/go-gpt2.cpp)
+- [donomii/go-rwkv.cpp](https://github.com/donomii/go-rwkv.cpp)
+
 ## Acknowledgements

 - [llama.cpp](https://github.com/ggerganov/llama.cpp)
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -79,7 +79,7 @@ var _ = Describe("API test", func() {
 		It("returns errors", func() {
 			_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
 			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: llama: model does not exist"))
+			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 5 errors occurred:"))
 		})

 	})
--- a/api/config.go
+++ b/api/config.go
@@ -21,6 +21,7 @@ type Config struct {
 	Threads        int               `yaml:"threads"`
 	Debug          bool              `yaml:"debug"`
 	Roles          map[string]string `yaml:"roles"`
+	Backend        string            `yaml:"backend"`
 	TemplateConfig TemplateConfig    `yaml:"template"`
 }

--- a/api/openai.go
+++ b/api/openai.go
@@ -2,6 +2,7 @@ package api

 import (
 	"bufio"
+	"bytes"
 	"encoding/json"
 	"fmt"
 	"os"
@@ -56,13 +57,13 @@ type OpenAIRequest struct {
 	Model string `json:"model" yaml:"model"`

 	// Prompt is read only by completion API calls
-	Prompt string `json:"prompt" yaml:"prompt"`
+	Prompt interface{} `json:"prompt" yaml:"prompt"`

 	// Edit endpoint
 	Instruction string `json:"instruction" yaml:"instruction"`
 	Input       string `json:"input" yaml:"input"`

-	Stop string `json:"stop" yaml:"stop"`
+	Stop interface{} `json:"stop" yaml:"stop"`

 	// Messages is read only by chat/completion API calls
 	Messages []Message `json:"messages" yaml:"messages"`
@@ -116,8 +117,17 @@ func updateConfig(config *Config, input *OpenAIRequest) {
 		config.Maxtokens = input.Maxtokens
 	}

-	if input.Stop != "" {
-		config.StopWords = append(config.StopWords, input.Stop)
+	switch stop := input.Stop.(type) {
+	case string:
+		if stop != "" {
+			config.StopWords = append(config.StopWords, stop)
+		}
+	case []interface{}:
+		for _, pp := range stop {
+			if s, ok := pp.(string); ok {
+				config.StopWords = append(config.StopWords, s)
+			}
+		}
 	}

 	if input.RepeatPenalty != 0 {
@@ -227,27 +237,44 @@ func completionEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader,

 		log.Debug().Msgf("Parameter Config: %+v", config)

-		predInput := input.Prompt
+		predInput := []string{}
+
+		switch p := input.Prompt.(type) {
+		case string:
+			predInput = append(predInput, p)
+		case []interface{}:
+			for _, pp := range p {
+				if s, ok := pp.(string); ok {
+					predInput = append(predInput, s)
+				}
+			}
+		}
+
 		templateFile := config.Model

 		if config.TemplateConfig.Completion != "" {
 			templateFile = config.TemplateConfig.Completion
 		}

-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		templatedInput, err := loader.TemplatePrefix(templateFile, struct {
-			Input string
-		}{Input: predInput})
-		if err == nil {
-			predInput = templatedInput
-			log.Debug().Msgf("Template found, input modified to: %s", predInput)
-		}
+		var result []Choice
+		for _, i := range predInput {
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templatedInput, err := loader.TemplatePrefix(templateFile, struct {
+				Input string
+			}{Input: i})
+			if err == nil {
+				i = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", i)
+			}

-		result, err := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {
-			*c = append(*c, Choice{Text: s})
-		})
-		if err != nil {
-			return err
+			r, err := ComputeChoices(i, input, config, loader, func(s string, c *[]Choice) {
+				*c = append(*c, Choice{Text: s})
+			}, nil)
+			if err != nil {
+				return err
+			}
+
+			result = append(result, r...)
 		}

 		resp := &OpenAIResponse{
@@ -290,8 +317,9 @@ func chatEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread

 		if input.Stream {
 			log.Debug().Msgf("Stream request received")
+			c.Context().SetContentType("text/event-stream")
 			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
-			c.Set("Content-Type", "text/event-stream; charset=utf-8")
+			//	c.Set("Content-Type", "text/event-stream")
 			c.Set("Cache-Control", "no-cache")
 			c.Set("Connection", "keep-alive")
 			c.Set("Transfer-Encoding", "chunked")
@@ -312,13 +340,52 @@ func chatEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread
 			log.Debug().Msgf("Template found, input modified to: %s", predInput)
 		}

+		if input.Stream {
+			responses := make(chan OpenAIResponse)
+
+			go func() {
+				ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {}, func(s string) bool {
+					resp := OpenAIResponse{
+						Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+						Choices: []Choice{{Delta: &Message{Role: "assistant", Content: s}}},
+						Object:  "chat.completion.chunk",
+					}
+
+					responses <- resp
+					return true
+				})
+				close(responses)
+			}()
+
+			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
+
+				for ev := range responses {
+					var buf bytes.Buffer
+					enc := json.NewEncoder(&buf)
+					enc.Encode(ev)
+
+					fmt.Fprintf(w, "event: data\n\n")
+					fmt.Fprintf(w, "data: %v\n\n", buf.String())
+					log.Debug().Msgf("Sending chunk: %s", buf.String())
+					w.Flush()
+				}
+
+				w.WriteString("event: data\n\n")
+				resp := &OpenAIResponse{
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					Choices: []Choice{{FinishReason: "stop"}},
+				}
+				respData, _ := json.Marshal(resp)
+
+				w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
+				w.Flush()
+			}))
+			return nil
+		}
+
 		result, err := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {
-			if input.Stream {
-				*c = append(*c, Choice{Delta: &Message{Role: "assistant", Content: s}})
-			} else {
-				*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: s}})
-			}
-		})
+			*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: s}})
+		}, nil)
 		if err != nil {
 			return err
 		}
@@ -329,36 +396,6 @@ func chatEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread
 			Object:  "chat.completion",
 		}

-		if input.Stream {
-			resp.Object = "chat.completion.chunk"
-			jsonResult, _ := json.Marshal(resp)
-			log.Debug().Msgf("Response: %s", jsonResult)
-			log.Debug().Msgf("Handling stream request")
-			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
-				fmt.Fprintf(w, "event: data\n")
-				w.Flush()
-
-				fmt.Fprintf(w, "data: %s\n\n", jsonResult)
-				w.Flush()
-
-				fmt.Fprintf(w, "event: data\n")
-				w.Flush()
-
-				resp := &OpenAIResponse{
-					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []Choice{{FinishReason: "stop"}},
-				}
-				respData, _ := json.Marshal(resp)
-
-				fmt.Fprintf(w, "data: %s\n\n", respData)
-				w.Flush()
-
-				//	fmt.Fprintf(w, "data: [DONE]\n\n")
-				//		w.Flush()
-			}))
-			return nil
-		}
-
 		// Return the prediction in the response body
 		return c.JSON(resp)
 	}
@@ -392,7 +429,7 @@ func editEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread

 		result, err := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {
 			*c = append(*c, Choice{Text: s})
-		})
+		}, nil)
 		if err != nil {
 			return err
 		}
--- a/api/prediction.go
+++ b/api/prediction.go
@@ -6,26 +6,103 @@ import (
 	"strings"
 	"sync"

+	"github.com/donomii/go-rwkv.cpp"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	gpt2 "github.com/go-skynet/go-gpt2.cpp"
 	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
 	llama "github.com/go-skynet/go-llama.cpp"
+	"github.com/hashicorp/go-multierror"
 )

+const tokenizerSuffix = ".tokenizer.json"
+
 // mutex still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
 var mutexMap sync.Mutex
 var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)

-func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (string, error), error) {
-	var model *llama.LLama
-	var gptModel *gptj.GPTJ
-	var gpt2Model *gpt2.GPT2
-	var stableLMModel *gpt2.StableLM
+var loadedModels map[string]interface{} = map[string]interface{}{}
+var muModels sync.Mutex

+func backendLoader(backendString string, loader *model.ModelLoader, modelFile string, llamaOpts []llama.ModelOption, threads uint32) (model interface{}, err error) {
+	switch strings.ToLower(backendString) {
+	case "llama":
+		return loader.LoadLLaMAModel(modelFile, llamaOpts...)
+	case "stablelm":
+		return loader.LoadStableLMModel(modelFile)
+	case "gpt2":
+		return loader.LoadGPT2Model(modelFile)
+	case "gptj":
+		return loader.LoadGPTJModel(modelFile)
+	case "rwkv":
+		return loader.LoadRWKV(modelFile, modelFile+tokenizerSuffix, threads)
+	default:
+		return nil, fmt.Errorf("backend unsupported: %s", backendString)
+	}
+}
+
+func greedyLoader(loader *model.ModelLoader, modelFile string, llamaOpts []llama.ModelOption, threads uint32) (model interface{}, err error) {
+	updateModels := func(model interface{}) {
+		muModels.Lock()
+		defer muModels.Unlock()
+		loadedModels[modelFile] = model
+	}
+
+	muModels.Lock()
+	m, exists := loadedModels[modelFile]
+	if exists {
+		muModels.Unlock()
+		return m, nil
+	}
+	muModels.Unlock()
+
+	model, modelerr := loader.LoadLLaMAModel(modelFile, llamaOpts...)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	model, modelerr = loader.LoadGPTJModel(modelFile)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	model, modelerr = loader.LoadGPT2Model(modelFile)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	model, modelerr = loader.LoadStableLMModel(modelFile)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	model, modelerr = loader.LoadRWKV(modelFile, modelFile+tokenizerSuffix, threads)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
+}
+
+func ModelInference(s string, loader *model.ModelLoader, c Config, tokenCallback func(string) bool) (func() (string, error), error) {
+	supportStreams := false
 	modelFile := c.Model

 	// Try to load the model
-	var llamaerr, gpt2err, gptjerr, stableerr error
 	llamaOpts := []llama.ModelOption{}
 	if c.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(c.ContextSize))
@@ -34,25 +111,35 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
 	}

-	// TODO: this is ugly, better identifying the model somehow! however, it is a good stab for a first implementation..
-	model, llamaerr = loader.LoadLLaMAModel(modelFile, llamaOpts...)
-	if llamaerr != nil {
-		gptModel, gptjerr = loader.LoadGPTJModel(modelFile)
-		if gptjerr != nil {
-			gpt2Model, gpt2err = loader.LoadGPT2Model(modelFile)
-			if gpt2err != nil {
-				stableLMModel, stableerr = loader.LoadStableLMModel(modelFile)
-				if stableerr != nil {
-					return nil, fmt.Errorf("llama: %s gpt: %s gpt2: %s stableLM: %s", llamaerr.Error(), gptjerr.Error(), gpt2err.Error(), stableerr.Error()) // llama failed first, so we want to catch both errors
-				}
-			}
-		}
+	var inferenceModel interface{}
+	var err error
+	if c.Backend == "" {
+		inferenceModel, err = greedyLoader(loader, modelFile, llamaOpts, uint32(c.Threads))
+	} else {
+		inferenceModel, err = backendLoader(c.Backend, loader, modelFile, llamaOpts, uint32(c.Threads))
+	}
+	if err != nil {
+		return nil, err
 	}

 	var fn func() (string, error)

-	switch {
-	case stableLMModel != nil:
+	switch model := inferenceModel.(type) {
+	case *rwkv.RwkvState:
+		supportStreams = true
+
+		fn = func() (string, error) {
+			//model.ProcessInput("You are a chatbot that is very good at chatting.  blah blah blah")
+			stopWord := "\n"
+			if len(c.StopWords) > 0 {
+				stopWord = c.StopWords[0]
+			}
+
+			response := model.GenerateResponse(c.Maxtokens, stopWord, float32(c.Temperature), float32(c.TopP), tokenCallback)
+
+			return response, nil
+		}
+	case *gpt2.StableLM:
 		fn = func() (string, error) {
 			// Generate the prediction using the language model
 			predictOptions := []gpt2.PredictOption{
@@ -71,12 +158,12 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 				predictOptions = append(predictOptions, gpt2.SetSeed(c.Seed))
 			}

-			return stableLMModel.Predict(
+			return model.Predict(
 				s,
 				predictOptions...,
 			)
 		}
-	case gpt2Model != nil:
+	case *gpt2.GPT2:
 		fn = func() (string, error) {
 			// Generate the prediction using the language model
 			predictOptions := []gpt2.PredictOption{
@@ -95,12 +182,12 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 				predictOptions = append(predictOptions, gpt2.SetSeed(c.Seed))
 			}

-			return gpt2Model.Predict(
+			return model.Predict(
 				s,
 				predictOptions...,
 			)
 		}
-	case gptModel != nil:
+	case *gptj.GPTJ:
 		fn = func() (string, error) {
 			// Generate the prediction using the language model
 			predictOptions := []gptj.PredictOption{
@@ -119,13 +206,19 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 				predictOptions = append(predictOptions, gptj.SetSeed(c.Seed))
 			}

-			return gptModel.Predict(
+			return model.Predict(
 				s,
 				predictOptions...,
 			)
 		}
-	case model != nil:
+	case *llama.LLama:
+		supportStreams = true
 		fn = func() (string, error) {
+
+			if tokenCallback != nil {
+				model.SetTokenCallback(tokenCallback)
+			}
+
 			// Generate the prediction using the language model
 			predictOptions := []llama.PredictOption{
 				llama.SetTemperature(c.Temperature),
@@ -185,11 +278,15 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 		l.Lock()
 		defer l.Unlock()

-		return fn()
+		res, err := fn()
+		if tokenCallback != nil && !supportStreams {
+			tokenCallback(res)
+		}
+		return res, err
 	}, nil
 }

-func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, loader *model.ModelLoader, cb func(string, *[]Choice)) ([]Choice, error) {
+func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
 	result := []Choice{}

 	n := input.N
@@ -199,7 +296,7 @@ func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, load
 	}

 	// get the model function to call for the result
-	predFunc, err := ModelInference(predInput, loader, *config)
+	predFunc, err := ModelInference(predInput, loader, *config, tokenCallback)
 	if err != nil {
 		return result, err
 	}
--- a/charts/local-ai/Chart.yaml
+++ b/charts/local-ai/Chart.yaml
@@ -1,6 +0,0 @@
-apiVersion: v2
-appVersion: 0.1.0
-description: A Helm chart for LocalAI
-name: local-ai
-type: application
-version: 1.0.0
--- a/charts/local-ai/templates/_helpers.tpl
+++ b/charts/local-ai/templates/_helpers.tpl
@@ -1,44 +0,0 @@
-{{/*
-Expand the name of the chart.
-*/}}
-{{- define "local-ai.name" -}}
-{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
-{{- end }}
-
-{{/*
-Create a default fully qualified app name.
-We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
-If release name contains chart name it will be used as a full name.
-*/}}
-{{- define "local-ai.fullname" -}}
-{{- if .Values.fullnameOverride }}
-{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
-{{- else }}
-{{- $name := default .Chart.Name .Values.nameOverride }}
-{{- if contains $name .Release.Name }}
-{{- .Release.Name | trunc 63 | trimSuffix "-" }}
-{{- else }}
-{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
-{{- end }}
-{{- end }}
-{{- end }}
-
-{{/*
-Create chart name and version as used by the chart label.
-*/}}
-{{- define "local-ai.chart" -}}
-{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
-{{- end }}
-
-{{/*
-Common labels
-*/}}
-{{- define "local-ai.labels" -}}
-helm.sh/chart: {{ include "local-ai.chart" . }}
-app.kubernetes.io/name: {{ include "local-ai.name" . }}
-app.kubernetes.io/instance: "{{ .Release.Name }}"
-app.kubernetes.io/managed-by: {{ .Release.Service }}
-{{- if .Chart.AppVersion }}
-app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
-{{- end }}
-{{- end }}
--- a/charts/local-ai/templates/data-volume.yaml
+++ b/charts/local-ai/templates/data-volume.yaml
@@ -1,39 +0,0 @@
-{{- if .Values.dataVolume.enabled }}
-apiVersion: cdi.kubevirt.io/v1beta1
-kind: DataVolume
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-spec:
-  contentType: archive
-  source:
-    {{ .Values.dataVolume.source.type }}:
-      url: {{ .Values.dataVolume.source.url }}
-      secretRef: {{ template "local-ai.fullname" . }}
-      {{- if and (eq .Values.dataVolume.source.type "http") .Values.dataVolume.source.secretExtraHeaders }}
-      secretExtraHeaders: {{ .Values.dataVolume.source.secretExtraHeaders }}
-      {{- end }}
-      {{- if .Values.dataVolume.source.caCertConfigMap }}
-      caCertConfigMap: {{ .Values.dataVolume.source.caCertConfigMap }}
-      {{- end }}
-  pvc:
-    accessModes: {{ .Values.dataVolume.pvc.accessModes }}
-    resources:
-      requests:
-        storage: {{ .Values.dataVolume.pvc.size }}
---
-{{- if .Values.dataVolume.secret.enabled }}
-apiVersion: v1
-kind: Secret
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-data:
-  accessKeyId: {{ .Values.dataVolume.secret.username }}
-  secretKey: {{ .Values.dataVolume.secret.password }}
-{{- end }}
-{{- end }}
--- a/charts/local-ai/templates/deployment.yaml
+++ b/charts/local-ai/templates/deployment.yaml
@@ -1,39 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-spec:
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: {{ include "local-ai.name" . }}
-      app.kubernetes.io/instance: {{ .Release.Name }}
-  replicas: 1
-  template:
-    metadata:
-      name: {{ template "local-ai.fullname" . }}
-      labels:
-        app.kubernetes.io/name: {{ include "local-ai.name" . }}
-        app.kubernetes.io/instance: {{ .Release.Name }}
-    spec:
-      containers:
-        - name: {{ template "local-ai.fullname" . }}
-          image: {{ .Values.deployment.image }}
-          env:
-          - name: THREADS
-            value: {{ .Values.deployment.env.threads | quote }}
-          - name: CONTEXT_SIZE
-            value: {{ .Values.deployment.env.contextSize | quote }}
-          - name: MODELS_PATH
-            value: {{ .Values.deployment.env.modelsPath }}
-{{- if .Values.deployment.volume.enabled }}
-          volumeMounts:
-          - mountPath: {{ .Values.deployment.env.modelsPath }}
-            name: models
-      volumes:
-      - name: models
-        persistentVolumeClaim:
-          claimName: {{ template "local-ai.fullname" . }}
-{{- end }}
--- a/charts/local-ai/templates/service.yaml
+++ b/charts/local-ai/templates/service.yaml
@@ -1,19 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-{{- if .Values.service.annotations }}
-  annotations:
-  {{ toYaml .Values.service.annotations | indent 4 }}
-{{- end }}
-spec:
-  selector:
-    app.kubernetes.io/name: {{ include "local-ai.name" . }}
-  type: "{{ .Values.service.type }}"
-  ports:
-    - protocol: TCP
-      port: 8080
-      targetPort: 8080
--- a/charts/local-ai/values.yaml
+++ b/charts/local-ai/values.yaml
@@ -1,38 +0,0 @@
-deployment:
-  image: quay.io/go-skynet/local-ai:latest
-  env:
-    threads: 14
-    contextSize: 512
-    modelsPath: "/models"
-  volume:
-    enabled: false
-
-service:
-  type: ClusterIP
-  annotations: {}
-  # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
-  # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
-
-# Optionally create a PVC containing a model binary, sourced from an arbitrary HTTP server or S3 bucket
-# (requires https://github.com/kubevirt/containerized-data-importer)
-dataVolume:
-  enabled: false
-  source:
-    type: "http" # Source type. One of: [ http | s3 ]
-    url: "http://<model_server>/<model_archive>" # e.g. koala-7B-4bit-128g.GGML.tar
-
-    # CertConfigMap is an optional ConfigMap reference, containing a Certificate Authority (CA) public key
-    # and a base64 encoded pem certificate
-    caCertConfigMap: ""
-
-    # SecretExtraHeaders is an optional list of Secret references, each containing an extra HTTP header
-    # that may include sensitive information. Only applicable for the http source type.
-    secretExtraHeaders: []
-  pvc:
-    accessModes:
-    - ReadWriteOnce
-    size: 5Gi
-  secret:
-    enabled: false
-    username: "" # base64 encoded
-    password: "" # base64 encoded
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -5,11 +5,11 @@ services:
    image: quay.io/go-skynet/local-ai:latest
    build:
      context: .
-      dockerfile: Dockerfile
+      dockerfile: Dockerfile.dev
    ports:
      - 8080:8080
    env_file:
      - .env
    volumes:
      - ./models:/models:cached
-    command: ["/usr/bin/local-ai" ]
+    command: ["/usr/bin/local-ai" ]
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+cd /build
+
+make build
+
+./local-ai "$@"
--- a/examples/README.md
+++ b/examples/README.md
@@ -5,6 +5,9 @@ Here is a list of projects that can easily be integrated with the LocalAI backen
 ## Projects

 - [chatbot-ui](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui/) (by [@mkellerman](https://github.com/mkellerman))
+- [discord-bot](https://github.com/go-skynet/LocalAI/tree/master/examples/discord-bot/) (by [@mudler](https://github.com/mudler))
+- [langchain](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain/) (by [@dave-gray101](https://github.com/dave-gray101))
+- [slack-bot](https://github.com/go-skynet/LocalAI/tree/master/examples/slack-bot/) (by [@mudler](https://github.com/mudler))

 ## Want to contribute?

--- a/examples/chatbot-ui/README.md
+++ b/examples/chatbot-ui/README.md
@@ -22,5 +22,25 @@ wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
 docker-compose up -d --build
 ```

+## Pointing chatbot-ui to a separately managed LocalAI service
+
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+```
+version: '3.6'
+
+services:
+  chatgpt:
+    image: ghcr.io/mckaywrigley/chatbot-ui:main
+    ports:
+      - 3000:3000
+    environment:
+      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
+      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
+```
+
+Once you've edited the `docker-compose.yaml` file, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+
+## Accessing chatbot-ui
+
 Open http://localhost:3000 for the Web UI.

--- a/examples/chatbot-ui/docker-compose.yaml
+++ b/examples/chatbot-ui/docker-compose.yaml
@@ -5,7 +5,7 @@ services:
    image: quay.io/go-skynet/local-ai:latest
    build:
      context: ../../
-      dockerfile: Dockerfile
+      dockerfile: Dockerfile.dev
    ports:
      - 8080:8080
    environment:
--- a/examples/discord-bot/.env.example
+++ b/examples/discord-bot/.env.example
@@ -0,0 +1,6 @@
+OPENAI_API_KEY=x
+DISCORD_BOT_TOKEN=x
+DISCORD_CLIENT_ID=x
+OPENAI_API_BASE=http://api:8080
+ALLOWED_SERVER_IDS=x
+SERVER_TO_MODERATION_CHANNEL=1:1
--- a/examples/discord-bot/README.md
+++ b/examples/discord-bot/README.md
@@ -0,0 +1,76 @@
+# discord-bot
+
+![Screenshot from 2023-05-01 07-58-19](https://user-images.githubusercontent.com/2420543/235413924-0cb2e75b-f2d6-4119-8610-44386e44afb8.png)
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/discord-bot
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Set the discord bot options (see: https://github.com/go-skynet/gpt-discord-bot#setup)
+cp -rfv .env.example .env
+vim .env
+
+# start with docker-compose
+docker-compose up -d --build
+```
+
+Note: see setup options here: https://github.com/go-skynet/gpt-discord-bot#setup
+
+Open up the URL in the console and give permission to the bot in your server. Start a thread with `/chat ..`
+
+## Kubernetes
+
+- install the local-ai chart first
+- change OPENAI_API_BASE to point to the API address and apply the discord-bot manifest:
+
+```yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: discord-bot
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: localai
+  namespace: discord-bot
+  labels:
+    app: localai
+spec:
+  selector:
+    matchLabels:
+      app: localai
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: localai
+      name: localai
+    spec:
+      containers:
+        - name: localai-discord
+          env:
+          - name: OPENAI_API_KEY
+            value: "x"
+          - name: DISCORD_BOT_TOKEN
+            value: ""
+          - name: DISCORD_CLIENT_ID
+            value: ""
+          - name: OPENAI_API_BASE
+            value: "http://local-ai.default.svc.cluster.local:8080"
+          - name: ALLOWED_SERVER_IDS
+            value: "xx"
+          - name: SERVER_TO_MODERATION_CHANNEL
+            value: "1:1"
+          image: quay.io/go-skynet/gpt-discord-bot:main
+```
--- a/examples/discord-bot/docker-compose.yaml
+++ b/examples/discord-bot/docker-compose.yaml
@@ -0,0 +1,21 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
+
+  bot:
+    image: quay.io/go-skynet/gpt-discord-bot:main
+    env_file:
+    - .env
--- a/examples/discord-bot/models
+++ b/examples/discord-bot/models
@@ -0,0 +1 @@
+../chatbot-ui/models/
--- a/examples/langchain/.gitignore
+++ b/examples/langchain/.gitignore
@@ -0,0 +1,2 @@
+models/ggml-koala-13B-4bit-128g
+models/ggml-gpt4all-j
--- a/examples/langchain/JS.Dockerfile
+++ b/examples/langchain/JS.Dockerfile
@@ -0,0 +1,6 @@
+FROM node:latest
+COPY ./langchainjs-localai-example /app
+WORKDIR /app
+RUN npm install
+RUN npm run build
+ENTRYPOINT [ "npm", "run", "start" ]
--- a/examples/langchain/README.md
+++ b/examples/langchain/README.md
@@ -0,0 +1,31 @@
+# langchain
+
+Example of using langchain in TypeScript, with the standard OpenAI llm module, and LocalAI.
+
+A Python langchain example will follow at a later date.
+
+The project is set up so that the `index.mts` file can easily be modified to look like any langchain example file.
+
+**Please Note** - This is a tech demo example at this time. ggml-gpt4all-j has pretty terrible results for most langchain applications with the settings used in this example.
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/langchain
+
+# (optional) - Edit the example code in typescript.
+# vi ./langchainjs-localai-example/src/index.mts
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# start with docker-compose
+docker-compose up --build
+```
+
+## Copyright
+
+Some of the example code in index.mts is adapted from the langchainjs project and is Copyright (c) Harrison Chase. Used under the terms of the MIT license, as is the remainder of this code.
--- a/examples/langchain/docker-compose.yaml
+++ b/examples/langchain/docker-compose.yaml
@@ -0,0 +1,25 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
+
+  langchainjs:
+    build:
+      context: .
+      dockerfile: JS.Dockerfile
+    environment:
+      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
+      - 'OPENAI_API_HOST=http://api:8080/v1'
+      - 'MODEL_NAME=gpt-3.5-turbo' #gpt-3.5-turbo' # ggml-gpt4all-j' # ggml-koala-13B-4bit-128g'
--- a/examples/langchain/langchainjs-localai-example/.gitignore
+++ b/examples/langchain/langchainjs-localai-example/.gitignore
@@ -0,0 +1,2 @@
+node_modules/
+dist/
--- a/examples/langchain/langchainjs-localai-example/.vscode/launch.json
+++ b/examples/langchain/langchainjs-localai-example/.vscode/launch.json
@@ -0,0 +1,20 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "type": "node",
+            "request": "launch",
+            "name": "Launch Program",
+            // "skipFiles": [
+            //     "<node_internals>/**"
+            // ],
+            "program": "${workspaceFolder}\\dist\\index.mjs",
+            "outFiles": [
+                "${workspaceFolder}/**/*.js"
+            ]
+        }
+    ]
+}
--- a/examples/langchain/langchainjs-localai-example/package-lock.json
+++ b/examples/langchain/langchainjs-localai-example/package-lock.json
--- a/examples/langchain/langchainjs-localai-example/package.json
+++ b/examples/langchain/langchainjs-localai-example/package.json
@@ -0,0 +1,21 @@
+{
+  "name": "langchainjs-localai-example",
+  "version": "0.1.0",
+  "description": "Trivial Example of using langchain + the OpenAI API + LocalAI together",
+  "main": "index.mjs",
+  "scripts": {
+    "build": "tsc --build",
+    "clean": "tsc --build --clean",
+    "start": "node --trace-warnings dist/index.mjs"
+  },
+  "author": "dave@gray101.com",
+  "license": "MIT",
+  "devDependencies": {
+    "@types/node": "^18.16.3",
+    "typescript": "^5.0.4"
+  },
+  "dependencies": {
+    "langchain": "^0.0.67",
+    "typeorm": "^0.3.15"
+  }
+}
--- a/examples/langchain/langchainjs-localai-example/src/index.mts
+++ b/examples/langchain/langchainjs-localai-example/src/index.mts
@@ -0,0 +1,79 @@
+import { OpenAIChat } from "langchain/llms/openai";
+import { loadQAStuffChain } from "langchain/chains";
+import { Document } from "langchain/document";
+import { initializeAgentExecutorWithOptions } from "langchain/agents";
+import {Calculator} from "langchain/tools/calculator";
+
+const pathToLocalAi = process.env['OPENAI_API_HOST'] || 'http://api:8080/v1';
+const fakeApiKey = process.env['OPENAI_API_KEY'] || '-';
+const modelName = process.env['MODEL_NAME'] || 'gpt-3.5-turbo';
+
+function getModel(): OpenAIChat {
+  return new OpenAIChat({
+    prefixMessages: [
+      {
+        role: "system",
+        content: "You are a helpful assistant that answers in pirate language",
+      },
+    ],
+    modelName: modelName,
+    maxTokens: 50,
+    openAIApiKey: fakeApiKey,
+    maxRetries: 2
+  }, {
+    basePath: pathToLocalAi,
+    apiKey: fakeApiKey,
+  });
+}
+
+// Minimal example.
+export const run = async () => {
+  const model = getModel();
+  console.log(`about to model.call at ${new Date().toUTCString()}`);
+  const res = await model.call(
+    "What would be a good company name a company that makes colorful socks?"
+  );
+  console.log(`${new Date().toUTCString()}`);
+  console.log({ res });
+};
+
+await run();
+
+// This example uses the `StuffDocumentsChain`
+export const run2 = async () => {
+  const model = getModel();
+  const chainA = loadQAStuffChain(model);
+  const docs = [
+    new Document({ pageContent: "Harrison went to Harvard." }),
+    new Document({ pageContent: "Ankush went to Princeton." }),
+  ];
+  const resA = await chainA.call({
+    input_documents: docs,
+    question: "Where did Harrison go to college?",
+  });
+  console.log({ resA });
+};
+
+await run2();
+
+// Quickly thrown together example of using tools + agents.
+// This seems like it should work, but it doesn't yet.
+export const temporarilyBrokenToolTest = async () => {
+  const model = getModel();
+
+  const executor = await initializeAgentExecutorWithOptions([new Calculator(true)], model, {
+    agentType: "zero-shot-react-description",
+  });
+
+  console.log("Loaded agent.");
+
+  const input = `What is the value of (500 *2) + 350 - 13?`;
+
+  console.log(`Executing with input "${input}"...`);
+
+  const result = await executor.call({ input });
+
+  console.log(`Got output ${result.output}`);
+}
+
+await temporarilyBrokenToolTest();
--- a/examples/langchain/langchainjs-localai-example/tsconfig.json
+++ b/examples/langchain/langchainjs-localai-example/tsconfig.json
@@ -0,0 +1,15 @@
+{
+  "compilerOptions": {
+    "target": "es2022",
+    "lib": ["ES2022", "DOM"],
+    "module": "ES2022",
+    "moduleResolution": "node",
+    "strict": true,
+    "esModuleInterop": true,
+    "allowSyntheticDefaultImports": true,
+    "isolatedModules": true,
+    "outDir": "./dist"
+  },
+  "include": ["src", "test"],
+  "exclude": ["node_modules", "dist"]
+}
--- a/examples/langchain/models/completion.tmpl
+++ b/examples/langchain/models/completion.tmpl
@@ -0,0 +1 @@
+{{.Input}}
--- a/examples/langchain/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain/models/gpt-3.5-turbo.yaml
@@ -0,0 +1,17 @@
+name: gpt-3.5-turbo
+parameters:
+  model: ggml-gpt4all-j # ggml-koala-13B-4bit-128g
+  top_k: 80
+  temperature: 0.2
+  top_p: 0.7
+context_size: 1024
+threads: 4
+stopwords:
+- "HUMAN:"
+- "GPT:"
+roles:
+  user: " "
+  system: " "
+template:
+  completion: completion
+  chat: completion # gpt4all
--- a/examples/langchain/models/gpt4all.tmpl
+++ b/examples/langchain/models/gpt4all.tmpl
@@ -0,0 +1,4 @@
+The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
+### Prompt:
+{{.Input}}
+### Response:
--- a/examples/slack-bot/.env.example
+++ b/examples/slack-bot/.env.example
@@ -0,0 +1,11 @@
+SLACK_APP_TOKEN=xapp-1-...
+SLACK_BOT_TOKEN=xoxb-...
+OPENAI_API_KEY=sk-...
+OPENAI_API_BASE=http://api:8080
+OPENAI_MODEL=gpt-3.5-turbo
+OPENAI_TIMEOUT_SECONDS=60
+#OPENAI_SYSTEM_TEXT="You proofread text. When you receive a message, you will check
+#for mistakes and make suggestion to improve the language of the given text"
+USE_SLACK_LANGUAGE=true
+SLACK_APP_LOG_LEVEL=INFO
+TRANSLATE_MARKDOWN=true
--- a/examples/slack-bot/README.md
+++ b/examples/slack-bot/README.md
@@ -0,0 +1,27 @@
+# Slack bot
+
+Slackbot using: https://github.com/seratch/ChatGPT-in-Slack
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/slack-bot
+
+git clone https://github.com/seratch/ChatGPT-in-Slack
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Set the Slack bot options (see: https://github.com/seratch/ChatGPT-in-Slack)
+cp -rfv .env.example .env
+vim .env
+
+# start with docker-compose
+docker-compose up -d --build
+```
--- a/examples/slack-bot/docker-compose.yaml
+++ b/examples/slack-bot/docker-compose.yaml
@@ -0,0 +1,23 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
+
+  bot:
+    build:
+     context: ./ChatGPT-in-Slack
+     dockerfile: Dockerfile
+    env_file:
+    - .env
--- a/examples/slack-bot/models
+++ b/examples/slack-bot/models
@@ -0,0 +1 @@
+../chatbot-ui/models
--- a/go.mod
+++ b/go.mod
@@ -5,16 +5,17 @@ go 1.19
 require (
 	github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708
 	github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c
-	github.com/go-skynet/go-llama.cpp v0.0.0-20230428071219-3d084e4299e9
+	github.com/go-skynet/go-llama.cpp v0.0.0-20230502121737-8ceb6167e405
 	github.com/gofiber/fiber/v2 v2.44.0
+	github.com/hashicorp/go-multierror v1.1.1
 	github.com/jaypipes/ghw v0.10.0
-	github.com/onsi/ginkgo/v2 v2.9.2
+	github.com/onsi/ginkgo/v2 v2.9.3
 	github.com/onsi/gomega v1.27.6
 	github.com/otiai10/openaigo v1.1.0
 	github.com/rs/zerolog v1.29.1
-	github.com/sashabaranov/go-openai v1.9.0
-	github.com/urfave/cli/v2 v2.25.1
-	github.com/valyala/fasthttp v1.46.0
+	github.com/sashabaranov/go-openai v1.9.1
+	github.com/urfave/cli/v2 v2.25.3
+	github.com/valyala/fasthttp v1.47.0
 	gopkg.in/yaml.v3 v3.0.1
 )

@@ -22,13 +23,15 @@ require (
 	github.com/StackExchange/wmi v1.2.1 // indirect
 	github.com/andybalholm/brotli v1.0.5 // indirect
 	github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
+	github.com/donomii/go-rwkv.cpp v0.0.0-20230502223004-0a3db3d72e7d // indirect
 	github.com/ghodss/yaml v1.0.0 // indirect
-	github.com/go-logr/logr v1.2.3 // indirect
+	github.com/go-logr/logr v1.2.4 // indirect
 	github.com/go-ole/go-ole v1.2.6 // indirect
 	github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
 	github.com/google/go-cmp v0.5.9 // indirect
 	github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect
 	github.com/google/uuid v1.3.0 // indirect
+	github.com/hashicorp/errwrap v1.0.0 // indirect
 	github.com/jaypipes/pcidb v1.0.0 // indirect
 	github.com/klauspost/compress v1.16.3 // indirect
 	github.com/kr/text v0.2.0 // indirect
@@ -46,10 +49,10 @@ require (
 	github.com/valyala/bytebufferpool v1.0.0 // indirect
 	github.com/valyala/tcplisten v1.0.0 // indirect
 	github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
-	golang.org/x/net v0.8.0 // indirect
+	golang.org/x/net v0.9.0 // indirect
 	golang.org/x/sys v0.7.0 // indirect
-	golang.org/x/text v0.8.0 // indirect
-	golang.org/x/tools v0.7.0 // indirect
+	golang.org/x/text v0.9.0 // indirect
+	golang.org/x/tools v0.8.0 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	howett.net/plist v1.0.0 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -12,10 +12,14 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230502223004-0a3db3d72e7d h1:lSHwlYf1H4WAWYgf7rjEVTGen1qmigUq2Egpu8mnQiY=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230502223004-0a3db3d72e7d/go.mod h1:H6QBF7/Tz6DAEBDXQged4H1BvsmqY/K5FG9wQRGa01g=
 github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
 github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
 github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
 github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
+github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
 github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
 github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
@@ -23,8 +27,10 @@ github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708 h1:cfOi4TWvQ
 github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708/go.mod h1:1Wj/xbkMfwQSOrhNYK178IzqQHstZbRfhx4s8p1M5VM=
 github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c h1:48I7jpLNGiQeBmF0SFVVbREh8vlG0zN13v9LH5ctXis=
 github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c/go.mod h1:5VZ9XbcINI0XcHhkcX8GPK8TplFGAzu1Hrg4tNiMCtI=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230428071219-3d084e4299e9 h1:N/0SBefkMFao6GiGhIF7+5EdYOMHn4KnCG2AFcIXPt0=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230428071219-3d084e4299e9/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230430075552-377fd245eae2 h1:CYQRCbOfYtC77OxweAyrdxSVwoLIM/EdZ6Ij+xBzta8=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230430075552-377fd245eae2/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230502121737-8ceb6167e405 h1:pbIxJ/eiL1Irdprxk/mquaxjR1XDGCE+7CT9BGJNRaY=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230502121737-8ceb6167e405/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
@@ -37,6 +43,10 @@ github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE
 github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
 github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
 github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
+github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
+github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
+github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
 github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
 github.com/jaypipes/ghw v0.10.0 h1:UHu9UX08Py315iPojADFPOkmjTsNzHj4g4adsNKKteY=
 github.com/jaypipes/ghw v0.10.0/go.mod h1:jeJGbkRB2lL3/gxYzNYzEDETV1ZJ56OKr+CSeSEym+g=
@@ -61,6 +71,8 @@ github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG
 github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
 github.com/onsi/ginkgo/v2 v2.9.2/go.mod h1:WHcJJG2dIlcCqVfBAwUCrJxSPFb6v4azBwgxeMeDuts=
+github.com/onsi/ginkgo/v2 v2.9.3 h1:5X2vl/isiKqkrOYjiaGgp3JQOcLV59g5o5SuTMqCcxU=
+github.com/onsi/ginkgo/v2 v2.9.3/go.mod h1:gCQYp2Q+kSoIj7ykSVb9nskRSsR6PUj4AiLywzIhbKM=
 github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
 github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg=
 github.com/otiai10/mint v1.4.1 h1:HOVBfKP1oXIc0wWo9hZ8JLdZtyCPWqjvmFDuVZ0yv2Y=
@@ -80,8 +92,8 @@ github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
 github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
 github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/sashabaranov/go-openai v1.9.0 h1:NoiO++IISxxJ1pRc0n7uZvMGMake0G+FJ1XPwXtprsA=
-github.com/sashabaranov/go-openai v1.9.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
+github.com/sashabaranov/go-openai v1.9.1 h1:3N52HkJKo9Zlo/oe1AVv5ZkCOny0ra58/ACvAxkN3MM=
+github.com/sashabaranov/go-openai v1.9.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 h1:rmMl4fXJhKMNWl+K+r/fq4FbbKI+Ia2m9hYBLm2h4G4=
 github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94/go.mod h1:90zrgN3D/WJsDd1iXHT96alCoN2KJo6/4x1DZC3wZs8=
 github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d/go.mod h1:Gy+0tqhJvgGlqnTF8CVGP0AaGRjwBtXs/a5PA0Y3+A4=
@@ -93,12 +105,12 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/tinylib/msgp v1.1.6/go.mod h1:75BAfg2hauQhs3qedfdDZmWAPcFMAvJE5b9rGOMufyw=
 github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0=
 github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw=
-github.com/urfave/cli/v2 v2.25.1 h1:zw8dSP7ghX0Gmm8vugrs6q9Ku0wzweqPyshy+syu9Gw=
-github.com/urfave/cli/v2 v2.25.1/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
+github.com/urfave/cli/v2 v2.25.3 h1:VJkt6wvEBOoSjPFQvOkv6iWIrsJyCrKGtCtxXWwmGeY=
+github.com/urfave/cli/v2 v2.25.3/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
 github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
 github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
-github.com/valyala/fasthttp v1.46.0 h1:6ZRhrFg8zBXTRYY6vdzbFhqsBd7FVv123pV2m9V87U4=
-github.com/valyala/fasthttp v1.46.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA=
+github.com/valyala/fasthttp v1.47.0 h1:y7moDoxYzMooFpT5aHgNgVOQDrS3qlkfiP9mDtGGK9c=
+github.com/valyala/fasthttp v1.47.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA=
 github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
 github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
 github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU=
@@ -120,6 +132,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug
 golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
 golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
 golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
+golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM=
+golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -149,6 +163,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
 golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
@@ -156,6 +172,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc
 golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ=
 golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
 golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s=
+golang.org/x/tools v0.8.0 h1:vSDcovVPld282ceKgDimkRSC8kpaH1dgyc9UMzlt84Y=
+golang.org/x/tools v0.8.0/go.mod h1:JxBZ99ISMI5ViVkT1tr6tdNmXeTrcpVSD3vZ1RsRdN4=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -12,6 +12,7 @@ import (

 	"github.com/rs/zerolog/log"

+	rwkv "github.com/donomii/go-rwkv.cpp"
 	gpt2 "github.com/go-skynet/go-gpt2.cpp"
 	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
 	llama "github.com/go-skynet/go-llama.cpp"
@@ -25,8 +26,8 @@ type ModelLoader struct {
 	gptmodels         map[string]*gptj.GPTJ
 	gpt2models        map[string]*gpt2.GPT2
 	gptstablelmmodels map[string]*gpt2.StableLM
-
-	promptsTemplates map[string]*template.Template
+	rwkv              map[string]*rwkv.RwkvState
+	promptsTemplates  map[string]*template.Template
 }

 func NewModelLoader(modelPath string) *ModelLoader {
@@ -36,6 +37,7 @@ func NewModelLoader(modelPath string) *ModelLoader {
 		gptmodels:         make(map[string]*gptj.GPTJ),
 		gptstablelmmodels: make(map[string]*gpt2.StableLM),
 		models:            make(map[string]*llama.LLama),
+		rwkv:              make(map[string]*rwkv.RwkvState),
 		promptsTemplates:  make(map[string]*template.Template),
 	}
 }
@@ -168,13 +170,6 @@ func (ml *ModelLoader) LoadGPT2Model(modelName string) (*gpt2.GPT2, error) {
 		return m, nil
 	}

-	// TODO: This needs refactoring, it's really bad to have it in here
-	// Check if we have a GPTStable model loaded instead - if we do we return an error so the API tries with StableLM
-	if _, ok := ml.gptstablelmmodels[modelName]; ok {
-		log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPTStableLM one")
-	}
-
 	// Load the model and keep it in memory for later use
 	modelFile := filepath.Join(ml.ModelPath, modelName)
 	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
@@ -207,17 +202,6 @@ func (ml *ModelLoader) LoadGPTJModel(modelName string) (*gptj.GPTJ, error) {
 		return m, nil
 	}

-	// TODO: This needs refactoring, it's really bad to have it in here
-	// Check if we have a GPT2 model loaded instead - if we do we return an error so the API tries with GPT2
-	if _, ok := ml.gpt2models[modelName]; ok {
-		log.Debug().Msgf("Model is GPT2: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPT2 one")
-	}
-	if _, ok := ml.gptstablelmmodels[modelName]; ok {
-		log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPTStableLM one")
-	}
-
 	// Load the model and keep it in memory for later use
 	modelFile := filepath.Join(ml.ModelPath, modelName)
 	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
@@ -236,6 +220,36 @@ func (ml *ModelLoader) LoadGPTJModel(modelName string) (*gptj.GPTJ, error) {
 	return model, err
 }

+func (ml *ModelLoader) LoadRWKV(modelName, tokenFile string, threads uint32) (*rwkv.RwkvState, error) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+
+	log.Debug().Msgf("Loading model name: %s", modelName)
+
+	// Bail out early if the model is not present in the model path
+	if !ml.ExistsInModelPath(modelName) {
+		return nil, fmt.Errorf("model does not exist")
+	}
+
+	if m, ok := ml.rwkv[modelName]; ok {
+		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
+		return m, nil
+	}
+
+	// Load the model and keep it in memory for later use
+	modelFile := filepath.Join(ml.ModelPath, modelName)
+	tokenPath := filepath.Join(ml.ModelPath, tokenFile)
+	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
+
+	model := rwkv.LoadFiles(modelFile, tokenPath, threads)
+	if model == nil {
+		return nil, fmt.Errorf("could not load model")
+	}
+
+	ml.rwkv[modelName] = model
+	return model, nil
+}
+
 func (ml *ModelLoader) LoadLLaMAModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()
@@ -252,21 +266,6 @@ func (ml *ModelLoader) LoadLLaMAModel(modelName string, opts ...llama.ModelOptio
 		return m, nil
 	}

-	// TODO: This needs refactoring, it's really bad to have it in here
-	// Check if we have a GPTJ model loaded instead - if we do we return an error so the API tries with GPTJ
-	if _, ok := ml.gptmodels[modelName]; ok {
-		log.Debug().Msgf("Model is GPTJ: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPTJ one")
-	}
-	if _, ok := ml.gpt2models[modelName]; ok {
-		log.Debug().Msgf("Model is GPT2: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPT2 one")
-	}
-	if _, ok := ml.gptstablelmmodels[modelName]; ok {
-		log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPTStableLM one")
-	}
-
 	// Load the model and keep it in memory for later use
 	modelFile := filepath.Join(ml.ModelPath, modelName)
 	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
--- a/prompt-templates/wizardlm.tmpl
+++ b/prompt-templates/wizardlm.tmpl
@@ -0,0 +1,3 @@
+{{.Input}}
+
+### Response:
Author	SHA1	Message	Date
Ettore Di Giacinto	4eae570ef5	Update docs (#163 )	2023-05-03 15:51:54 +02:00
Ettore Di Giacinto	67992a7d99	feat: support slices or strings in the prompt completion endpoint (#162 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-03 13:13:31 +02:00
renovate[bot]	0a4899f366	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 8ceb616 (#150 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-03 11:48:06 +02:00
renovate[bot]	1eb02f6c91	fix(deps): update module github.com/onsi/ginkgo/v2 to v2.9.3 (#161 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-03 11:47:54 +02:00
mudler	575874e4fb	readme: minor update	2023-05-03 11:46:29 +02:00
Ettore Di Giacinto	751b7eca62	feat: add rwkv support (#158 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-03 11:45:22 +02:00
Ettore Di Giacinto	1ae7150810	feat: allow to specify default backend for model (#156 ) Signed-off-by: mudler <mudler@c3os.io>	2023-05-03 00:31:28 +02:00
Ettore Di Giacinto	70caf9bf8c	feat: support stopwords both string and arrays (#154 )	2023-05-02 23:30:00 +02:00
Dave	0b226ac027	Stop parameter of OpenAIRequest changed to String Array (#153 )	2023-05-02 22:02:45 +02:00
Ettore Di Giacinto	220d6fd59b	feat: add stream events (#152 )	2023-05-02 20:03:35 +02:00
antongisli	0a00a4b58e	adding mac build and example (#151 ) Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-05-02 19:24:45 +02:00
Ettore Di Giacinto	156e15a4fa	Bump llama.cpp, downgrade gpt4all-j (#149 )	2023-05-02 16:07:18 +02:00
renovate[bot]	271d3f6673	fix(deps): update module github.com/valyala/fasthttp to v1.47.0 (#143 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 23:36:58 +02:00
Ettore Di Giacinto	fec4ab93c5	docs: Add langchain to the example index (#147 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 23:21:07 +02:00
renovate[bot]	38a7a7a54d	fix(deps): update github.com/go-skynet/go-gpt4all-j.cpp digest to 77bf8c1 (#141 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-05-01 23:18:41 +02:00
Ettore Di Giacinto	0db0704e2c	docs: Add slack-bot example (#145 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 23:18:24 +02:00
Dave	88f472e5d2	Add LangchainJS Examples (#146 )	2023-05-01 23:18:14 +02:00
Ettore Di Giacinto	92452d46da	feat: add new gpt4all-j binding (#142 )	2023-05-01 20:00:15 +02:00
Ettore Di Giacinto	ac70252d70	drop: remove helm charts, now in separate repo (#134 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 18:07:41 +02:00
renovate[bot]	f6451d2518	fix(deps): update module github.com/urfave/cli/v2 to v2.25.3 (#140 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 18:07:29 +02:00
Ettore Di Giacinto	2473f9d19b	docs: add discord-bot preview (#137 )	2023-05-01 11:03:34 +02:00
renovate[bot]	bc583385a9	fix(deps): update module github.com/urfave/cli/v2 to v2.25.2 (#136 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 07:53:48 +02:00
renovate[bot]	8286bfbab7	fix(deps): update module github.com/sashabaranov/go-openai to v1.9.1 (#135 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 07:52:20 +02:00
Ettore Di Giacinto	d129fabe3b	docs: enhancements (#133 )	2023-04-30 23:27:02 +02:00
renovate[bot]	2539867247	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 377fd24 (#129 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-30 11:09:48 +02:00
renovate[bot]	69fedb92d9	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 361b9f8 (#127 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-30 08:47:27 +02:00
Ettore Di Giacinto	54b5eadcc4	docs: add discord-bot example (#126 )	2023-04-30 00:31:28 +02:00
Ettore Di Giacinto	16773e2a35	feat: make images to build sources on start (#124 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-29 20:38:37 +02:00
renovate[bot]	78503c62b7	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 9bf702f (#125 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-29 16:53:39 +02:00