fix: missing returning error and free callback stream (#187 )

fix(deps): update github.com/donomii/go-rwkv.cpp digest to af62fcc (#171 )
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
2026-02-04 11:42:57 -05:00 · 2023-05-04 19:49:43 +02:00 · 2023-05-04 18:30:48 +02:00 · 2023-05-04 18:30:11 +02:00 · 2023-05-04 18:28:49 +02:00 · 2023-05-04 18:27:58 +02:00
60 changed files with 3791 additions and 560 deletions
--- a/.github/bump_deps.sh
+++ b/.github/bump_deps.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Pin a dependency in the Makefile to the latest commit of a GitHub branch.
+#
+# Usage: bump_deps.sh <owner/repo> <branch> <makefile-variable>
+set -xe
+# All three arguments are required: an empty VAR would make the sed below
+# rewrite every "?=" assignment in the Makefile.
+REPO=${1:?"usage: bump_deps.sh <owner/repo> <branch> <makefile-variable>"}
+BRANCH=${2:?"missing branch name"}
+VAR=${3:?"missing Makefile variable name"}
+
+# -f makes curl exit non-zero on HTTP errors (e.g. rate limiting), so `set -e`
+# aborts here instead of carrying an error body forward as the commit id.
+LAST_COMMIT=$(curl -sSf -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")
+
+# Only a full 40-character commit SHA may be written into the Makefile.
+if [[ ! "$LAST_COMMIT" =~ ^[0-9a-f]{40}$ ]]; then
+  echo "unexpected response for $REPO@$BRANCH: $LAST_COMMIT" >&2
+  exit 1
+fi
+
+sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -0,0 +1,42 @@
+name: Bump dependencies  # opens PRs that update the pinned dependency commits in the Makefile
+on:
+  schedule:
+    - cron: 0 20 * * *  # every day at 20:00 (GitHub evaluates cron schedules in UTC)
+  workflow_dispatch:  # also allow manual runs
+jobs:
+  bump:
+    strategy:
+      fail-fast: false  # one failing matrix entry must not cancel the others
+      matrix:
+        include:  # one job per pinned dependency: repository, Makefile variable, branch to track
+          - repository: "go-skynet/go-gpt4all-j.cpp"
+            variable: "GOGPT4ALLJ_VERSION"
+            branch: "master"
+          - repository: "go-skynet/go-llama.cpp"
+            variable: "GOLLAMA_VERSION"
+            branch: "master"
+          - repository: "go-skynet/go-gpt2.cpp"
+            variable: "GOGPT2_VERSION"
+            branch: "master"
+          - repository: "donomii/go-rwkv.cpp"
+            variable: "RWKV_VERSION"
+            branch: "main"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Bump dependencies 🔧
+        run: |
+          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v5
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI  # NOTE(review): per the action's docs the branch is pushed to this fork — confirm the token has access to it
+          commit-message: ':arrow_up: Update ${{ matrix.repository }}'
+          title: ':arrow_up: Update ${{ matrix.repository }}'
+          branch: "update/${{ matrix.variable }}"  # one PR branch per bumped variable
+          body: Bump of ${{ matrix.repository }} version
+          signoff: true
+
+
+
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -54,8 +54,8 @@ jobs:
        uses: docker/login-action@v2
        with:
          registry: quay.io
-          username: ${{ secrets.QUAY_USERNAME }}
-          password: ${{ secrets.QUAY_PASSWORD }}
+          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
      - name: Build
        if: github.event_name != 'pull_request'
        uses: docker/build-push-action@v4
--- a/66
+++ b/66
@@ -2,12 +2,12 @@ GOCMD=go
 GOTEST=$(GOCMD) test
 GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
-# renovate: datasource=github-tags depName=go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=llama.cpp-7f15c5c
-# renovate: datasource=git-refs packageNameTemplate=https://github.com/go-skynet/go-gpt4all-j.cpp currentValueTemplate=master depNameTemplate=go-gpt4all-j.cpp
+
+GOLLAMA_VERSION?=2e6ae1269e035886fc64e268a6dda9d8c4ba8c75
 GOGPT4ALLJ_VERSION?=1f7bff57f66cb7062e40d0ac3abd2217815e5109
-# renovate: datasource=git-refs packageNameTemplate=https://github.com/go-skynet/go-gpt2.cpp currentValueTemplate=master depNameTemplate=go-gpt2.cpp
 GOGPT2_VERSION?=245a5bfe6708ab80dc5c733dcdbfbe3cfd2acdaa
+RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
+RWKV_VERSION?=af62fcc432be2847acb6e0688b2c2491d6588d58

 GREEN  := $(shell tput -Txterm setaf 2)
 YELLOW := $(shell tput -Txterm setaf 3)
@@ -15,8 +15,8 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

-C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2
-LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2
+C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv
+LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv

 # Use this if you want to set the default behavior
 ifndef BUILD_TYPE
@@ -33,20 +33,10 @@ endif

 all: help

-## Build:
-
-build: prepare ## Build the project
-	$(info ${GREEN}I local-ai build info:${RESET})
-	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
-	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) build -o $(BINARY_NAME) ./
-
-generic-build: ## Build the project using generic
-	BUILD_TYPE="generic" $(MAKE) build
-
 ## GPT4ALL-J
 go-gpt4all-j:
 	git clone --recurse-submodules https://github.com/go-skynet/go-gpt4all-j.cpp go-gpt4all-j
-	cd go-gpt4all-j && git checkout -b build $(GOGPT4ALLJ_VERSION)
+	cd go-gpt4all-j && git checkout -b build $(GOGPT4ALLJ_VERSION) && git submodule update --init --recursive --depth 1
 	# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
 	@find ./go-gpt4all-j -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
 	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@@ -57,13 +47,29 @@ go-gpt4all-j:
 	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/void replace/void json_gptj_replace/g' {} +
 	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/::replace/::json_gptj_replace/g' {} +

+## RWKV
+# Clone the bindings from $(RWKV_REPO) and pin them (and their submodules)
+# to $(RWKV_VERSION) on a local "build" branch.
+go-rwkv:
+	git clone --recurse-submodules $(RWKV_REPO) go-rwkv
+	cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+
+# Build the static rwkv library with CMake inside the rwkv.cpp subdirectory,
+# then copy librwkv.a and the bundled libggml.a up into go-rwkv/.
+go-rwkv/librwkv.a: go-rwkv
+	cd go-rwkv/rwkv.cpp && \
+	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && \
+	cmake --build . && \
+	cp librwkv.a .. && \
+	cp ggml/src/libggml.a ..
+
 go-gpt4all-j/libgptj.a: go-gpt4all-j
 	$(MAKE) -C go-gpt4all-j $(GENERIC_PREFIX)libgptj.a

-# CEREBRAS GPT
-go-gpt2:
+## CEREBRAS GPT
+go-gpt2: 
 	git clone --recurse-submodules https://github.com/go-skynet/go-gpt2.cpp go-gpt2
-	cd go-gpt2 && git checkout -b build $(GOGPT2_VERSION)
+	cd go-gpt2 && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
 	# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
 	@find ./go-gpt2 -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
 	@find ./go-gpt2 -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@@ -74,10 +72,10 @@ go-gpt2:

 go-gpt2/libgpt2.a: go-gpt2
 	$(MAKE) -C go-gpt2 $(GENERIC_PREFIX)libgpt2.a
-	

 go-llama:
-	git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
+	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1

 go-llama/libbinding.a: go-llama 
 	$(MAKE) -C go-llama $(GENERIC_PREFIX)libbinding.a
@@ -86,26 +84,40 @@ replace:
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt2.cpp=$(shell pwd)/go-gpt2
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv

-prepare-sources: go-llama go-gpt2 go-gpt4all-j
+prepare-sources: go-llama go-gpt2 go-gpt4all-j go-rwkv
 	$(GOCMD) mod download

-rebuild:
+## GENERIC
+rebuild: ## Rebuilds the project
 	$(MAKE) -C go-llama clean
 	$(MAKE) -C go-gpt4all-j clean
 	$(MAKE) -C go-gpt2 clean
+	$(MAKE) -C go-rwkv clean
 	$(MAKE) build

-prepare: prepare-sources go-llama/libbinding.a go-gpt4all-j/libgptj.a go-gpt2/libgpt2.a replace
+prepare: prepare-sources go-llama/libbinding.a go-gpt4all-j/libgptj.a go-gpt2/libgpt2.a go-rwkv/librwkv.a replace ## Prepares for building

 clean: ## Remove build related file
 	rm -fr ./go-llama
 	rm -rf ./go-gpt4all-j
 	rm -rf ./go-gpt2
+	rm -rf ./go-rwkv
 	rm -rf $(BINARY_NAME)

-## Run:
-run: prepare
+## Build:
+
+build: prepare ## Build the project
+	$(info ${GREEN}I local-ai build info:${RESET})
+	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
+	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) build -o $(BINARY_NAME) ./
+
+generic-build: ## Build the project using generic
+	BUILD_TYPE="generic" $(MAKE) build
+
+## Run
+run: prepare ## run local-ai
 	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) run ./main.go

 test-models/testmodel:
--- a/README.md
+++ b/README.md
@@ -9,17 +9,23 @@

 [![](https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted)](https://discord.gg/uJAeKSAGDy) 

-**LocalAI** is a straightforward, drop-in replacement API compatible with OpenAI for local CPU inferencing, based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all) and [ggml](https://github.com/ggerganov/ggml), including support GPT4ALL-J which is licensed under Apache 2.0.
+**LocalAI** is a drop-in replacement REST API compatible with OpenAI for local CPU inferencing. It allows you to run models locally or on-prem with consumer-grade hardware. It is based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all), [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp) and [ggml](https://github.com/ggerganov/ggml), including support for GPT4ALL-J, which is licensed under Apache 2.0.

 - OpenAI compatible API
 - Supports multiple-models
 - Once loaded the first time, it keep models loaded in memory for faster inference
 - Support for prompt templates
- Doesn't shell-out, but uses C bindings for a faster inference and better performance. Uses [go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) and [go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp).
+- Doesn't shell-out, but uses C bindings for a faster inference and better performance. 

 LocalAI is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! It was initially created by [mudler](https://github.com/mudler/) at the [SpectroCloud OSS Office](https://github.com/spectrocloud).

+### News
+
+- 02-05-2023: Support for `rwkv.cpp` models ( https://github.com/go-skynet/LocalAI/pull/158 ) and for `/edits` endpoint
+- 01-05-2023: Support for SSE stream of tokens in `llama.cpp` backends ( https://github.com/go-skynet/LocalAI/pull/152 )
+
 ### Socials and community chatter
+
 - Follow [@LocalAI_API](https://twitter.com/LocalAI_API) on twitter.

 - [Reddit post](https://www.reddit.com/r/selfhosted/comments/12w4p2f/localai_openai_compatible_api_to_run_llm_models/) about LocalAI.
@@ -39,11 +45,45 @@ Tested with:
 - [GPT4ALL-J](https://gpt4all.io/models/ggml-gpt4all-j.bin)
 - Koala
 - [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml)
+- WizardLM
+- [RWKV](https://github.com/BlinkDL/RWKV-LM) models with [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp)

-It should also be compatible with StableLM and GPTNeoX ggml models (untested)
+### Vicuna, Alpaca, LLaMa...
+
+[llama.cpp](https://github.com/ggerganov/llama.cpp) based models are compatible
+
+### GPT4ALL

 Note: You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.

+### GPT4ALL-J
+
+No changes required to the model.
+
+### RWKV
+
+<details>
+
+A full example on how to run a rwkv model is in the [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/rwkv).
+
+Note: rwkv models have an associated tokenizer that needs to be provided along with the model file:
+
+```
+36464540 -rw-r--r--  1 mudler mudler 1.2G May  3 10:51 rwkv_small
+36464543 -rw-r--r--  1 mudler mudler 2.4M May  3 10:51 rwkv_small.tokenizer.json
+```
+
+</details>
+
+### Others
+
+It should also be compatible with StableLM and GPTNeoX ggml models (untested).
+
+### Hardware requirements
+
+Depending on the model you are attempting to run, you might need more RAM or CPU resources. See also [here](https://github.com/ggerganov/llama.cpp#memorydisk-requirements) for `ggml` based backends. `rwkv` is less expensive on resources.
+
+
 ## Usage

 > `LocalAI` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).
@@ -120,184 +160,59 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso

 To build locally, run `make build` (see below).

-## Other examples
-
-![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)
+### Other examples

 To see other examples on how to integrate with other projects for instance chatbot-ui, see: [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/).

-## Prompt templates 

-The API doesn't inject a default prompt for talking to the model. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.
-
-<details>
-You can use a default template for every model present in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl` which will be used as a default prompt and can be used with alpaca:
-
-```
-The below instruction describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-{{.Input}}
-
-### Response:
-```
-
-See the [prompt-templates](https://github.com/go-skynet/LocalAI/tree/master/prompt-templates) directory in this repository for templates for some of the most popular models.
-
-</details>
-
-## Installation
-
-Currently LocalAI comes as container images and can be used with docker or a containre engine of choice. 
-
-### Run LocalAI in Kubernetes
-
-LocalAI can be installed inside Kubernetes with helm.
-
-<details>
-The local-ai Helm chart supports two options for the LocalAI server's models directory:
-1. Basic deployment with no persistent volume. You must manually update the Deployment to configure your own models directory.
-
-    Install the chart with `.Values.deployment.volumes.enabled == false` and `.Values.dataVolume.enabled == false`.
-
-2. Advanced, two-phase deployment to provision the models directory using a DataVolume. Requires [Containerized Data Importer CDI](https://github.com/kubevirt/containerized-data-importer) to be pre-installed in your cluster.
-
-    First, install the chart with `.Values.deployment.volumes.enabled == false` and `.Values.dataVolume.enabled == true`:
-    ```bash
-    helm install local-ai charts/local-ai -n local-ai --create-namespace
-    ```
-    Wait for CDI to create an importer Pod for the DataVolume and for the importer pod to finish provisioning the model archive inside the PV.
-
-    Once the PV is provisioned and the importer Pod removed, set `.Values.deployment.volumes.enabled == true` and `.Values.dataVolume.enabled == false` and upgrade the chart:
-    ```bash
-    helm upgrade local-ai -n local-ai charts/local-ai
-    ```
-    This will update the local-ai Deployment to mount the PV that was provisioned by the DataVolume.
-
-</details>
-
-## API
-
-`LocalAI` provides an API for running text generation as a service, that follows the OpenAI reference and can be used as a drop-in. The models once loaded the first time will be kept in memory.
-
-<details>
-Example of starting the API with `docker`:
-
-```bash
-docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:latest --models-path /path/to/models --context-size 700 --threads 4
-```
-
-You should see:
-```
-┌───────────────────────────────────────────────────┐ 
-│                   Fiber v2.42.0                   │ 
-│               http://127.0.0.1:8080               │ 
-│       (bound on host 0.0.0.0 and port 8080)       │ 
-│                                                   │ 
-│ Handlers ............. 1  Processes ........... 1 │ 
-│ Prefork ....... Disabled  PID ................. 1 │ 
-└───────────────────────────────────────────────────┘ 
-```
-
-You can control the API server options with command line arguments:
-
-```
-local-api --models-path <model_path> [--address <address>] [--threads <num_threads>]
-```
-
-The API takes takes the following parameters:
-
-| Parameter    | Environment Variable | Default Value | Description                            |
-| ------------ | -------------------- | ------------- | -------------------------------------- |
-| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
-| threads      | THREADS              | Number of Physical cores     | The number of threads to use for text generation. |
-| address      | ADDRESS              | :8080         | The address and port to listen on. |
-| context-size | CONTEXT_SIZE         | 512           | Default token context size. |
-| debug | DEBUG         | false           | Enable debug mode. |
-| config-file | CONFIG_FILE         | empty           | Path to a LocalAI config file. |
-
-Once the server is running, you can start making requests to it using HTTP, using the OpenAI API. 
-
-</details>
-
-### Supported OpenAI API endpoints
-
-You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create). 
-
-Following the list of endpoints/parameters supported. 
-
-Note:
-
- You can also specify the model as part of the OpenAI token.
- If only one model is available, the API will use it for all the requests.
-
-#### Chat completions
-
-<details>
-For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
-
-```
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-koala-7b-model-q4_0-r2.bin",
-     "messages": [{"role": "user", "content": "Say this is a test!"}],
-     "temperature": 0.7
-   }'
-```
-
-Available additional parameters: `top_p`, `top_k`, `max_tokens`
-</details>
-
-#### Completions
-
-<details>
-
-To generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as per the request body:
-
-```
-curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-koala-7b-model-q4_0-r2.bin",
-     "prompt": "A long time ago in a galaxy far, far away",
-     "temperature": 0.7
-   }'
-```
-
-Available additional parameters: `top_p`, `top_k`, `max_tokens`
-
-</details>
-
-#### List models
-
-<details>
-You can list all the models available with:
-
-```
-curl http://localhost:8080/v1/models
-```
-
-</details>
-
-## Advanced configuration
+### Advanced configuration

 LocalAI can be configured to serve user-defined models with a set of default parameters and templates.

 <details>
-You can create multiple `yaml` files in the models path or either specify a single YAML configuration file.

-For instance, a configuration file (`gpt-3.5-turbo.yaml`) can be declaring the "gpt-3.5-turbo" model but backed by the "testmodel" model file:
+You can create multiple `yaml` files in the models path or either specify a single YAML configuration file. 
+Consider the following `models` folder in `examples/chatbot-ui`:
+
+```
+base ❯ ls -liah examples/chatbot-ui/models 
+36487587 drwxr-xr-x 2 mudler mudler 4.0K May  3 12:27 .
+36487586 drwxr-xr-x 3 mudler mudler 4.0K May  3 10:42 ..
+36465214 -rw-r--r-- 1 mudler mudler   10 Apr 27 07:46 completion.tmpl
+36464855 -rw-r--r-- 1 mudler mudler 3.6G Apr 27 00:08 ggml-gpt4all-j
+36464537 -rw-r--r-- 1 mudler mudler  245 May  3 10:42 gpt-3.5-turbo.yaml
+36467388 -rw-r--r-- 1 mudler mudler  180 Apr 27 07:46 gpt4all.tmpl
+```
+
+The `gpt-3.5-turbo.yaml` file defines the `gpt-3.5-turbo` model, which is an alias for `gpt4all-j` with pre-defined options.
+
+For instance, consider the following that declares `gpt-3.5-turbo` backed by the `ggml-gpt4all-j` model:

 ```yaml
 name: gpt-3.5-turbo
+# Default model parameters
 parameters:
-  model: testmodel
+  # Relative to the models path
+  model: ggml-gpt4all-j
+  # temperature
+  temperature: 0.3
+  # all the OpenAI request options here..
+
+# Default context size
 context_size: 512
 threads: 10
+# Define a backend (optional). By default it will try to guess the backend the first time the model is interacted with.
+backend: gptj # available: llama, stablelm, gpt2, gptj, rwkv
+# stopwords (if supported by the backend)
 stopwords:
 - "HUMAN:"
 - "### Response:"
+# define chat roles
 roles:
  user: "HUMAN:"
  system: "GPT:"
 template:
+  # template file ".tmpl" with the prompt template to use by default on the endpoint call. Note there is no extension in the files
  completion: completion
  chat: ggml-gpt4all-j
 ```
@@ -332,20 +247,101 @@ Specifying a `config-file` via CLI allows to declare models in a single file as
    system: "GPT:"
  template:
    completion: completion
-    chat: ggml-gpt4all-j
+   chat: ggml-gpt4all-j
 ```

 See also [chatbot-ui](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) as an example on how to use config files.

 </details>

-## Windows compatibility
+### Prompt templates 

-It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/LocalAI/issues/2
+The API doesn't inject a default prompt for talking to the model. You have to use a prompt similar to what's described in the stanford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.

-## Build locally
+<details>
+You can use a default template for every model present in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl` which will be used as a default prompt and can be used with alpaca:

-Pre-built images might fit well for most of the modern hardware, however you can and might need to build the images manually.
+```
+The below instruction describes a task. Write a response that appropriately completes the request.
+
+### Instruction:
+{{.Input}}
+
+### Response:
+```
+
+See the [prompt-templates](https://github.com/go-skynet/LocalAI/tree/master/prompt-templates) directory in this repository for templates for some of the most popular models.
+
+
+For the edit endpoint, an example template for alpaca-based models can be:
+
+```
+Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{{.Instruction}}
+
+### Input:
+{{.Input}}
+
+### Response:
+```
+
+</details>
+
+### CLI
+
+You can control LocalAI with command line arguments, to specify a binding address, or the number of threads.
+
+<details>
+
+Usage:
+
+```
+local-ai --models-path <model_path> [--address <address>] [--threads <num_threads>]
+```
+
+| Parameter    | Environment Variable | Default Value | Description                            |
+| ------------ | -------------------- | ------------- | -------------------------------------- |
+| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
+| threads      | THREADS              | Number of Physical cores     | The number of threads to use for text generation. |
+| address      | ADDRESS              | :8080         | The address and port to listen on. |
+| context-size | CONTEXT_SIZE         | 512           | Default token context size. |
+| debug | DEBUG         | false           | Enable debug mode. |
+| config-file | CONFIG_FILE         | empty           | Path to a LocalAI config file. |
+
+</details>
+
+## Setup
+
+Currently LocalAI comes as a container image and can be used with docker or a container engine of choice. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).
+
+### Docker
+
+<details>
+Example of starting the API with `docker`:
+
+```bash
+docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:latest --models-path /path/to/models --context-size 700 --threads 4
+```
+
+You should see:
+```
+┌───────────────────────────────────────────────────┐ 
+│                   Fiber v2.42.0                   │ 
+│               http://127.0.0.1:8080               │ 
+│       (bound on host 0.0.0.0 and port 8080)       │ 
+│                                                   │ 
+│ Handlers ............. 1  Processes ........... 1 │ 
+│ Prefork ....... Disabled  PID ................. 1 │ 
+└───────────────────────────────────────────────────┘ 
+```
+
+</details>
+
+### Build locally
+
+<details>

 In order to build the `LocalAI` container image locally you can use `docker`:

@@ -355,12 +351,182 @@ docker build -t LocalAI .
 docker run LocalAI
 ```

-Or build the binary with `make`:
+Or you can build the binary with `make`:

 ```
 make build
 ```

+</details>
+
+### Build on mac
+
+Building on Mac (M1 or M2) works, but you may need to install some prerequisites using `brew`. 
+
+<details>
+
+The below has been tested by one mac user and found to work. Note that this doesn't use docker to run the server:
+
+```
+# install build dependencies
+brew install cmake
+brew install go
+
+# clone the repo
+git clone https://github.com/go-skynet/LocalAI.git
+
+cd LocalAI
+
+# build the binary
+make build
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Use a template from the examples
+cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
+
+# Run LocalAI
+./local-ai --models-path ./models/ --debug
+
+# Now API is accessible at localhost:8080
+curl http://localhost:8080/v1/models
+
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-gpt4all-j",
+     "messages": [{"role": "user", "content": "How are you?"}],
+     "temperature": 0.9 
+   }'
+```
+
+</details>
+
+### Windows compatibility
+
+It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/LocalAI/issues/2
+
+### Run LocalAI in Kubernetes
+
+LocalAI can be installed inside Kubernetes with helm.
+
+<details>
+
+1. Add the helm repo
+    ```bash
+    helm repo add go-skynet https://go-skynet.github.io/helm-charts/
+    ```
+2. Create a values file with your settings:
+```bash
+cat <<EOF > values.yaml
+deployment:
+  image: quay.io/go-skynet/local-ai:latest
+  env:
+    threads: 4
+    contextSize: 1024
+    modelsPath: "/models"
+# Optionally create a PVC, mount the PV to the LocalAI Deployment,
+# and download a model to prepopulate the models directory
+modelsVolume:
+  enabled: true
+  url: "https://gpt4all.io/models/ggml-gpt4all-j.bin"
+  pvc:
+    size: 6Gi
+    accessModes:
+    - ReadWriteOnce
+  auth:
+    # Optional value for HTTP basic access authentication header
+    basic: "" # 'username:password' base64 encoded
+service:
+  type: ClusterIP
+  annotations: {}
+  # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
+  # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
+EOF
+```
+3. Install the helm chart:
+```bash
+helm repo update
+helm install local-ai go-skynet/local-ai -f values.yaml
+```
+
+Also check out the [helm chart repository on GitHub](https://github.com/go-skynet/helm-charts).
+
+</details>
+
+## Supported OpenAI API endpoints
+
+You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create). 
+
+The following is the list of supported endpoints and parameters.
+
+Note:
+
+- You can also specify the model as part of the OpenAI token.
+- If only one model is available, the API will use it for all the requests.
+
+### Chat completions
+
+<details>
+For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
+
+```
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "messages": [{"role": "user", "content": "Say this is a test!"}],
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`
+</details>
+
+### Edit completions
+
+<details>
+To generate an edit completion you can send a POST request to the `/v1/edits` endpoint with the instruction as the request body:
+
+```
+curl http://localhost:8080/v1/edits -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "instruction": "rephrase",
+     "input": "Black cat jumped out of the window",
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`.
+
+</details>
+
+### Completions
+
+<details>
+
+To generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as per the request body:
+
+```
+curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "prompt": "A long time ago in a galaxy far, far away",
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`
+
+</details>
+
+### List models
+
+<details>
+You can list all the models available with:
+
+```
+curl http://localhost:8080/v1/models
+```
+
+</details>
+
 ## Frequently asked questions

 Here are answers to some of the most common questions.
@@ -404,7 +570,7 @@ Not currently, as ggml doesn't support GPUs yet: https://github.com/ggerganov/ll
 ### Where is the webUI? 

 <details> 
-We are working on to have a good out of the box experience - however as LocalAI is an API you can already plug it into existing projects that provides are UI interfaces to OpenAI's APIs. There are several already on github, and should be compatible with LocalAI already (as it mimics the OpenAI API)
+The examples section includes localai-webui and chatbot-ui, which can be set up by following the instructions there. However, as LocalAI is an API, you can already plug it into existing projects that provide UI interfaces to OpenAI's APIs. There are several already on GitHub, and they should be compatible with LocalAI (as it mimics the OpenAI API).

 </details>

@@ -448,6 +614,13 @@ LocalAI is a community-driven project. It was initially created by [mudler](http

 MIT

+## Golang bindings used
+
+- [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- [go-skynet/go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp)
+- [go-skynet/go-gpt2.cpp](https://github.com/go-skynet/go-gpt2.cpp)
+- [donomii/go-rwkv.cpp](https://github.com/donomii/go-rwkv.cpp)
+
 ## Acknowledgements

 - [llama.cpp](https://github.com/ggerganov/llama.cpp)
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -79,7 +79,7 @@ var _ = Describe("API test", func() {
 		It("returns errors", func() {
 			_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
 			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: llama: model does not exist"))
+			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 5 errors occurred:"))
 		})

 	})
--- a/api/config.go
+++ b/api/config.go
@@ -21,6 +21,7 @@ type Config struct {
 	Threads        int               `yaml:"threads"`
 	Debug          bool              `yaml:"debug"`
 	Roles          map[string]string `yaml:"roles"`
+	Backend        string            `yaml:"backend"`
 	TemplateConfig TemplateConfig    `yaml:"template"`
 }

--- a/api/openai.go
+++ b/api/openai.go
@@ -2,6 +2,7 @@ package api

 import (
 	"bufio"
+	"bytes"
 	"encoding/json"
 	"fmt"
 	"os"
@@ -26,12 +27,19 @@ type ErrorResponse struct {
 	Error *APIError `json:"error,omitempty"`
 }

+type OpenAIUsage struct { // token counters emitted under the "usage" key of OpenAIResponse
+	PromptTokens     int `json:"prompt_tokens"` // no omitempty on any field: zero values are still serialized
+	CompletionTokens int `json:"completion_tokens"`
+	TotalTokens      int `json:"total_tokens"`
+}
+
 type OpenAIResponse struct {
-	Created int      `json:"created,omitempty"`
-	Object  string   `json:"object,omitempty"`
-	ID      string   `json:"id,omitempty"`
-	Model   string   `json:"model,omitempty"`
-	Choices []Choice `json:"choices,omitempty"`
+	Created int         `json:"created,omitempty"`
+	Object  string      `json:"object,omitempty"`
+	ID      string      `json:"id,omitempty"`
+	Model   string      `json:"model,omitempty"`
+	Choices []Choice    `json:"choices,omitempty"`
+	Usage   OpenAIUsage `json:"usage"`
 }

 type Choice struct {
@@ -56,13 +64,13 @@ type OpenAIRequest struct {
 	Model string `json:"model" yaml:"model"`

 	// Prompt is read only by completion API calls
-	Prompt string `json:"prompt" yaml:"prompt"`
+	Prompt interface{} `json:"prompt" yaml:"prompt"`

 	// Edit endpoint
 	Instruction string `json:"instruction" yaml:"instruction"`
 	Input       string `json:"input" yaml:"input"`

-	Stop string `json:"stop" yaml:"stop"`
+	Stop interface{} `json:"stop" yaml:"stop"`

 	// Messages is read only by chat/completion API calls
 	Messages []Message `json:"messages" yaml:"messages"`
@@ -116,8 +124,17 @@ func updateConfig(config *Config, input *OpenAIRequest) {
 		config.Maxtokens = input.Maxtokens
 	}

-	if input.Stop != "" {
-		config.StopWords = append(config.StopWords, input.Stop)
+	switch stop := input.Stop.(type) {
+	case string:
+		if stop != "" {
+			config.StopWords = append(config.StopWords, stop)
+		}
+	case []interface{}:
+		for _, pp := range stop {
+			if s, ok := pp.(string); ok {
+				config.StopWords = append(config.StopWords, s)
+			}
+		}
 	}

 	if input.RepeatPenalty != 0 {
@@ -227,27 +244,44 @@ func completionEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader,

 		log.Debug().Msgf("Parameter Config: %+v", config)

-		predInput := input.Prompt
+		predInput := []string{}
+
+		switch p := input.Prompt.(type) {
+		case string:
+			predInput = append(predInput, p)
+		case []interface{}:
+			for _, pp := range p {
+				if s, ok := pp.(string); ok {
+					predInput = append(predInput, s)
+				}
+			}
+		}
+
 		templateFile := config.Model

 		if config.TemplateConfig.Completion != "" {
 			templateFile = config.TemplateConfig.Completion
 		}

-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		templatedInput, err := loader.TemplatePrefix(templateFile, struct {
-			Input string
-		}{Input: predInput})
-		if err == nil {
-			predInput = templatedInput
-			log.Debug().Msgf("Template found, input modified to: %s", predInput)
-		}
+		var result []Choice
+		for _, i := range predInput {
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templatedInput, err := loader.TemplatePrefix(templateFile, struct {
+				Input string
+			}{Input: i})
+			if err == nil {
+				i = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", i)
+			}

-		result, err := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {
-			*c = append(*c, Choice{Text: s})
-		})
-		if err != nil {
-			return err
+			r, err := ComputeChoices(i, input, config, loader, func(s string, c *[]Choice) {
+				*c = append(*c, Choice{Text: s})
+			}, nil)
+			if err != nil {
+				return err
+			}
+
+			result = append(result, r...)
 		}

 		resp := &OpenAIResponse{
@@ -265,6 +299,21 @@ func completionEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader,
 }

 func chatEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, threads, ctx int, f16 bool) func(c *fiber.Ctx) error {
+
+	process := func(s string, req *OpenAIRequest, config *Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
+		ComputeChoices(s, req, config, loader, func(s string, c *[]Choice) {}, func(s string) bool {
+			resp := OpenAIResponse{
+				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
+				Choices: []Choice{{Delta: &Message{Role: "assistant", Content: s}}},
+				Object:  "chat.completion.chunk",
+			}
+			log.Debug().Msgf("Sending goroutine: %s", s)
+
+			responses <- resp
+			return true
+		})
+		close(responses)
+	}
 	return func(c *fiber.Ctx) error {
 		config, input, err := readConfig(cm, c, loader, debug, threads, ctx, f16)
 		if err != nil {
@@ -290,8 +339,9 @@ func chatEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread

 		if input.Stream {
 			log.Debug().Msgf("Stream request received")
+			c.Context().SetContentType("text/event-stream")
 			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
-			c.Set("Content-Type", "text/event-stream; charset=utf-8")
+			//	c.Set("Content-Type", "text/event-stream")
 			c.Set("Cache-Control", "no-cache")
 			c.Set("Connection", "keep-alive")
 			c.Set("Transfer-Encoding", "chunked")
@@ -312,13 +362,40 @@ func chatEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread
 			log.Debug().Msgf("Template found, input modified to: %s", predInput)
 		}

+		if input.Stream {
+			responses := make(chan OpenAIResponse)
+
+			go process(predInput, input, config, loader, responses)
+
+			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
+
+				for ev := range responses {
+					var buf bytes.Buffer
+					enc := json.NewEncoder(&buf)
+					enc.Encode(ev)
+
+					fmt.Fprintf(w, "event: data\n\n")
+					fmt.Fprintf(w, "data: %v\n\n", buf.String())
+					log.Debug().Msgf("Sending chunk: %s", buf.String())
+					w.Flush()
+				}
+
+				w.WriteString("event: data\n\n")
+				resp := &OpenAIResponse{
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					Choices: []Choice{{FinishReason: "stop"}},
+				}
+				respData, _ := json.Marshal(resp)
+
+				w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
+				w.Flush()
+			}))
+			return nil
+		}
+
 		result, err := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {
-			if input.Stream {
-				*c = append(*c, Choice{Delta: &Message{Role: "assistant", Content: s}})
-			} else {
-				*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: s}})
-			}
-		})
+			*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: s}})
+		}, nil)
 		if err != nil {
 			return err
 		}
@@ -328,36 +405,8 @@ func chatEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread
 			Choices: result,
 			Object:  "chat.completion",
 		}
-
-		if input.Stream {
-			resp.Object = "chat.completion.chunk"
-			jsonResult, _ := json.Marshal(resp)
-			log.Debug().Msgf("Response: %s", jsonResult)
-			log.Debug().Msgf("Handling stream request")
-			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
-				fmt.Fprintf(w, "event: data\n")
-				w.Flush()
-
-				fmt.Fprintf(w, "data: %s\n\n", jsonResult)
-				w.Flush()
-
-				fmt.Fprintf(w, "event: data\n")
-				w.Flush()
-
-				resp := &OpenAIResponse{
-					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []Choice{{FinishReason: "stop"}},
-				}
-				respData, _ := json.Marshal(resp)
-
-				fmt.Fprintf(w, "data: %s\n\n", respData)
-				w.Flush()
-
-				//	fmt.Fprintf(w, "data: [DONE]\n\n")
-				//		w.Flush()
-			}))
-			return nil
-		}
+		respData, _ := json.Marshal(resp)
+		log.Debug().Msgf("Response: %s", respData)

 		// Return the prediction in the response body
 		return c.JSON(resp)
@@ -392,7 +441,7 @@ func editEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, thread

 		result, err := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {
 			*c = append(*c, Choice{Text: s})
-		})
+		}, nil)
 		if err != nil {
 			return err
 		}
--- a/api/prediction.go
+++ b/api/prediction.go
@@ -6,26 +6,103 @@ import (
 	"strings"
 	"sync"

+	"github.com/donomii/go-rwkv.cpp"
 	model "github.com/go-skynet/LocalAI/pkg/model"
 	gpt2 "github.com/go-skynet/go-gpt2.cpp"
 	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
 	llama "github.com/go-skynet/go-llama.cpp"
+	"github.com/hashicorp/go-multierror"
 )

+const tokenizerSuffix = ".tokenizer.json"
+
 // mutex still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
 var mutexMap sync.Mutex
 var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)

-func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (string, error), error) {
-	var model *llama.LLama
-	var gptModel *gptj.GPTJ
-	var gpt2Model *gpt2.GPT2
-	var stableLMModel *gpt2.StableLM
+var loadedModels map[string]interface{} = map[string]interface{}{}
+var muModels sync.Mutex

+func backendLoader(backendString string, loader *model.ModelLoader, modelFile string, llamaOpts []llama.ModelOption, threads uint32) (model interface{}, err error) { // loads modelFile with the single backend named by backendString
+	switch strings.ToLower(backendString) { // backend name is matched case-insensitively
+	case "llama":
+		return loader.LoadLLaMAModel(modelFile, llamaOpts...) // llamaOpts are only passed to the llama backend
+	case "stablelm":
+		return loader.LoadStableLMModel(modelFile)
+	case "gpt2":
+		return loader.LoadGPT2Model(modelFile)
+	case "gptj":
+		return loader.LoadGPTJModel(modelFile)
+	case "rwkv":
+		return loader.LoadRWKV(modelFile, modelFile+tokenizerSuffix, threads) // tokenizer path is the model path plus tokenizerSuffix; threads is only passed to rwkv
+	default:
+		return nil, fmt.Errorf("backend unsupported: %s", backendString) // unknown names are rejected; no fallback to another backend
+	}
+}
+
+func greedyLoader(loader *model.ModelLoader, modelFile string, llamaOpts []llama.ModelOption, threads uint32) (model interface{}, err error) { // tries each backend in turn and returns the first model that loads
+	updateModels := func(model interface{}) { // records a successfully loaded model in loadedModels, keyed by modelFile
+		muModels.Lock()
+		defer muModels.Unlock()
+		loadedModels[modelFile] = model
+	}
+
+	muModels.Lock()
+	m, exists := loadedModels[modelFile] // reuse a model previously stored for the same file
+	if exists {
+		muModels.Unlock()
+		return m, nil
+	}
+	muModels.Unlock() // muModels is not held during the load attempts below
+
+	model, modelerr := loader.LoadLLaMAModel(modelFile, llamaOpts...) // attempt order: LLaMA, GPT-J, GPT-2, StableLM, RWKV
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr) // accumulate each backend's failure for the final message
+	}
+
+	model, modelerr = loader.LoadGPTJModel(modelFile)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	model, modelerr = loader.LoadGPT2Model(modelFile)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	model, modelerr = loader.LoadStableLMModel(modelFile)
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	model, modelerr = loader.LoadRWKV(modelFile, modelFile+tokenizerSuffix, threads) // tokenizer path is the model path plus tokenizerSuffix
+	if modelerr == nil {
+		updateModels(model)
+		return model, nil
+	} else {
+		err = multierror.Append(err, modelerr)
+	}
+
+	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error()) // reached only when all five attempts failed, so err is non-nil here
+}
+
+func ModelInference(s string, loader *model.ModelLoader, c Config, tokenCallback func(string) bool) (func() (string, error), error) {
+	supportStreams := false
 	modelFile := c.Model

 	// Try to load the model
-	var llamaerr, gpt2err, gptjerr, stableerr error
 	llamaOpts := []llama.ModelOption{}
 	if c.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(c.ContextSize))
@@ -34,25 +111,38 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
 	}

-	// TODO: this is ugly, better identifying the model somehow! however, it is a good stab for a first implementation..
-	model, llamaerr = loader.LoadLLaMAModel(modelFile, llamaOpts...)
-	if llamaerr != nil {
-		gptModel, gptjerr = loader.LoadGPTJModel(modelFile)
-		if gptjerr != nil {
-			gpt2Model, gpt2err = loader.LoadGPT2Model(modelFile)
-			if gpt2err != nil {
-				stableLMModel, stableerr = loader.LoadStableLMModel(modelFile)
-				if stableerr != nil {
-					return nil, fmt.Errorf("llama: %s gpt: %s gpt2: %s stableLM: %s", llamaerr.Error(), gptjerr.Error(), gpt2err.Error(), stableerr.Error()) // llama failed first, so we want to catch both errors
-				}
-			}
-		}
+	var inferenceModel interface{}
+	var err error
+	if c.Backend == "" {
+		inferenceModel, err = greedyLoader(loader, modelFile, llamaOpts, uint32(c.Threads))
+	} else {
+		inferenceModel, err = backendLoader(c.Backend, loader, modelFile, llamaOpts, uint32(c.Threads))
+	}
+	if err != nil {
+		return nil, err
 	}

 	var fn func() (string, error)

-	switch {
-	case stableLMModel != nil:
+	switch model := inferenceModel.(type) {
+	case *rwkv.RwkvState:
+		supportStreams = true
+
+		fn = func() (string, error) {
+			stopWord := "\n"
+			if len(c.StopWords) > 0 {
+				stopWord = c.StopWords[0]
+			}
+
+			if err := model.ProcessInput(s); err != nil {
+				return "", err
+			}
+
+			response := model.GenerateResponse(c.Maxtokens, stopWord, float32(c.Temperature), float32(c.TopP), tokenCallback)
+
+			return response, nil
+		}
+	case *gpt2.StableLM:
 		fn = func() (string, error) {
 			// Generate the prediction using the language model
 			predictOptions := []gpt2.PredictOption{
@@ -71,12 +161,12 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 				predictOptions = append(predictOptions, gpt2.SetSeed(c.Seed))
 			}

-			return stableLMModel.Predict(
+			return model.Predict(
 				s,
 				predictOptions...,
 			)
 		}
-	case gpt2Model != nil:
+	case *gpt2.GPT2:
 		fn = func() (string, error) {
 			// Generate the prediction using the language model
 			predictOptions := []gpt2.PredictOption{
@@ -95,12 +185,12 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 				predictOptions = append(predictOptions, gpt2.SetSeed(c.Seed))
 			}

-			return gpt2Model.Predict(
+			return model.Predict(
 				s,
 				predictOptions...,
 			)
 		}
-	case gptModel != nil:
+	case *gptj.GPTJ:
 		fn = func() (string, error) {
 			// Generate the prediction using the language model
 			predictOptions := []gptj.PredictOption{
@@ -119,13 +209,19 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 				predictOptions = append(predictOptions, gptj.SetSeed(c.Seed))
 			}

-			return gptModel.Predict(
+			return model.Predict(
 				s,
 				predictOptions...,
 			)
 		}
-	case model != nil:
+	case *llama.LLama:
+		supportStreams = true
 		fn = func() (string, error) {
+
+			if tokenCallback != nil {
+				model.SetTokenCallback(tokenCallback)
+			}
+
 			// Generate the prediction using the language model
 			predictOptions := []llama.PredictOption{
 				llama.SetTemperature(c.Temperature),
@@ -165,10 +261,15 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 				predictOptions = append(predictOptions, llama.SetSeed(c.Seed))
 			}

-			return model.Predict(
+			str, er := model.Predict(
 				s,
 				predictOptions...,
 			)
+			// Seems that if we don't free the callback explicitly we leave functions registered (that might try to send on closed channels)
+			// For instance otherwise the API returns: {"error":{"code":500,"message":"send on closed channel","type":""}}
+			// after a stream event has occurred
+			model.SetTokenCallback(nil)
+			return str, er
 		}
 	}

@@ -185,11 +286,15 @@ func ModelInference(s string, loader *model.ModelLoader, c Config) (func() (stri
 		l.Lock()
 		defer l.Unlock()

-		return fn()
+		res, err := fn()
+		if tokenCallback != nil && !supportStreams {
+			tokenCallback(res)
+		}
+		return res, err
 	}, nil
 }

-func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, loader *model.ModelLoader, cb func(string, *[]Choice)) ([]Choice, error) {
+func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
 	result := []Choice{}

 	n := input.N
@@ -199,7 +304,7 @@ func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, load
 	}

 	// get the model function to call for the result
-	predFunc, err := ModelInference(predInput, loader, *config)
+	predFunc, err := ModelInference(predInput, loader, *config, tokenCallback)
 	if err != nil {
 		return result, err
 	}
--- a/charts/local-ai/Chart.yaml
+++ b/charts/local-ai/Chart.yaml
@@ -1,6 +0,0 @@
-apiVersion: v2
-appVersion: 0.1.0
-description: A Helm chart for LocalAI
-name: local-ai
-type: application
-version: 1.0.0
--- a/charts/local-ai/templates/_helpers.tpl
+++ b/charts/local-ai/templates/_helpers.tpl
@@ -1,44 +0,0 @@
-{{/*
-Expand the name of the chart.
-*/}}
-{{- define "local-ai.name" -}}
-{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
-{{- end }}
-
-{{/*
-Create a default fully qualified app name.
-We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
-If release name contains chart name it will be used as a full name.
-*/}}
-{{- define "local-ai.fullname" -}}
-{{- if .Values.fullnameOverride }}
-{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
-{{- else }}
-{{- $name := default .Chart.Name .Values.nameOverride }}
-{{- if contains $name .Release.Name }}
-{{- .Release.Name | trunc 63 | trimSuffix "-" }}
-{{- else }}
-{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
-{{- end }}
-{{- end }}
-{{- end }}
-
-{{/*
-Create chart name and version as used by the chart label.
-*/}}
-{{- define "local-ai.chart" -}}
-{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
-{{- end }}
-
-{{/*
-Common labels
-*/}}
-{{- define "local-ai.labels" -}}
-helm.sh/chart: {{ include "local-ai.chart" . }}
-app.kubernetes.io/name: {{ include "local-ai.name" . }}
-app.kubernetes.io/instance: "{{ .Release.Name }}"
-app.kubernetes.io/managed-by: {{ .Release.Service }}
-{{- if .Chart.AppVersion }}
-app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
-{{- end }}
-{{- end }}
--- a/charts/local-ai/templates/data-volume.yaml
+++ b/charts/local-ai/templates/data-volume.yaml
@@ -1,39 +0,0 @@
-{{- if .Values.dataVolume.enabled }}
-apiVersion: cdi.kubevirt.io/v1beta1
-kind: DataVolume
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-spec:
-  contentType: archive
-  source:
-    {{ .Values.dataVolume.source.type }}:
-      url: {{ .Values.dataVolume.source.url }}
-      secretRef: {{ template "local-ai.fullname" . }}
-      {{- if and (eq .Values.dataVolume.source.type "http") .Values.dataVolume.source.secretExtraHeaders }}
-      secretExtraHeaders: {{ .Values.dataVolume.source.secretExtraHeaders }}
-      {{- end }}
-      {{- if .Values.dataVolume.source.caCertConfigMap }}
-      caCertConfigMap: {{ .Values.dataVolume.source.caCertConfigMap }}
-      {{- end }}
-  pvc:
-    accessModes: {{ .Values.dataVolume.pvc.accessModes }}
-    resources:
-      requests:
-        storage: {{ .Values.dataVolume.pvc.size }}
---
-{{- if .Values.dataVolume.secret.enabled }}
-apiVersion: v1
-kind: Secret
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-data:
-  accessKeyId: {{ .Values.dataVolume.secret.username }}
-  secretKey: {{ .Values.dataVolume.secret.password }}
-{{- end }}
-{{- end }}
--- a/charts/local-ai/templates/deployment.yaml
+++ b/charts/local-ai/templates/deployment.yaml
@@ -1,39 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-spec:
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: {{ include "local-ai.name" . }}
-      app.kubernetes.io/instance: {{ .Release.Name }}
-  replicas: 1
-  template:
-    metadata:
-      name: {{ template "local-ai.fullname" . }}
-      labels:
-        app.kubernetes.io/name: {{ include "local-ai.name" . }}
-        app.kubernetes.io/instance: {{ .Release.Name }}
-    spec:
-      containers:
-        - name: {{ template "local-ai.fullname" . }}
-          image: {{ .Values.deployment.image }}
-          env:
-          - name: THREADS
-            value: {{ .Values.deployment.env.threads | quote }}
-          - name: CONTEXT_SIZE
-            value: {{ .Values.deployment.env.contextSize | quote }}
-          - name: MODELS_PATH
-            value: {{ .Values.deployment.env.modelsPath }}
-{{- if .Values.deployment.volume.enabled }}
-          volumeMounts:
-          - mountPath: {{ .Values.deployment.env.modelsPath }}
-            name: models
-      volumes:
-      - name: models
-        persistentVolumeClaim:
-          claimName: {{ template "local-ai.fullname" . }}
-{{- end }}
--- a/charts/local-ai/templates/service.yaml
+++ b/charts/local-ai/templates/service.yaml
@@ -1,19 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-{{- if .Values.service.annotations }}
-  annotations:
-  {{ toYaml .Values.service.annotations | indent 4 }}
-{{- end }}
-spec:
-  selector:
-    app.kubernetes.io/name: {{ include "local-ai.name" . }}
-  type: "{{ .Values.service.type }}"
-  ports:
-    - protocol: TCP
-      port: 8080
-      targetPort: 8080
--- a/charts/local-ai/values.yaml
+++ b/charts/local-ai/values.yaml
@@ -1,38 +0,0 @@
-deployment:
-  image: quay.io/go-skynet/local-ai:latest
-  env:
-    threads: 14
-    contextSize: 512
-    modelsPath: "/models"
-  volume:
-    enabled: false
-
-service:
-  type: ClusterIP
-  annotations: {}
-  # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
-  # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
-
-# Optionally create a PVC containing a model binary, sourced from an arbitrary HTTP server or S3 bucket
-# (requires https://github.com/kubevirt/containerized-data-importer)
-dataVolume:
-  enabled: false
-  source:
-    type: "http" # Source type. One of: [ http | s3 ]
-    url: "http://<model_server>/<model_archive>" # e.g. koala-7B-4bit-128g.GGML.tar
-
-    # CertConfigMap is an optional ConfigMap reference, containing a Certificate Authority (CA) public key
-    # and a base64 encoded pem certificate
-    caCertConfigMap: ""
-
-    # SecretExtraHeaders is an optional list of Secret references, each containing an extra HTTP header
-    # that may include sensitive information. Only applicable for the http source type.
-    secretExtraHeaders: []
-  pvc:
-    accessModes:
-    - ReadWriteOnce
-    size: 5Gi
-  secret:
-    enabled: false
-    username: "" # base64 encoded
-    password: "" # base64 encoded
--- a/examples/README.md
+++ b/examples/README.md
@@ -6,6 +6,11 @@ Here is a list of projects that can easily be integrated with the LocalAI backen

 - [chatbot-ui](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui/) (by [@mkellerman](https://github.com/mkellerman))
 - [discord-bot](https://github.com/go-skynet/LocalAI/tree/master/examples/discord-bot/) (by [@mudler](https://github.com/mudler))
+- [langchain](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain/) (by [@dave-gray101](https://github.com/dave-gray101))
+- [langchain-python](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain-python/) (by [@mudler](https://github.com/mudler))
+- [localai-webui](https://github.com/go-skynet/LocalAI/tree/master/examples/localai-webui/) (by [@dhruvgera](https://github.com/dhruvgera))
+- [rwkv](https://github.com/go-skynet/LocalAI/tree/master/examples/rwkv/) (by [@mudler](https://github.com/mudler))
+- [slack-bot](https://github.com/go-skynet/LocalAI/tree/master/examples/slack-bot/) (by [@mudler](https://github.com/mudler))

 ## Want to contribute?

--- a/examples/chatbot-ui/README.md
+++ b/examples/chatbot-ui/README.md
@@ -22,5 +22,25 @@ wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
 docker-compose up -d --build
 ```

+## Pointing chatbot-ui to a separately managed LocalAI service
+
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+```
+version: '3.6'
+
+services:
+  chatgpt:
+    image: ghcr.io/mckaywrigley/chatbot-ui:main
+    ports:
+      - 3000:3000
+    environment:
+      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
+      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
+```
+
+Once you've edited the `docker-compose` file, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+
+## Accessing chatbot-ui
+
 Open http://localhost:3000 for the Web UI.

--- a/examples/discord-bot/README.md
+++ b/examples/discord-bot/README.md
@@ -1,5 +1,7 @@
 # discord-bot

+![Screenshot from 2023-05-01 07-58-19](https://user-images.githubusercontent.com/2420543/235413924-0cb2e75b-f2d6-4119-8610-44386e44afb8.png)
+
 ## Setup

 ```bash
@@ -8,15 +10,13 @@ git clone https://github.com/go-skynet/LocalAI

 cd LocalAI/examples/discord-bot

-git clone https://github.com/go-skynet/gpt-discord-bot.git
-
 # (optional) Checkout a specific LocalAI tag
 # git checkout -b build <TAG>

 # Download gpt4all-j to models/
 wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j

-# Set the discord bot options
+# Set the discord bot options (see: https://github.com/go-skynet/gpt-discord-bot#setup)
 cp -rfv .env.example .env
 vim .env

@@ -24,5 +24,53 @@ vim .env
 docker-compose up -d --build
 ```

+Note: see setup options here: https://github.com/go-skynet/gpt-discord-bot#setup
+
 Open up the URL in the console and give permission to the bot in your server. Start a thread with `/chat ..`

+## Kubernetes
+
+- install the local-ai chart first
+- change OPENAI_API_BASE to point to the API address and apply the discord-bot manifest:
+
+```yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: discord-bot
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: localai
+  namespace: discord-bot
+  labels:
+    app: localai
+spec:
+  selector:
+    matchLabels:
+      app: localai
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: localai
+      name: localai
+    spec:
+      containers:
+        - name: localai-discord
+          env:
+          - name: OPENAI_API_KEY
+            value: "x"
+          - name: DISCORD_BOT_TOKEN
+            value: ""
+          - name: DISCORD_CLIENT_ID
+            value: ""
+          - name: OPENAI_API_BASE
+            value: "http://local-ai.default.svc.cluster.local:8080"
+          - name: ALLOWED_SERVER_IDS
+            value: "xx"
+          - name: SERVER_TO_MODERATION_CHANNEL
+            value: "1:1"
+          image: quay.io/go-skynet/gpt-discord-bot:main
+```
--- a/examples/discord-bot/docker-compose.yaml
+++ b/examples/discord-bot/docker-compose.yaml
@@ -16,8 +16,6 @@ services:
    command: ["/usr/bin/local-ai" ]

  bot:
-    build:
-      context: ./gpt-discord-bot
-      dockerfile: Dockerfile
+    image: quay.io/go-skynet/gpt-discord-bot:main
    env_file:
    - .env
--- a/examples/langchain-python/README.md
+++ b/examples/langchain-python/README.md
@@ -0,0 +1,33 @@
+## Langchain-python
+
+Langchain example from [quickstart](https://python.langchain.com/en/latest/getting_started/getting_started.html).
+
+To interact with langchain, you can just set the `OPENAI_API_BASE` URL and provide a token with a random string.
+
+See the example below:
+
+```
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/langchain-python
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# start with docker-compose
+docker-compose up -d --build
+
+
+pip install langchain
+pip install openai
+
+export OPENAI_API_BASE=http://localhost:8080
+export OPENAI_API_KEY=sk-
+
+python test.py
+# A good company name for a company that makes colorful socks would be "Colorsocks".
+```
--- a/examples/langchain-python/docker-compose.yaml
+++ b/examples/langchain-python/docker-compose.yaml
@@ -0,0 +1,16 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
--- a/examples/langchain-python/models
+++ b/examples/langchain-python/models
@@ -0,0 +1 @@
+../chatbot-ui/models
--- a/examples/langchain-python/test.py
+++ b/examples/langchain-python/test.py
@@ -0,0 +1,6 @@
+
+from langchain.llms import OpenAI
+
+llm = OpenAI(temperature=0.9,model_name="gpt-3.5-turbo")
+text = "What would be a good company name for a company that makes colorful socks?"
+print(llm(text))
--- a/examples/langchain/.gitignore
+++ b/examples/langchain/.gitignore
@@ -0,0 +1,2 @@
+models/ggml-koala-13B-4bit-128g
+models/ggml-gpt4all-j
--- a/examples/langchain/JS.Dockerfile
+++ b/examples/langchain/JS.Dockerfile
@@ -0,0 +1,6 @@
+FROM node:latest
+COPY ./langchainjs-localai-example /app
+WORKDIR /app
+RUN npm install
+RUN npm run build
+ENTRYPOINT [ "npm", "run", "start" ]
--- a/examples/langchain/PY.Dockerfile
+++ b/examples/langchain/PY.Dockerfile
@@ -0,0 +1,5 @@
+FROM python:3.10-bullseye
+COPY ./langchainpy-localai-example /app
+WORKDIR /app
+RUN pip install --no-cache-dir -r requirements.txt
+ENTRYPOINT [ "python", "./simple_demo.py" ]
--- a/examples/langchain/README.md
+++ b/examples/langchain/README.md
@@ -0,0 +1,30 @@
+# langchain
+
+Example of using langchain, with the standard OpenAI llm module, and LocalAI. Has docker compose profiles for both the Typescript and Python versions.
+
+**Please Note** - This is a tech demo example at this time. ggml-gpt4all-j has pretty terrible results for most langchain applications with the settings used in this example.
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/langchain
+
+# (optional) - Edit the example code in typescript.
+# vi ./langchainjs-localai-example/index.ts
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# start with docker-compose for typescript!
+docker-compose --profile ts up --build
+
+# or start with docker-compose for python!
+docker-compose --profile py up --build
+```
+
+## Copyright
+
+Some of the example code in index.mts is adapted from the langchainjs project and is Copyright (c) Harrison Chase. Used under the terms of the MIT license, as is the remainder of this code.
--- a/examples/langchain/docker-compose.yaml
+++ b/examples/langchain/docker-compose.yaml
@@ -0,0 +1,43 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
+
+  js:
+    build:
+      context: .
+      dockerfile: JS.Dockerfile
+    profiles:
+      - js
+      - ts
+    depends_on:
+    - "api"
+    environment:
+      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
+      - 'OPENAI_API_BASE=http://api:8080/v1'
+      - 'MODEL_NAME=gpt-3.5-turbo' #gpt-3.5-turbo' # ggml-gpt4all-j' # ggml-koala-13B-4bit-128g'
+
+  py:
+    build:
+      context: .
+      dockerfile: PY.Dockerfile
+    profiles:
+      - py
+    depends_on:
+    - "api"
+    environment:
+      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
+      - 'OPENAI_API_BASE=http://api:8080/v1'
+      - 'MODEL_NAME=gpt-3.5-turbo' #gpt-3.5-turbo' # ggml-gpt4all-j' # ggml-koala-13B-4bit-128g'
--- a/examples/langchain/langchainjs-localai-example/.gitignore
+++ b/examples/langchain/langchainjs-localai-example/.gitignore
@@ -0,0 +1,2 @@
+node_modules/
+dist/
--- a/examples/langchain/langchainjs-localai-example/.vscode/launch.json
+++ b/examples/langchain/langchainjs-localai-example/.vscode/launch.json
@@ -0,0 +1,20 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "type": "node",
+            "request": "launch",
+            "name": "Launch Program",
+            // "skipFiles": [
+            //     "<node_internals>/**"
+            // ],
+            "program": "${workspaceFolder}\\dist\\index.mjs",
+            "outFiles": [
+                "${workspaceFolder}/**/*.js"
+            ]
+        }
+    ]
+}
--- a/examples/langchain/langchainjs-localai-example/package-lock.json
+++ b/examples/langchain/langchainjs-localai-example/package-lock.json
--- a/examples/langchain/langchainjs-localai-example/package.json
+++ b/examples/langchain/langchainjs-localai-example/package.json
@@ -0,0 +1,21 @@
+{
+  "name": "langchainjs-localai-example",
+  "version": "0.1.0",
+  "description": "Trivial Example of using langchain + the OpenAI API + LocalAI together",
+  "main": "index.mjs",
+  "scripts": {
+    "build": "tsc --build",
+    "clean": "tsc --build --clean",
+    "start": "node --trace-warnings dist/index.mjs"
+  },
+  "author": "dave@gray101.com",
+  "license": "MIT",
+  "devDependencies": {
+    "@types/node": "^18.16.3",
+    "typescript": "^5.0.4"
+  },
+  "dependencies": {
+    "langchain": "^0.0.67",
+    "typeorm": "^0.3.15"
+  }
+}
--- a/examples/langchain/langchainjs-localai-example/src/index.mts
+++ b/examples/langchain/langchainjs-localai-example/src/index.mts
@@ -0,0 +1,79 @@
+import { OpenAIChat } from "langchain/llms/openai";
+import { loadQAStuffChain } from "langchain/chains";
+import { Document } from "langchain/document";
+import { initializeAgentExecutorWithOptions } from "langchain/agents";
+import {Calculator} from "langchain/tools/calculator";
+
+const pathToLocalAi = process.env['OPENAI_API_BASE'] || 'http://api:8080/v1';
+const fakeApiKey = process.env['OPENAI_API_KEY'] || '-';
+const modelName = process.env['MODEL_NAME'] || 'gpt-3.5-turbo';
+
+function getModel(): OpenAIChat {
+  // System prompt placed in front of every conversation sent through this model.
+  const piratePersona = "You are a helpful assistant that answers in pirate language";
+  // Client configuration: base URL and key taken from the module-level settings.
+  const clientConfig = { basePath: pathToLocalAi, apiKey: fakeApiKey };
+
+  return new OpenAIChat(
+    {
+      prefixMessages: [{ role: "system", content: piratePersona }],
+      modelName,
+      maxTokens: 50,
+      openAIApiKey: fakeApiKey,
+      maxRetries: 2,
+    },
+    clientConfig
+  );
+}
+
+// Minimal example: send a single prompt to the model and log the reply,
+// printing a UTC timestamp before and after the call.
+export const run = async () => {
+  const model = getModel();
+  console.log(`about to model.call at ${new Date().toUTCString()}`);
+  const res = await model.call(
+    "What would be a good company name for a company that makes colorful socks?"
+  );
+  console.log(`${new Date().toUTCString()}`);
+  console.log({ res });
+};
+
+await run();
+
+// Question answering over two in-memory documents via the `StuffDocumentsChain`.
+export const run2 = async () => {
+  const chainA = loadQAStuffChain(getModel());
+  // Each string becomes the pageContent of one Document handed to the chain.
+  const pageContents = ["Harrison went to Harvard.", "Ankush went to Princeton."];
+  const resA = await chainA.call({
+    input_documents: pageContents.map((pageContent) => new Document({ pageContent })),
+    question: "Where did Harrison go to college?",
+  });
+  console.log({ resA });
+};
+
+await run2();
+
+// Tools + agents example, put together quickly.
+// It looks like it should work, but it does not yet.
+export const temporarilyBrokenToolTest = async () => {
+  const model = getModel();
+  // The calculator is the only tool made available to the agent.
+  const tools = [new Calculator(true)];
+  const executor = await initializeAgentExecutorWithOptions(tools, model, {
+    agentType: "zero-shot-react-description",
+  });
+  console.log("Loaded agent.");
+
+  const input = `What is the value of (500 *2) + 350 - 13?`;
+  console.log(`Executing with input "${input}"...`);
+
+  const { output } = await executor.call({ input });
+  console.log(`Got output ${output}`);
+}
+
+await temporarilyBrokenToolTest();
--- a/examples/langchain/langchainjs-localai-example/tsconfig.json
+++ b/examples/langchain/langchainjs-localai-example/tsconfig.json
@@ -0,0 +1,15 @@
+{
+  "compilerOptions": {
+    "target": "es2022",
+    "lib": ["ES2022", "DOM"],
+    "module": "ES2022",
+    "moduleResolution": "node",
+    "strict": true,
+    "esModuleInterop": true,
+    "allowSyntheticDefaultImports": true,
+    "isolatedModules": true,
+    "outDir": "./dist"
+  },
+  "include": ["src", "test"],
+  "exclude": ["node_modules", "dist"]
+}
--- a/examples/langchain/langchainpy-localai-example/.vscode/launch.json
+++ b/examples/langchain/langchainpy-localai-example/.vscode/launch.json
@@ -0,0 +1,24 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "redirectOutput": true,
+            "justMyCode": false
+        },
+        {
+            "name": "Python: Attach to Port 5678",
+            "type": "python",
+            "request": "attach",
+            "connect": {
+                "host": "localhost",
+                "port": 5678
+              },
+            "justMyCode": false
+        }
+    ]
+}
--- a/examples/langchain/langchainpy-localai-example/.vscode/settings.json
+++ b/examples/langchain/langchainpy-localai-example/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.defaultInterpreterPath": "${workspaceFolder}/.venv/Scripts/python"
+}
--- a/examples/langchain/langchainpy-localai-example/full_demo.py
+++ b/examples/langchain/langchainpy-localai-example/full_demo.py
@@ -0,0 +1,39 @@
+import os
+from langchain.chat_models import ChatOpenAI
+from langchain import PromptTemplate, LLMChain
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    SystemMessagePromptTemplate,
+    AIMessagePromptTemplate,
+    HumanMessagePromptTemplate,
+)
+from langchain.schema import (
+    AIMessage,
+    HumanMessage,
+    SystemMessage
+)
+
+print('Langchain + LocalAI PYTHON Tests')
+
+base_path = os.environ.get('OPENAI_API_BASE', 'http://api:8080/v1')
+key = os.environ.get('OPENAI_API_KEY', '-')
+model_name = os.environ.get('MODEL_NAME', 'gpt-3.5-turbo')
+
+
+chat = ChatOpenAI(temperature=0, openai_api_base=base_path, openai_api_key=key, model_name=model_name, max_tokens=100)
+
+print("Created ChatOpenAI for ", chat.model_name)
+
+template = "You are a helpful assistant that translates {input_language} to {output_language}."
+system_message_prompt = SystemMessagePromptTemplate.from_template(template)
+human_template = "{text}"
+human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
+
+chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
+
+print("ABOUT to execute")
+
+# get a chat completion from the formatted messages
+chat(chat_prompt.format_prompt(input_language="English", output_language="French", text="I love programming.").to_messages())
+
+print(".");
--- a/examples/langchain/langchainpy-localai-example/requirements.txt
+++ b/examples/langchain/langchainpy-localai-example/requirements.txt
@@ -0,0 +1,32 @@
+aiohttp==3.8.4
+aiosignal==1.3.1
+async-timeout==4.0.2
+attrs==23.1.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+colorama==0.4.6
+dataclasses-json==0.5.7
+debugpy==1.6.7
+frozenlist==1.3.3
+greenlet==2.0.2
+idna==3.4
+langchain==0.0.157
+marshmallow==3.19.0
+marshmallow-enum==1.5.1
+multidict==6.0.4
+mypy-extensions==1.0.0
+numexpr==2.8.4
+numpy==1.24.3
+openai==0.27.6
+openapi-schema-pydantic==1.2.4
+packaging==23.1
+pydantic==1.10.7
+PyYAML==6.0
+requests==2.29.0
+SQLAlchemy==2.0.12
+tenacity==8.2.2
+tqdm==4.65.0
+typing-inspect==0.8.0
+typing_extensions==4.5.0
+urllib3==1.26.15
+yarl==1.9.2
--- a/examples/langchain/langchainpy-localai-example/simple_demo.py
+++ b/examples/langchain/langchainpy-localai-example/simple_demo.py
@@ -0,0 +1,6 @@
+
+from langchain.llms import OpenAI
+
+llm = OpenAI(temperature=0.9,model_name="gpt-3.5-turbo")
+text = "What would be a good company name for a company that makes colorful socks?"
+print(llm(text))
--- a/examples/langchain/models/completion.tmpl
+++ b/examples/langchain/models/completion.tmpl
@@ -0,0 +1 @@
+{{.Input}}
--- a/examples/langchain/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain/models/gpt-3.5-turbo.yaml
@@ -0,0 +1,18 @@
+name: gpt-3.5-turbo
+parameters:
+  model: ggml-gpt4all-j # ggml-koala-13B-4bit-128g
+  top_k: 80
+  temperature: 0.2
+  top_p: 0.7
+context_size: 1024
+threads: 4
+stopwords:
+- "HUMAN:"
+- "GPT:"
+roles:
+  user: " "
+  system: " "
+backend: "gptj"
+template:
+  completion: completion
+  chat: completion # gpt4all
--- a/examples/langchain/models/gpt4all.tmpl
+++ b/examples/langchain/models/gpt4all.tmpl
@@ -0,0 +1,4 @@
+The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
+### Prompt:
+{{.Input}}
+### Response:
--- a/examples/localai-webui/README.md
+++ b/examples/localai-webui/README.md
@@ -0,0 +1,26 @@
+# localai-webui
+
+Example of integration with [dhruvgera/localai-frontend](https://github.com/Dhruvgera/LocalAI-frontend).
+
+![image](https://user-images.githubusercontent.com/42107491/235344183-44b5967d-ba22-4331-804c-8da7004a5d35.png)
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/localai-webui
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download any desired models to models/ in the parent LocalAI project dir
+# For example: wget https://gpt4all.io/models/ggml-gpt4all-j.bin
+
+# start with docker-compose
+docker-compose up -d --build
+```
+
+Open http://localhost:3000 for the Web UI.
+
--- a/examples/localai-webui/docker-compose.yml
+++ b/examples/localai-webui/docker-compose.yml
@@ -0,0 +1,20 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - 8080:8080
+    env_file:
+      - .env
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai"]
+
+  frontend:
+    image: quay.io/go-skynet/localai-frontend:master
+    ports:
+      - 3000:3000
--- a/examples/rwkv/Dockerfile.build
+++ b/examples/rwkv/Dockerfile.build
@@ -0,0 +1,10 @@
+# Tooling image used to convert an rwkv model (one-off step).
+FROM python
+
+# Python packages installed for the model conversion
+RUN pip3 install torch numpy
+
+WORKDIR /build
+# Copy the local scripts/ directory into /build
+# (presumably this provides the build.sh used as entrypoint below - TODO confirm)
+COPY ./scripts/ .
+
+# Fetch rwkv.cpp together with its submodules and build it in Release configuration
+RUN git clone --recurse-submodules https://github.com/saharNooby/rwkv.cpp && cd rwkv.cpp && cmake . && cmake --build . --config Release
+# Exec-form entrypoint: arguments given to `docker run` are passed to build.sh
+ENTRYPOINT [ "/build/build.sh" ]
--- a/examples/rwkv/README.md
+++ b/examples/rwkv/README.md
@@ -0,0 +1,59 @@
+# rwkv
+
+Example of how to run rwkv models.
+
+## Run models
+
+Setup:
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/rwkv
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# build the tooling image to convert an rwkv model locally:
+docker build -t rwkv-converter -f Dockerfile.build .
+
+# download and convert a model (one-off) - it's going to be fast on CPU too!
+docker run -ti --name converter -v $PWD:/data rwkv-converter https://huggingface.co/BlinkDL/rwkv-4-raven/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%25-Other1%25-20230425-ctx4096.pth /data/models/rwkv
+
+# Get the tokenizer
+wget https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O models/rwkv.tokenizer.json
+
+# start with docker-compose
+docker-compose up -d --build
+```
+
+Test it out:
+
+```bash
+curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
+    "model": "gpt-3.5-turbo",
+    "prompt": "A long time ago, in a galaxy far away",
+    "max_tokens": 100,
+    "temperature": 0.9, "top_p": 0.8, "top_k": 80
+  }'
+
+# {"object":"text_completion","model":"gpt-3.5-turbo","choices":[{"text":", there was a small group of five friends: Annie, Bryan, Charlie, Emily, and Jesse."}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}
+
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "gpt-3.5-turbo",            
+     "messages": [{"role": "user", "content": "How are you?"}],
+     "temperature": 0.9, "top_p": 0.8, "top_k": 80
+   }'
+
+# {"object":"chat.completion","model":"gpt-3.5-turbo","choices":[{"message":{"role":"assistant","content":" Good, thanks. I am about to go to bed. I' ll talk to you later.Bye."}}],"usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}
+```
+
+### Fine tuning
+
+See [RWKV-LM](https://github.com/BlinkDL/RWKV-LM#training--fine-tuning). There is also a Google [colab](https://colab.research.google.com/github/resloved/RWKV-notebooks/blob/master/RWKV_v4_RNN_Pile_Fine_Tuning.ipynb).
+
+## See also
+
+- [RWKV-LM](https://github.com/BlinkDL/RWKV-LM)
+- [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp)
--- a/examples/rwkv/docker-compose.yaml
+++ b/examples/rwkv/docker-compose.yaml
@@ -0,0 +1,16 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
--- a/examples/rwkv/models/gpt-3.5-turbo.yaml
+++ b/examples/rwkv/models/gpt-3.5-turbo.yaml
@@ -0,0 +1,19 @@
+name: gpt-3.5-turbo
+parameters:
+  model: rwkv
+  top_k: 80
+  temperature: 0.9
+  max_tokens: 100
+  top_p: 0.8
+context_size: 1024
+threads: 14
+backend: "rwkv"
+cutwords:
+- "Bob:.*"
+roles:
+  user: "Bob:"
+  system: "Alice:"
+  assistant: "Alice:"
+template:
+  completion: rwkv_completion
+  chat: rwkv_chat
--- a/examples/rwkv/models/rwkv_chat.tmpl
+++ b/examples/rwkv/models/rwkv_chat.tmpl
@@ -0,0 +1,13 @@
+The following is a verbose detailed conversation between Bob and a woman, Alice. Alice is intelligent, friendly and likeable. Alice is likely to agree with Bob.
+
+Bob: Hello Alice, how are you doing?
+
+Alice: Hi Bob! Thanks, I'm fine. What about you?
+
+Bob: I am very good! It's nice to see you. Would you mind me chatting with you for a while?
+
+Alice: Not at all! I'm listening.
+
+{{.Input}}
+
+Alice: 
--- a/examples/rwkv/models/rwkv_completion.tmpl
+++ b/examples/rwkv/models/rwkv_completion.tmpl
@@ -0,0 +1 @@
+Complete the following sentence: {{.Input}} 
--- a/examples/slack-bot/.env.example
+++ b/examples/slack-bot/.env.example
@@ -0,0 +1,11 @@
+SLACK_APP_TOKEN=xapp-1-...
+SLACK_BOT_TOKEN=xoxb-...
+OPENAI_API_KEY=sk-...
+OPENAI_API_BASE=http://api:8080
+OPENAI_MODEL=gpt-3.5-turbo
+OPENAI_TIMEOUT_SECONDS=60
+#OPENAI_SYSTEM_TEXT="You proofread text. When you receive a message, you will check
+#for mistakes and make suggestion to improve the language of the given text"
+USE_SLACK_LANGUAGE=true
+SLACK_APP_LOG_LEVEL=INFO
+TRANSLATE_MARKDOWN=true
--- a/examples/slack-bot/README.md
+++ b/examples/slack-bot/README.md
@@ -0,0 +1,27 @@
+# Slack bot
+
+Slackbot using: https://github.com/seratch/ChatGPT-in-Slack
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/slack-bot
+
+git clone https://github.com/seratch/ChatGPT-in-Slack
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Set the Slack bot options (see: https://github.com/seratch/ChatGPT-in-Slack)
+cp -rfv .env.example .env
+vim .env
+
+# start with docker-compose
+docker-compose up -d --build
+```
--- a/examples/slack-bot/docker-compose.yaml
+++ b/examples/slack-bot/docker-compose.yaml
@@ -0,0 +1,23 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - 8080:8080
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
+
+  bot:
+    build:
+     context: ./ChatGPT-in-Slack
+     dockerfile: Dockerfile
+    env_file:
+    - .env
--- a/examples/slack-bot/models
+++ b/examples/slack-bot/models
@@ -0,0 +1 @@
+../chatbot-ui/models
--- a/go.mod
+++ b/go.mod
@@ -3,18 +3,20 @@ module github.com/go-skynet/LocalAI
 go 1.19

 require (
+	github.com/donomii/go-rwkv.cpp v0.0.0-20230503112711-af62fcc432be
 	github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708
 	github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c
-	github.com/go-skynet/go-llama.cpp v0.0.0-20230430075552-377fd245eae2
+	github.com/go-skynet/go-llama.cpp v0.0.0-20230503200855-2e6ae1269e03
 	github.com/gofiber/fiber/v2 v2.44.0
+	github.com/hashicorp/go-multierror v1.1.1
 	github.com/jaypipes/ghw v0.10.0
-	github.com/onsi/ginkgo/v2 v2.9.2
+	github.com/onsi/ginkgo/v2 v2.9.4
 	github.com/onsi/gomega v1.27.6
 	github.com/otiai10/openaigo v1.1.0
 	github.com/rs/zerolog v1.29.1
-	github.com/sashabaranov/go-openai v1.9.0
-	github.com/urfave/cli/v2 v2.25.1
-	github.com/valyala/fasthttp v1.46.0
+	github.com/sashabaranov/go-openai v1.9.3
+	github.com/urfave/cli/v2 v2.25.3
+	github.com/valyala/fasthttp v1.47.0
 	gopkg.in/yaml.v3 v3.0.1
 )

@@ -23,12 +25,13 @@ require (
 	github.com/andybalholm/brotli v1.0.5 // indirect
 	github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
 	github.com/ghodss/yaml v1.0.0 // indirect
-	github.com/go-logr/logr v1.2.3 // indirect
+	github.com/go-logr/logr v1.2.4 // indirect
 	github.com/go-ole/go-ole v1.2.6 // indirect
 	github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
 	github.com/google/go-cmp v0.5.9 // indirect
 	github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect
 	github.com/google/uuid v1.3.0 // indirect
+	github.com/hashicorp/errwrap v1.0.0 // indirect
 	github.com/jaypipes/pcidb v1.0.0 // indirect
 	github.com/klauspost/compress v1.16.3 // indirect
 	github.com/kr/text v0.2.0 // indirect
@@ -46,10 +49,10 @@ require (
 	github.com/valyala/bytebufferpool v1.0.0 // indirect
 	github.com/valyala/tcplisten v1.0.0 // indirect
 	github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
-	golang.org/x/net v0.8.0 // indirect
+	golang.org/x/net v0.9.0 // indirect
 	golang.org/x/sys v0.7.0 // indirect
-	golang.org/x/text v0.8.0 // indirect
-	golang.org/x/tools v0.7.0 // indirect
+	golang.org/x/text v0.9.0 // indirect
+	golang.org/x/tools v0.8.0 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	howett.net/plist v1.0.0 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -12,10 +12,16 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230502223004-0a3db3d72e7d h1:lSHwlYf1H4WAWYgf7rjEVTGen1qmigUq2Egpu8mnQiY=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230502223004-0a3db3d72e7d/go.mod h1:H6QBF7/Tz6DAEBDXQged4H1BvsmqY/K5FG9wQRGa01g=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230503112711-af62fcc432be h1:3Hic97PY6hcw/SY44RuR7kyONkxd744RFeRrqckzwNQ=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230503112711-af62fcc432be/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM=
 github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
 github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
 github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
 github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
+github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
 github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
 github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
@@ -23,14 +29,12 @@ github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708 h1:cfOi4TWvQ
 github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708/go.mod h1:1Wj/xbkMfwQSOrhNYK178IzqQHstZbRfhx4s8p1M5VM=
 github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c h1:48I7jpLNGiQeBmF0SFVVbREh8vlG0zN13v9LH5ctXis=
 github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c/go.mod h1:5VZ9XbcINI0XcHhkcX8GPK8TplFGAzu1Hrg4tNiMCtI=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230428071219-3d084e4299e9 h1:N/0SBefkMFao6GiGhIF7+5EdYOMHn4KnCG2AFcIXPt0=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230428071219-3d084e4299e9/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230429125915-9bf702fe56b9 h1:20/tdOA4+b7Y7lCob+q2sczfOSz0pp+14L32adYJ+uQ=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230429125915-9bf702fe56b9/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230429225431-361b9f87de6d h1:7KDq1Uylm1mXphQ+M2qztekXAvODtXvJDHrXQguRw9k=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230429225431-361b9f87de6d/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
 github.com/go-skynet/go-llama.cpp v0.0.0-20230430075552-377fd245eae2 h1:CYQRCbOfYtC77OxweAyrdxSVwoLIM/EdZ6Ij+xBzta8=
 github.com/go-skynet/go-llama.cpp v0.0.0-20230430075552-377fd245eae2/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230502121737-8ceb6167e405 h1:pbIxJ/eiL1Irdprxk/mquaxjR1XDGCE+7CT9BGJNRaY=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230502121737-8ceb6167e405/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230503200855-2e6ae1269e03 h1:j9fhITFhkz4SczJU0jIaMYo5tdTVTrj+zdhEgWHEr40=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230503200855-2e6ae1269e03/go.mod h1:LvSQx5QAYBAMpWkbyVFFDiM1Tzj8LP55DvmUM3hbRMY=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
@@ -43,6 +47,10 @@ github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE
 github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
 github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
 github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
+github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
+github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
+github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
 github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
 github.com/jaypipes/ghw v0.10.0 h1:UHu9UX08Py315iPojADFPOkmjTsNzHj4g4adsNKKteY=
 github.com/jaypipes/ghw v0.10.0/go.mod h1:jeJGbkRB2lL3/gxYzNYzEDETV1ZJ56OKr+CSeSEym+g=
@@ -67,6 +75,10 @@ github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG
 github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
 github.com/onsi/ginkgo/v2 v2.9.2/go.mod h1:WHcJJG2dIlcCqVfBAwUCrJxSPFb6v4azBwgxeMeDuts=
+github.com/onsi/ginkgo/v2 v2.9.3 h1:5X2vl/isiKqkrOYjiaGgp3JQOcLV59g5o5SuTMqCcxU=
+github.com/onsi/ginkgo/v2 v2.9.3/go.mod h1:gCQYp2Q+kSoIj7ykSVb9nskRSsR6PUj4AiLywzIhbKM=
+github.com/onsi/ginkgo/v2 v2.9.4 h1:xR7vG4IXt5RWx6FfIjyAtsoMAtnc3C/rFXBBd2AjZwE=
+github.com/onsi/ginkgo/v2 v2.9.4/go.mod h1:gCQYp2Q+kSoIj7ykSVb9nskRSsR6PUj4AiLywzIhbKM=
 github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
 github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg=
 github.com/otiai10/mint v1.4.1 h1:HOVBfKP1oXIc0wWo9hZ8JLdZtyCPWqjvmFDuVZ0yv2Y=
@@ -86,8 +98,12 @@ github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
 github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
 github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/sashabaranov/go-openai v1.9.0 h1:NoiO++IISxxJ1pRc0n7uZvMGMake0G+FJ1XPwXtprsA=
-github.com/sashabaranov/go-openai v1.9.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
+github.com/sashabaranov/go-openai v1.9.1 h1:3N52HkJKo9Zlo/oe1AVv5ZkCOny0ra58/ACvAxkN3MM=
+github.com/sashabaranov/go-openai v1.9.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
+github.com/sashabaranov/go-openai v1.9.2 h1:7//Glm9EiMBjelgmBb00yYzKYqm1jckHWWTDLahfeuQ=
+github.com/sashabaranov/go-openai v1.9.2/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
+github.com/sashabaranov/go-openai v1.9.3 h1:uNak3Rn5pPsKRs9bdT7RqRZEyej/zdZOEI2/8wvrFtM=
+github.com/sashabaranov/go-openai v1.9.3/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 h1:rmMl4fXJhKMNWl+K+r/fq4FbbKI+Ia2m9hYBLm2h4G4=
 github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94/go.mod h1:90zrgN3D/WJsDd1iXHT96alCoN2KJo6/4x1DZC3wZs8=
 github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d/go.mod h1:Gy+0tqhJvgGlqnTF8CVGP0AaGRjwBtXs/a5PA0Y3+A4=
@@ -99,12 +115,12 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
 github.com/tinylib/msgp v1.1.6/go.mod h1:75BAfg2hauQhs3qedfdDZmWAPcFMAvJE5b9rGOMufyw=
 github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0=
 github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw=
-github.com/urfave/cli/v2 v2.25.1 h1:zw8dSP7ghX0Gmm8vugrs6q9Ku0wzweqPyshy+syu9Gw=
-github.com/urfave/cli/v2 v2.25.1/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
+github.com/urfave/cli/v2 v2.25.3 h1:VJkt6wvEBOoSjPFQvOkv6iWIrsJyCrKGtCtxXWwmGeY=
+github.com/urfave/cli/v2 v2.25.3/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
 github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
 github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
-github.com/valyala/fasthttp v1.46.0 h1:6ZRhrFg8zBXTRYY6vdzbFhqsBd7FVv123pV2m9V87U4=
-github.com/valyala/fasthttp v1.46.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA=
+github.com/valyala/fasthttp v1.47.0 h1:y7moDoxYzMooFpT5aHgNgVOQDrS3qlkfiP9mDtGGK9c=
+github.com/valyala/fasthttp v1.47.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA=
 github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
 github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
 github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU=
@@ -126,6 +142,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug
 golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
 golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
 golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
+golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM=
+golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -155,6 +173,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
 golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
@@ -162,6 +182,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc
 golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ=
 golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
 golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s=
+golang.org/x/tools v0.8.0 h1:vSDcovVPld282ceKgDimkRSC8kpaH1dgyc9UMzlt84Y=
+golang.org/x/tools v0.8.0/go.mod h1:JxBZ99ISMI5ViVkT1tr6tdNmXeTrcpVSD3vZ1RsRdN4=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
--- a/main.go
+++ b/main.go
@@ -1,11 +1,12 @@
 package main

 import (
+	"fmt"
 	"os"
+	"path/filepath"

 	api "github.com/go-skynet/LocalAI/api"
 	model "github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/jaypipes/ghw"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 	"github.com/urfave/cli/v2"
@@ -20,12 +21,6 @@ func main() {
 		os.Exit(1)
 	}

-	threads := 4
-	cpu, err := ghw.CPU()
-	if err == nil {
-		threads = int(cpu.TotalCores)
-	}
-
 	app := &cli.App{
 		Name:  "LocalAI",
 		Usage: "OpenAI compatible API for running LLaMA/GPT models locally on CPU with consumer grade hardware.",
@@ -42,13 +37,13 @@ func main() {
 				Name:        "threads",
 				DefaultText: "Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested.",
 				EnvVars:     []string{"THREADS"},
-				Value:       threads,
+				Value:       4,
 			},
 			&cli.StringFlag{
 				Name:        "models-path",
 				DefaultText: "Path containing models used for inferencing",
 				EnvVars:     []string{"MODELS_PATH"},
-				Value:       path,
+				Value:       filepath.Join(path, "models"),
 			},
 			&cli.StringFlag{
 				Name:        "config-file",
@@ -85,6 +80,7 @@ It uses llama.cpp, ggml and gpt4all as backend with golang c bindings.
 		UsageText: `local-ai [options]`,
 		Copyright: "go-skynet authors",
 		Action: func(ctx *cli.Context) error {
+			fmt.Printf("Starting LocalAI using %d threads, with models path: %s\n", ctx.Int("threads"), ctx.String("models-path"))
 			return api.App(ctx.String("config-file"), model.NewModelLoader(ctx.String("models-path")), ctx.Int("threads"), ctx.Int("context-size"), ctx.Bool("f16"), ctx.Bool("debug"), false).Listen(ctx.String("address"))
 		},
 	}
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -12,6 +12,7 @@ import (

 	"github.com/rs/zerolog/log"

+	rwkv "github.com/donomii/go-rwkv.cpp"
 	gpt2 "github.com/go-skynet/go-gpt2.cpp"
 	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
 	llama "github.com/go-skynet/go-llama.cpp"
@@ -25,8 +26,8 @@ type ModelLoader struct {
 	gptmodels         map[string]*gptj.GPTJ
 	gpt2models        map[string]*gpt2.GPT2
 	gptstablelmmodels map[string]*gpt2.StableLM
-
-	promptsTemplates map[string]*template.Template
+	rwkv              map[string]*rwkv.RwkvState
+	promptsTemplates  map[string]*template.Template
 }

 func NewModelLoader(modelPath string) *ModelLoader {
@@ -36,6 +37,7 @@ func NewModelLoader(modelPath string) *ModelLoader {
 		gptmodels:         make(map[string]*gptj.GPTJ),
 		gptstablelmmodels: make(map[string]*gpt2.StableLM),
 		models:            make(map[string]*llama.LLama),
+		rwkv:              make(map[string]*rwkv.RwkvState),
 		promptsTemplates:  make(map[string]*template.Template),
 	}
 }
@@ -79,10 +81,9 @@ func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string,
 		if exists {
 			m = t
 		}
-
 	}
 	if m == nil {
-		return "", nil
+		return "", fmt.Errorf("failed loading any template")
 	}

 	var buf bytes.Buffer
@@ -168,13 +169,6 @@ func (ml *ModelLoader) LoadGPT2Model(modelName string) (*gpt2.GPT2, error) {
 		return m, nil
 	}

-	// TODO: This needs refactoring, it's really bad to have it in here
-	// Check if we have a GPTStable model loaded instead - if we do we return an error so the API tries with StableLM
-	if _, ok := ml.gptstablelmmodels[modelName]; ok {
-		log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPTStableLM one")
-	}
-
 	// Load the model and keep it in memory for later use
 	modelFile := filepath.Join(ml.ModelPath, modelName)
 	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
@@ -207,17 +201,6 @@ func (ml *ModelLoader) LoadGPTJModel(modelName string) (*gptj.GPTJ, error) {
 		return m, nil
 	}

-	// TODO: This needs refactoring, it's really bad to have it in here
-	// Check if we have a GPT2 model loaded instead - if we do we return an error so the API tries with GPT2
-	if _, ok := ml.gpt2models[modelName]; ok {
-		log.Debug().Msgf("Model is GPT2: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPT2 one")
-	}
-	if _, ok := ml.gptstablelmmodels[modelName]; ok {
-		log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPTStableLM one")
-	}
-
 	// Load the model and keep it in memory for later use
 	modelFile := filepath.Join(ml.ModelPath, modelName)
 	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
@@ -236,6 +219,36 @@ func (ml *ModelLoader) LoadGPTJModel(modelName string) (*gptj.GPTJ, error) {
 	return model, err
 }

+func (ml *ModelLoader) LoadRWKV(modelName, tokenFile string, threads uint32) (*rwkv.RwkvState, error) { // returns the RWKV state cached under modelName, loading it from ml.ModelPath on first use
+	ml.mu.Lock() // held until the function returns (see the deferred Unlock)
+	defer ml.mu.Unlock()
+
+	log.Debug().Msgf("Loading model name: %s", modelName)
+
+	// Fail early when ExistsInModelPath does not report the model as present
+	if !ml.ExistsInModelPath(modelName) {
+		return nil, fmt.Errorf("model does not exist")
+	}
+
+	if m, ok := ml.rwkv[modelName]; ok { // cache hit: keyed by modelName only, tokenFile and threads are not consulted
+		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
+		return m, nil
+	}
+
+	// Load the model and keep it in memory for later use; model and token files are both resolved relative to ml.ModelPath
+	modelFile := filepath.Join(ml.ModelPath, modelName)
+	tokenPath := filepath.Join(ml.ModelPath, tokenFile)
+	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
+
+	model := rwkv.LoadFiles(modelFile, tokenPath, threads)
+	if model == nil { // a nil result from rwkv.LoadFiles is reported as a load failure
+		return nil, fmt.Errorf("could not load model")
+	}
+
+	ml.rwkv[modelName] = model // remember the state so later calls with the same modelName return it directly
+	return model, nil
+}
+
 func (ml *ModelLoader) LoadLLaMAModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()
@@ -252,21 +265,6 @@ func (ml *ModelLoader) LoadLLaMAModel(modelName string, opts ...llama.ModelOptio
 		return m, nil
 	}

-	// TODO: This needs refactoring, it's really bad to have it in here
-	// Check if we have a GPTJ model loaded instead - if we do we return an error so the API tries with GPTJ
-	if _, ok := ml.gptmodels[modelName]; ok {
-		log.Debug().Msgf("Model is GPTJ: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPTJ one")
-	}
-	if _, ok := ml.gpt2models[modelName]; ok {
-		log.Debug().Msgf("Model is GPT2: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPT2 one")
-	}
-	if _, ok := ml.gptstablelmmodels[modelName]; ok {
-		log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
-		return nil, fmt.Errorf("this model is a GPTStableLM one")
-	}
-
 	// Load the model and keep it in memory for later use
 	modelFile := filepath.Join(ml.ModelPath, modelName)
 	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
--- a/prompt-templates/wizardlm.tmpl
+++ b/prompt-templates/wizardlm.tmpl
@@ -0,0 +1,3 @@
+{{.Input}}
+
+### Response:
--- a/renovate.json
+++ b/renovate.json
@@ -1,17 +1,4 @@
 {
  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
-  "extends": [
-    "config:base"
-  ],
-  "regexManagers": [
-    {
-      "fileMatch": [
-        "^Makefile$"
-      ],
-      "matchStrings": [
-        "#\\s*renovate:\\s*datasource=(?<datasource>.*?) depName=(?<depName>.*?)( datasourceTemplate=(?<datasourceTemplate>.*?))?( packageNameTemplate=(?<packageNameTemplate>.*?))?( depNameTemplate=(?<depNameTemplate>.*?))?( valueTemplate=(?<currentValueTemplate>.*?))?( versioning=(?<versioning>.*?))?\\s+.+_VERSION=(?<currentValue>.*?)\\s"
-      ],
-      "versioningTemplate": "{{#if versioning}}{{versioning}}{{/if}}"
-    }
-  ]
+  "extends": ["config:base"]
 }
Author	SHA1	Message	Date
Ettore Di Giacinto	714bfcd45b	fix: missing returning error and free callback stream (#187 )	2023-05-04 19:49:43 +02:00
renovate[bot]	77ce8b953e	fix(deps): update github.com/donomii/go-rwkv.cpp digest to af62fcc (#171 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-04 18:30:48 +02:00
renovate[bot]	01ada95941	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 2e6ae12 (#172 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-04 18:30:11 +02:00
ci-robbot [bot]	eabdc5042a	⬆️ Update go-skynet/go-llama.cpp (#184 ) Signed-off-by: GitHub <noreply@github.com> Co-authored-by: mudler <mudler@users.noreply.github.com>	2023-05-04 18:28:49 +02:00
Dhruv Gera	96267d9437	localai: Include the WebUI project example (#130 ) Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-05-04 18:27:58 +02:00
Ettore Di Giacinto	9497a24127	fix: hardcode default number of cores to '4' (#186 )	2023-05-04 18:14:58 +02:00
Ettore Di Giacinto	fdf75c6d0e	rwkv fixes and examples (#185 )	2023-05-04 17:32:23 +02:00
mudler	6352308882	ci: minor fixups	2023-05-04 15:08:20 +02:00
mudler	a8172a0f4e	ci: fix typo	2023-05-04 15:04:41 +02:00
mudler	ebcd10d66f	ci: manually update deps	2023-05-04 15:01:29 +02:00
mudler	885642915f	ci: add renovate suffix	2023-05-04 12:26:59 +02:00
mudler	2e424491c0	ci: lookupNameTemplate -> depNameTemplate	2023-05-04 12:23:05 +02:00
mudler	aa6faef8f7	ci: versioning -> versioningTemplate	2023-05-04 12:07:29 +02:00
mudler	b3254baf60	ci: add versioning	2023-05-04 12:05:39 +02:00
mudler	0a43d27f0e	ci: update renovate	2023-05-04 12:02:19 +02:00
Ettore Di Giacinto	3fe11fe24d	ci: attempt to configure renovate with custom regexes (#178 )	2023-05-04 11:55:14 +02:00
renovate[bot]	af18fdc749	fix(deps): update module github.com/sashabaranov/go-openai to v1.9.3 (#174 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-04 08:44:02 +02:00
renovate[bot]	32b5eddd7d	fix(deps): update module github.com/onsi/ginkgo/v2 to v2.9.4 (#173 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-04 08:41:51 +02:00
Dave	07c3aa1869	Dockerized Langchain / PY example (#175 )	2023-05-04 08:41:13 +02:00
renovate[bot]	e59bad89e7	fix(deps): update module github.com/sashabaranov/go-openai to v1.9.2 (#164 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-03 23:05:50 +02:00
Jeremy Price	b971807980	Looks for models in $CWD/models/ dir by default (#169 )	2023-05-03 23:03:31 +02:00
Ettore Di Giacinto	c974dad799	Return usage in the API responses (#166 )	2023-05-03 17:29:18 +02:00
Ettore Di Giacinto	4eae570ef5	Update docs (#163 )	2023-05-03 15:51:54 +02:00
Ettore Di Giacinto	67992a7d99	feat: support slices or strings in the prompt completion endpoint (#162 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-03 13:13:31 +02:00
renovate[bot]	0a4899f366	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 8ceb616 (#150 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-03 11:48:06 +02:00
renovate[bot]	1eb02f6c91	fix(deps): update module github.com/onsi/ginkgo/v2 to v2.9.3 (#161 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-03 11:47:54 +02:00
mudler	575874e4fb	readme: minor update	2023-05-03 11:46:29 +02:00
Ettore Di Giacinto	751b7eca62	feat: add rwkv support (#158 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-03 11:45:22 +02:00
Ettore Di Giacinto	1ae7150810	feat: allow to specify default backend for model (#156 ) Signed-off-by: mudler <mudler@c3os.io>	2023-05-03 00:31:28 +02:00
Ettore Di Giacinto	70caf9bf8c	feat: support stopwords both string and arrays (#154 )	2023-05-02 23:30:00 +02:00
Dave	0b226ac027	Stop parameter of OpenAIRequest changed to String Array (#153 )	2023-05-02 22:02:45 +02:00
Ettore Di Giacinto	220d6fd59b	feat: add stream events (#152 )	2023-05-02 20:03:35 +02:00
antongisli	0a00a4b58e	adding mac build and example (#151 ) Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-05-02 19:24:45 +02:00
Ettore Di Giacinto	156e15a4fa	Bump llama.cpp, downgrade gpt4all-j (#149 )	2023-05-02 16:07:18 +02:00
renovate[bot]	271d3f6673	fix(deps): update module github.com/valyala/fasthttp to v1.47.0 (#143 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 23:36:58 +02:00
Ettore Di Giacinto	fec4ab93c5	docs: Add langchain to the example index (#147 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 23:21:07 +02:00
renovate[bot]	38a7a7a54d	fix(deps): update github.com/go-skynet/go-gpt4all-j.cpp digest to 77bf8c1 (#141 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-05-01 23:18:41 +02:00
Ettore Di Giacinto	0db0704e2c	docs: Add slack-bot example (#145 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 23:18:24 +02:00
Dave	88f472e5d2	Add LangchainJS Examples (#146 )	2023-05-01 23:18:14 +02:00
Ettore Di Giacinto	92452d46da	feat: add new gpt4all-j binding (#142 )	2023-05-01 20:00:15 +02:00
Ettore Di Giacinto	ac70252d70	drop: remove helm charts, now in separate repo (#134 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 18:07:41 +02:00
renovate[bot]	f6451d2518	fix(deps): update module github.com/urfave/cli/v2 to v2.25.3 (#140 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 18:07:29 +02:00
Ettore Di Giacinto	2473f9d19b	docs: add discord-bot preview (#137 )	2023-05-01 11:03:34 +02:00
renovate[bot]	bc583385a9	fix(deps): update module github.com/urfave/cli/v2 to v2.25.2 (#136 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 07:53:48 +02:00
renovate[bot]	8286bfbab7	fix(deps): update module github.com/sashabaranov/go-openai to v1.9.1 (#135 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 07:52:20 +02:00
Ettore Di Giacinto	d129fabe3b	docs: enhancements (#133 )	2023-04-30 23:27:02 +02:00
				`@@ -0,0 +1 @@`
				`Complete the following sentence: {{.Input}}`