Update docs (#163 )

feat: support slices or strings in the prompt completion endpoint (#162 )
Signed-off-by: mudler <mudler@mocaccino.org>
2026-02-03 03:02:38 -05:00 · 2023-05-03 15:51:54 +02:00 · 2023-05-03 13:13:31 +02:00 · 2023-05-03 11:48:06 +02:00 · 2023-05-03 11:47:54 +02:00 · 2023-05-03 11:46:29 +02:00
63 changed files with 5247 additions and 726 deletions
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,3 @@
+ARG GO_VERSION=1.20
+FROM mcr.microsoft.com/devcontainers/go:0-$GO_VERSION-bullseye
+RUN apt-get update && apt-get install -y cmake
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,46 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-docker-compose
+{
+	"name": "Existing Docker Compose (Extend)",
+
+	// Update the 'dockerComposeFile' list if you have more compose files or use different names.
+	// The .devcontainer/docker-compose.yml file contains any overrides you need/want to make.
+	"dockerComposeFile": [
+		"../docker-compose.yaml",
+		"docker-compose.yml"
+	],
+
+	// The 'service' property is the name of the service for the container that VS Code should
+	// use. Update this value and .devcontainer/docker-compose.yml to the real service name.
+	"service": "api",
+
+	// The optional 'workspaceFolder' property is the path VS Code should open by default when
+	// connected. This is typically a file mount in .devcontainer/docker-compose.yml
+	"workspaceFolder": "/workspace",
+
+	"features": {
+		"ghcr.io/devcontainers/features/go:1": {},
+		"ghcr.io/azutake/devcontainer-features/go-packages-install:0": {}
+	},
+
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// "forwardPorts": [],
+
+	// Uncomment the next line if you want start specific services in your Docker Compose config.
+	// "runServices": [],
+
+	// Uncomment the next line if you want to keep your containers running after VS Code shuts down.
+	// "shutdownAction": "none",
+
+	// Uncomment the next line to run commands after the container is created.
+	"postCreateCommand": "make prepare"
+
+	// Configure tool-specific properties.
+	// "customizations": {},
+
+	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
+	// "remoteUser": "devcontainer"
+}
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -0,0 +1,26 @@
+version: '3.6'
+services:
+  # Update this to the name of the service you want to work with in your docker-compose.yml file
+  api:
+    # Overrides the service's Dockerfile with the one in the .devcontainer folder.
+    # Note that the path of the Dockerfile and context is relative to the *primary*
+    # docker-compose.yml file (the first in the devcontainer.json "dockerComposeFile"
+    # array). The settings below assume your primary file is in the root of your project.
+    #
+    build:
+      context: .
+      dockerfile: .devcontainer/Dockerfile
+
+    volumes:
+      # Update this to wherever you want VS Code to mount the folder of your project
+      - .:/workspace:cached
+
+    # Uncomment the next four lines if you will use a ptrace-based debugger like C++, Go, and Rust.
+    # cap_add:
+    #   - SYS_PTRACE
+    # security_opt:
+    #   - seccomp:unconfined
+
+    # Overrides default command so things don't shut down after the process ends.
+    command: /bin/sh -c "while sleep 1000; do :; done"
+ 
--- a/.dockerignore
+++ b/.dockerignore
@@ -1 +1,2 @@
 models
+examples/chatbot-ui/models
--- a/.env
+++ b/.env
@@ -1,3 +1,5 @@
-THREADS=14
-CONTEXT_SIZE=700
+# THREADS=14
+# CONTEXT_SIZE=512
 MODELS_PATH=/models
+# DEBUG=true
+# BUILD_TYPE=generic
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -19,7 +19,7 @@ jobs:
      - name: Prepare
        id: prep
        run: |
-          DOCKER_IMAGE=quay.io/go-skynet/llama-cli
+          DOCKER_IMAGE=quay.io/go-skynet/local-ai
          VERSION=master
          SHORTREF=${GITHUB_SHA::8}

@@ -54,8 +54,8 @@ jobs:
        uses: docker/login-action@v2
        with:
          registry: quay.io
-          username: ${{ secrets.QUAY_USERNAME }}
-          password: ${{ secrets.QUAY_PASSWORD }}
+          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
      - name: Build
        if: github.event_name != 'pull_request'
        uses: docker/build-push-action@v4
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -0,0 +1,44 @@
+---
+name: 'tests'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+jobs:
+  ubuntu-latest:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v3
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential  # -y: the runner has no TTY to answer apt's confirmation prompt
+      - name: Test
+        run: |
+          make test
+
+  macOS-latest:
+    runs-on: macOS-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v3
+        with: 
+          submodules: true
+
+      - name: Dependencies
+        run: |
+          brew update
+          brew install sdl2
+      - name: Test
+        run: |
+          make test
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,14 @@
 # go-llama build artifacts
 go-llama
+go-gpt4all-j
+go-gpt2

-# llama-cli build binary
-llama-cli
+# LocalAI build binary
+LocalAI
+local-ai
+# prevent above rules from omitting the helm chart
+!charts/*

 # Ignore models
-models/*.bin
-models/ggml-*
+models/*
+test-models/
--- a/.goreleaser.yaml
+++ b/.goreleaser.yaml
@@ -1,5 +1,5 @@
 # Make sure to check the documentation at http://goreleaser.com
-project_name: llama-cli
+project_name: local-ai
 builds:
  - ldflags:
      - -w -s
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,16 +1,20 @@
 {
    "version": "0.2.0",
    "configurations": [
-    
-    {
-        "name": "Launch Go",
-        "type": "go",
-        "request": "launch",
-        "mode": "debug",
-        "program": "${workspaceFolder}/main.go",
-        "args": [
-            "api"
-        ]
-    }
+        {
+            "name": "Launch Go",
+            "type": "go",
+            "request": "launch",
+            "mode": "debug",
+            "program": "${workspaceFolder}/main.go",
+            "args": [
+                "api"
+            ],
+            "env": {
+                "C_INCLUDE_PATH": "/workspace/go-llama:/workspace/go-gpt4all-j:/workspace/go-gpt2",
+                "LIBRARY_PATH": "/workspace/go-llama:/workspace/go-gpt4all-j:/workspace/go-gpt2",
+                "DEBUG": "true"
+            }
+        }
    ]
-}
+}
--- a/21
+++ b/21
@@ -1,18 +1,9 @@
 ARG GO_VERSION=1.20
-ARG DEBIAN_VERSION=11
-FROM golang:$GO_VERSION as builder
+ARG BUILD_TYPE=
+FROM golang:$GO_VERSION
 WORKDIR /build
-ARG GO_LLAMA_CPP_TAG=llama.cpp-2f7c8e0
-RUN git clone -b $GO_LLAMA_CPP_TAG --recurse-submodules https://github.com/go-skynet/go-llama.cpp
-RUN cd go-llama.cpp && make libbinding.a
-COPY go.mod ./
-COPY go.sum ./
-RUN go mod download
-RUN apt-get update
+RUN apt-get update && apt-get install -y cmake
 COPY . .
-RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
-RUN C_INCLUDE_PATH=/build/go-llama.cpp LIBRARY_PATH=/build/go-llama.cpp go build -o llama-cli ./
-
-FROM debian:$DEBIAN_VERSION
-COPY --from=builder /build/llama-cli /usr/bin/llama-cli
-ENTRYPOINT [ "/usr/bin/llama-cli" ]
+RUN make prepare-sources
+EXPOSE 8080
+ENTRYPOINT [ "/build/entrypoint.sh" ]
--- a/Dockerfile.dev
+++ b/Dockerfile.dev
@@ -0,0 +1,14 @@
+ARG GO_VERSION=1.20
+ARG DEBIAN_VERSION=11
+ARG BUILD_TYPE=
+
+FROM golang:$GO_VERSION as builder
+WORKDIR /build
+RUN apt-get update && apt-get install -y cmake
+COPY . .
+RUN make build
+
+FROM debian:$DEBIAN_VERSION
+COPY --from=builder /build/local-ai /usr/bin/local-ai
+EXPOSE 8080
+ENTRYPOINT [ "/usr/bin/local-ai" ]
--- a/2
+++ b/2
@@ -2,4 +2,4 @@ VERSION 0.7

 build:
    FROM DOCKERFILE -f Dockerfile .
-    SAVE ARTIFACT /usr/bin/llama-cli AS LOCAL llama-cli
+    SAVE ARTIFACT /usr/bin/local-ai AS LOCAL local-ai
--- a/126
+++ b/126
@@ -1,8 +1,16 @@
 GOCMD=go
 GOTEST=$(GOCMD) test
 GOVET=$(GOCMD) vet
-BINARY_NAME=llama-cli
-GOLLAMA_VERSION?=llama.cpp-8b67998
+BINARY_NAME=local-ai
+# renovate: datasource=github-tags depName=go-skynet/go-llama.cpp
+GOLLAMA_VERSION?=llama.cpp-f4cef87
+# renovate: datasource=git-refs packageNameTemplate=https://github.com/go-skynet/go-gpt4all-j.cpp currentValueTemplate=master depNameTemplate=go-gpt4all-j.cpp
+GOGPT4ALLJ_VERSION?=1f7bff57f66cb7062e40d0ac3abd2217815e5109
+# renovate: datasource=git-refs packageNameTemplate=https://github.com/go-skynet/go-gpt2.cpp currentValueTemplate=master depNameTemplate=go-gpt2.cpp
+GOGPT2_VERSION?=245a5bfe6708ab80dc5c733dcdbfbe3cfd2acdaa
+
+RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
+RWKV_VERSION?=af62fcc432be2847acb6e0688b2c2491d6588d58

 GREEN  := $(shell tput -Txterm setaf 2)
 YELLOW := $(shell tput -Txterm setaf 3)
@@ -10,34 +18,118 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

+C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv
+LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2:$(shell pwd)/go-rwkv
+
+# Default BUILD_TYPE when the caller (environment or command line) does not set one
+ifndef BUILD_TYPE
+	BUILD_TYPE:=default
+endif
+# "generic" selects the generic-* library targets; quotes are stripped so both generic and "generic" match
+ifeq ($(subst ",,$(BUILD_TYPE)),generic)
+	GENERIC_PREFIX:=generic-
+else
+	GENERIC_PREFIX:=
+endif
+
 .PHONY: all test build vendor

 all: help

-## Build:
+## GPT4ALL-J
+go-gpt4all-j:
+	git clone --recurse-submodules https://github.com/go-skynet/go-gpt4all-j.cpp go-gpt4all-j
+	cd go-gpt4all-j && git checkout -b build $(GOGPT4ALLJ_VERSION) && git submodule update --init --recursive --depth 1
+	# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
+	@find ./go-gpt4all-j -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
+	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
+	@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
+	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
+	@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
+	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gptj_/g' {} +
+	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/void replace/void json_gptj_replace/g' {} +
+	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/::replace/::json_gptj_replace/g' {} +

-build: prepare ## Build the project
-	$(GOCMD) build -o $(BINARY_NAME) ./
+## RWKV
+go-rwkv:
+	git clone --recurse-submodules $(RWKV_REPO) go-rwkv
+	cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+
+go-rwkv/librwkv.a: go-rwkv
+	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a .. && cp ggml/src/libggml.a ..
+
+go-gpt4all-j/libgptj.a: go-gpt4all-j
+	$(MAKE) -C go-gpt4all-j $(GENERIC_PREFIX)libgptj.a
+
+## CEREBRAS GPT
+go-gpt2: 
+	git clone --recurse-submodules https://github.com/go-skynet/go-gpt2.cpp go-gpt2
+	cd go-gpt2 && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
+	# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
+	@find ./go-gpt2 -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
+	@find ./go-gpt2 -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
+	@find ./go-gpt2 -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
+	@find ./go-gpt2 -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gpt2_/g' {} +
+	@find ./go-gpt2 -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gpt2_/g' {} +
+	@find ./go-gpt2 -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gpt2_/g' {} +
+
+go-gpt2/libgpt2.a: go-gpt2
+	$(MAKE) -C go-gpt2 $(GENERIC_PREFIX)libgpt2.a

 go-llama:
 	git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama

-prepare: go-llama
-	$(MAKE) -C go-llama libbinding.a
+go-llama/libbinding.a: go-llama 
+	$(MAKE) -C go-llama $(GENERIC_PREFIX)libbinding.a
+
+replace:
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
-	
-clean: ## Remove build related file
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt2.cpp=$(shell pwd)/go-gpt2
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
+
+prepare-sources: go-llama go-gpt2 go-gpt4all-j go-rwkv
+	$(GOCMD) mod download
+
+## GENERIC
+rebuild: ## Rebuilds the project
 	$(MAKE) -C go-llama clean
+	$(MAKE) -C go-gpt4all-j clean
+	$(MAKE) -C go-gpt2 clean
+	$(MAKE) -C go-rwkv clean
+	$(MAKE) build
+
+prepare: prepare-sources go-llama/libbinding.a go-gpt4all-j/libgptj.a go-gpt2/libgpt2.a go-rwkv/librwkv.a replace ## Prepares for building
+
+clean: ## Remove build related file
 	rm -fr ./go-llama
-	rm -f $(BINARY_NAME)
+	rm -rf ./go-gpt4all-j
+	rm -rf ./go-gpt2
+	rm -rf ./go-rwkv
+	rm -rf $(BINARY_NAME)

-## Run:
-run: prepare
-	C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp LIBRARY_PATH=$(shell pwd)/go-llama.cpp $(GOCMD) run ./ api
+## Build:

-## Test:
-test: ## Run the tests of the project
-	$(GOTEST) -v -race ./... $(OUTPUT_OPTIONS)
+build: prepare ## Build the project
+	$(info ${GREEN}I local-ai build info:${RESET})
+	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
+	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) build -o $(BINARY_NAME) ./
+
+generic-build: ## Build the project using generic
+	BUILD_TYPE="generic" $(MAKE) build
+
+## Run
+run: prepare ## run local-ai
+	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) run ./main.go
+# Downloads the small test model and copies the test fixtures next to it; mkdir -p tolerates an existing directory
+test-models/testmodel:
+	mkdir -p test-models
+	wget https://huggingface.co/concedo/cerebras-111M-ggml/resolve/main/cerberas-111m-q4_0.bin -O test-models/testmodel
+	cp tests/fixtures/* test-models
+
+test: prepare test-models/testmodel
+	cp tests/fixtures/* test-models
+	@C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models $(GOCMD) test -v -timeout 30m ./...

 ## Help:
 help: ## Show this help.
@@ -49,4 +141,4 @@ help: ## Show this help.
 	@awk 'BEGIN {FS = ":.*?## "} { \
 		if (/^[a-zA-Z_-]+:.*?##.*$$/) {printf "    ${YELLOW}%-20s${GREEN}%s${RESET}\n", $$1, $$2} \
 		else if (/^## .*$$/) {printf "  ${CYAN}%s${RESET}\n", substr($$1,4)} \
-		}' $(MAKEFILE_LIST)
+		}' $(MAKEFILE_LIST)
--- a/README.md
+++ b/README.md
@@ -1,20 +1,84 @@
-## :camel: llama-cli
+<h1 align="center">
+  <br>
+  <img height="300" src="https://user-images.githubusercontent.com/2420543/233147843-88697415-6dbf-4368-a862-ab217f9f7342.jpeg"> <br>
+    LocalAI
+<br>
+</h1>

+[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml) [![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)

-llama-cli is a straightforward golang CLI interface and API compatible with OpenAI for [llama.cpp](https://github.com/ggerganov/llama.cpp), it supports multiple-models and also provides a simple command line interface that allows text generation using a GPT-based model like llama directly from the terminal. 
+[![](https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted)](https://discord.gg/uJAeKSAGDy) 

-It is compatible with the models supported by `llama.cpp`. You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
+**LocalAI** is a drop-in replacement REST API compatible with OpenAI for local CPU inferencing. It allows you to run models locally or on-prem with consumer-grade hardware. It is based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all), [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp) and [ggml](https://github.com/ggerganov/ggml), including support for GPT4ALL-J, which is licensed under Apache 2.0.

-`llama-cli` doesn't shell-out, it uses https://github.com/go-skynet/go-llama.cpp, which is a golang binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
+- OpenAI compatible API
+- Supports multiple-models
+- Once loaded the first time, it keeps models loaded in memory for faster inference
+- Support for prompt templates
+- Doesn't shell-out, but uses C bindings for a faster inference and better performance. 
+
+LocalAI is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome! It was initially created by [mudler](https://github.com/mudler/) at the [SpectroCloud OSS Office](https://github.com/spectrocloud).
+
+### News
+
+- 02-05-2023: Support for `rwkv.cpp` models ( https://github.com/go-skynet/LocalAI/pull/158 ) and for `/edits` endpoint
+- 01-05-2023: Support for SSE stream of tokens in `llama.cpp` backends ( https://github.com/go-skynet/LocalAI/pull/152 )
+
+### Socials and community chatter
+
+- Follow [@LocalAI_API](https://twitter.com/LocalAI_API) on twitter.
+
+- [Reddit post](https://www.reddit.com/r/selfhosted/comments/12w4p2f/localai_openai_compatible_api_to_run_llm_models/) about LocalAI.
+
+- [Hacker news post](https://news.ycombinator.com/item?id=35726934) - help us out by voting if you like this project.
+
+- [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65) - excellent usecase for localAI, using AI to analyse Kubernetes clusters.
+
+## Model compatibility
+
+It is compatible with the models supported by [llama.cpp](https://github.com/ggerganov/llama.cpp), and also supports [GPT4ALL-J](https://github.com/nomic-ai/gpt4all) and [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml).
+
+Tested with:
+- Vicuna
+- Alpaca
+- [GPT4ALL](https://github.com/nomic-ai/gpt4all)
+- [GPT4ALL-J](https://gpt4all.io/models/ggml-gpt4all-j.bin)
+- Koala
+- [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml)
+- [RWKV](https://github.com/BlinkDL/RWKV-LM) models with [rwkv.cpp](https://github.com/saharNooby/rwkv.cpp)
+
+It should also be compatible with StableLM and GPTNeoX ggml models (untested)
+
+Note: You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
+
+### RWKV
+
+<details>
+
+For `rwkv` models, you also need to place the associated tokenizer alongside the ggml model:
+
+```
+ls models
+36464540 -rw-r--r--  1 mudler mudler 1.2G May  3 10:51 rwkv_small
+36464543 -rw-r--r--  1 mudler mudler 2.4M May  3 10:51 rwkv_small.tokenizer.json
+```
+
+</details>

 ## Usage

-You can use `docker-compose`:
+> `LocalAI` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).
+
+The easiest way to run LocalAI is by using `docker-compose`:

 ```bash

-git clone https://github.com/go-skynet/llama-cli
-cd llama-cli
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>

 # copy your models to models/
 cp your-model.bin models/
@@ -23,27 +87,165 @@ cp your-model.bin models/
 # vim .env

 # start with docker-compose
-docker compose up -d --build
+docker-compose up -d --build

 # Now API is accessible at localhost:8080
 curl http://localhost:8080/v1/models
-
 # {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
+
 curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
     "model": "your-model.bin",            
     "prompt": "A long time ago in a galaxy far, far away",
     "temperature": 0.7
   }'
-
-
 ```

-Note: The API doesn't inject a default prompt for talking to the model, while the CLI does. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.
+### Example: Use GPT4ALL-J model

-You can use a default template for every model present in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibiling file, `foo.bin.tmpl` which will be used as a default prompt, for instance this can be used with alpaca:
+<details>
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Use a template from the examples
+cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
+
+# (optional) Edit the .env file to set things like context size and threads
+# vim .env
+
+# start with docker-compose
+docker-compose up -d --build
+
+# Now API is accessible at localhost:8080
+curl http://localhost:8080/v1/models
+# {"object":"list","data":[{"id":"ggml-gpt4all-j","object":"model"}]}
+
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-gpt4all-j",
+     "messages": [{"role": "user", "content": "How are you?"}],
+     "temperature": 0.9 
+   }'
+
+# {"model":"ggml-gpt4all-j","choices":[{"message":{"role":"assistant","content":"I'm doing well, thanks. How about you?"}}]}
+```
+</details>
+
+To build locally, run `make build` (see below).
+
+### Other examples
+
+![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)
+
+To see other examples of how to integrate with other projects, for instance chatbot-ui, see: [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/).
+
+
+### Advanced configuration
+
+LocalAI can be configured to serve user-defined models with a set of default parameters and templates.
+
+<details>
+
+You can either create multiple `yaml` files in the models path or specify a single YAML configuration file.
+Consider the following `models` folder in `examples/chatbot-ui`:

 ```
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
+base ❯ ls -liah examples/chatbot-ui/models 
+36487587 drwxr-xr-x 2 mudler mudler 4.0K May  3 12:27 .
+36487586 drwxr-xr-x 3 mudler mudler 4.0K May  3 10:42 ..
+36465214 -rw-r--r-- 1 mudler mudler   10 Apr 27 07:46 completion.tmpl
+36464855 -rw-r--r-- 1 mudler mudler 3.6G Apr 27 00:08 ggml-gpt4all-j
+36464537 -rw-r--r-- 1 mudler mudler  245 May  3 10:42 gpt-3.5-turbo.yaml
+36467388 -rw-r--r-- 1 mudler mudler  180 Apr 27 07:46 gpt4all.tmpl
+```
+
+The `gpt-3.5-turbo.yaml` file defines the `gpt-3.5-turbo` model, which is an alias for `gpt4all-j` with pre-defined options.
+
+For instance, consider the following that declares `gpt-3.5-turbo` backed by the `ggml-gpt4all-j` model:
+
+```yaml
+name: gpt-3.5-turbo
+# Default model parameters
+parameters:
+  # Relative to the models path
+  model: ggml-gpt4all-j
+  # temperature
+  temperature: 0.3
+  # all the OpenAI request options here..
+
+# Default context size
+context_size: 512
+threads: 10
+# Define a backend (optional). By default it will try to guess the backend the first time the model is interacted with.
+backend: gptj # available: llama, stablelm, gpt2, gptj, rwkv
+# stopwords (if supported by the backend)
+stopwords:
+- "HUMAN:"
+- "### Response:"
+# define chat roles
+roles:
+  user: "HUMAN:"
+  system: "GPT:"
+template:
+  # template file ".tmpl" with the prompt template to use by default on the endpoint call. Note there is no extension in the files
+  completion: completion
+  chat: ggml-gpt4all-j
+```
+
+Specifying a `config-file` via CLI allows you to declare models in a single file as a list, for instance:
+
+```yaml
+- name: list1
+  parameters:
+    model: testmodel
+  context_size: 512
+  threads: 10
+  stopwords:
+  - "HUMAN:"
+  - "### Response:"
+  roles:
+    user: "HUMAN:"
+    system: "GPT:"
+  template:
+    completion: completion
+    chat: ggml-gpt4all-j
+- name: list2
+  parameters:
+    model: testmodel
+  context_size: 512
+  threads: 10
+  stopwords:
+  - "HUMAN:"
+  - "### Response:"
+  roles:
+    user: "HUMAN:"
+    system: "GPT:"
+  template:
+    completion: completion
+    chat: ggml-gpt4all-j
+```
+
+See also [chatbot-ui](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) as an example on how to use config files.
+
+</details>
+
+### Prompt templates 
+
+The API doesn't inject a default prompt for talking to the model. You have to use a prompt similar to what's described in the stanford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.
+
+<details>
+You can use a default template for every model present in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl` which will be used as a default prompt and can be used with alpaca:
+
+```
+Below is an instruction that describes a task. Write a response that appropriately completes the request.

 ### Instruction:
 {{.Input}}
@@ -51,73 +253,62 @@ Below is an instruction that describes a task. Write a response that appropriate
 ### Response:
 ```

-See the [prompt-templates](https://github.com/go-skynet/llama-cli/tree/master/prompt-templates) directory in this repository for templates for most popular models.
+See the [prompt-templates](https://github.com/go-skynet/LocalAI/tree/master/prompt-templates) directory in this repository for templates for some of the most popular models.

-## Container images

-`llama-cli` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest)
+For the edit endpoint, an example template for alpaca-based models can be:

-To begin, run:
+```
+Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

-```
-docker run -ti --rm quay.io/go-skynet/llama-cli:latest  --instruction "What's an alpaca?" --topk 10000 --model ...
+### Instruction:
+{{.Instruction}}
+
+### Input:
+{{.Input}}
+
+### Response:
 ```

-Where `--model` is the path of the model you want to use. 
+</details>

-Note: you need to mount a volume to the docker container in order to load a model, for instance:
+### CLI
+
+You can control LocalAI with command line arguments, to specify a binding address, or the number of threads.
+
+<details>
+
+Usage:

 ```
-# assuming your model is in /path/to/your/models/foo.bin
-docker run -v /path/to/your/models:/models -ti --rm quay.io/go-skynet/llama-cli:latest  --instruction "What's an alpaca?" --topk 10000 --model /models/foo.bin
-```
-
-You will receive a response like the following:
-
-```
-An alpaca is a member of the South American Camelid family, which includes the llama, guanaco and vicuña. It is a domesticated species that originates from the Andes mountain range in South America. Alpacas are used in the textile industry for their fleece, which is much softer than wool. Alpacas are also used for meat, milk, and fiber.
-```
-
-## Basic usage
-
-To use llama-cli, specify a pre-trained GPT-based model, an input text, and an instruction for text generation. llama-cli takes the following arguments when running from the CLI:
-
-```
-llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--template <template_path>] [--tokens <num_tokens>] [--threads <num_threads>] [--temperature <temperature>] [--topp <top_p>] [--topk <top_k>]
+local-ai --models-path <model_path> [--address <address>] [--threads <num_threads>]
 ```

 | Parameter    | Environment Variable | Default Value | Description                            |
 | ------------ | -------------------- | ------------- | -------------------------------------- |
-| template     | TEMPLATE             |               | A file containing a template for output formatting (optional).  |
-| instruction  | INSTRUCTION          |               | Input prompt text or instruction. "-" for STDIN.   |
-| input        | INPUT                | -             | Path to text or "-" for STDIN.                    |
-| model        | MODEL           |               | The path to the pre-trained GPT-based model.      |
-| tokens       | TOKENS               | 128           | The maximum number of tokens to generate. |
-| threads      | THREADS              | NumCPU()      | The number of threads to use for text generation. |
-| temperature  | TEMPERATURE          | 0.95          | Sampling temperature for model output. ( values between `0.1` and `1.0` )  |
-| top_p        | TOP_P                | 0.85          | The cumulative probability for top-p sampling. |
-| top_k        | TOP_K                | 20            | The number of top-k tokens to consider for text generation.  |
+| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
+| threads      | THREADS              | Number of Physical cores     | The number of threads to use for text generation. |
+| address      | ADDRESS              | :8080         | The address and port to listen on. |
 | context-size | CONTEXT_SIZE         | 512           | Default token context size. |
+| debug | DEBUG         | false           | Enable debug mode. |
+| config-file | CONFIG_FILE         | empty           | Path to a LocalAI config file. |

-Here's an example of using `llama-cli`:
+</details>

-```
-llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
-```
+## Setup

-This will generate text based on the given model and instruction.
+Currently LocalAI comes as a container image and can be used with docker or a container engine of choice. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).

-## API
-
-`llama-cli` also provides an API for running text generation as a service. The models once loaded the first time will be kept in memory.
+### Docker

+<details>
 Example of starting the API with `docker`:

 ```bash
-docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:latest api --models-path /path/to/models --context-size 700 --threads 4
+docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:latest --models-path /path/to/models --context-size 700 --threads 4
 ```

-And you'll see:
+You should see:
 ```
 ┌───────────────────────────────────────────────────┐ 
 │                   Fiber v2.42.0                   │ 
@@ -129,33 +320,136 @@ And you'll see:
 └───────────────────────────────────────────────────┘ 
 ```

-Note: Models have to end up with `.bin`.
+</details>

-You can control the API server options with command line arguments:
+### Build locally
+
+<details>
+
+In order to build the `LocalAI` container image locally you can use `docker`:

 ```
-llama-cli api --models-path <model_path> [--address <address>] [--threads <num_threads>]
+# build the image
+docker build -t localai .
+docker run localai
 ```

-The API takes takes the following:
+Or you can build the binary with `make`:

-| Parameter    | Environment Variable | Default Value | Description                            |
-| ------------ | -------------------- | ------------- | -------------------------------------- |
-| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
-| threads      | THREADS              | CPU cores     | The number of threads to use for text generation. |
-| address      | ADDRESS              | :8080         | The address and port to listen on. |
-| context-size | CONTEXT_SIZE         | 512           | Default token context size. |
+```
+make build
+```

-Once the server is running, you can start making requests to it using HTTP, using the OpenAI API. 
+</details>

-### Supported OpenAI API endpoints
+### Build on mac
+
+Building on Mac (M1 or M2) works, but you may need to install some prerequisites using `brew`. 
+
+<details>
+
+The below has been tested by one mac user and found to work. Note that this doesn't use docker to run the server:
+
+```
+# install build dependencies
+brew install cmake
+brew install go
+
+# clone the repo
+git clone https://github.com/go-skynet/LocalAI.git
+
+cd LocalAI
+
+# build the binary
+make build
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Use a template from the examples
+cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
+
+# Run LocalAI
+./local-ai --models-path ./models/ --debug
+
+# Now API is accessible at localhost:8080
+curl http://localhost:8080/v1/models
+
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-gpt4all-j",
+     "messages": [{"role": "user", "content": "How are you?"}],
+     "temperature": 0.9 
+   }'
+```
+
+</details>
+
+### Windows compatibility
+
+It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/LocalAI/issues/2
+
+### Run LocalAI in Kubernetes
+
+LocalAI can be installed inside Kubernetes with helm.
+
+<details>
+
+1. Add the helm repo
+    ```bash
+    helm repo add go-skynet https://go-skynet.github.io/helm-charts/
+    ```
+1. Create a values file with your settings:
+```bash
+cat <<EOF > values.yaml
+deployment:
+  image: quay.io/go-skynet/local-ai:latest
+  env:
+    threads: 4
+    contextSize: 1024
+    modelsPath: "/models"
+# Optionally create a PVC, mount the PV to the LocalAI Deployment,
+# and download a model to prepopulate the models directory
+modelsVolume:
+  enabled: true
+  url: "https://gpt4all.io/models/ggml-gpt4all-j.bin"
+  pvc:
+    size: 6Gi
+    accessModes:
+    - ReadWriteOnce
+  auth:
+    # Optional value for HTTP basic access authentication header
+    basic: "" # 'username:password' base64 encoded
+service:
+  type: ClusterIP
+  annotations: {}
+  # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
+  # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
+EOF
+```
+3. Install the helm chart:
+```bash
+helm repo update
+helm install local-ai go-skynet/local-ai -f values.yaml
+```
+
+Check out also the [helm chart repository on GitHub](https://github.com/go-skynet/helm-charts).
+
+</details>
+
+## Supported OpenAI API endpoints

 You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create). 

-Following the list of endpoints/parameters supported.
+The following is the list of supported endpoints and parameters.

-#### Chat completions
+Note:

+- You can also specify the model as part of the OpenAI token.
+- If only one model is available, the API will use it for all the requests.
+
+### Chat completions
+
+<details>
 For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:

 ```
@@ -167,10 +461,32 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso
 ```

 Available additional parameters: `top_p`, `top_k`, `max_tokens`
+</details>

-#### Completions
+### Edit completions
+
+<details>
+To generate an edit completion you can send a POST request to the `/v1/edits` endpoint with the instruction as the request body:
+
+```
+curl http://localhost:8080/v1/edits -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "instruction": "rephrase",
+     "input": "Black cat jumped out of the window",
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`.
+
+</details>
+
+### Completions
+
+<details>
+
+To generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as per the request body:

-For example, to generate a comletion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
 ```
 curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
     "model": "ggml-koala-7b-model-q4_0-r2.bin",
@@ -181,76 +497,122 @@ curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d

 Available additional parameters: `top_p`, `top_k`, `max_tokens`

-#### List models
+</details>

+### List models
+
+<details>
 You can list all the models available with:

 ```
 curl http://localhost:8080/v1/models
 ```

-## Using other models
+</details>

-gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted (same applies for old alpaca models, too):
+## Frequently asked questions

-```bash
-wget -O tokenizer.model https://huggingface.co/decapoda-research/llama-30b-hf/resolve/main/tokenizer.model
-mkdir models
-cp gpt4all.. models/
-git clone https://gist.github.com/eiz/828bddec6162a023114ce19146cb2b82
-pip install sentencepiece
-python 828bddec6162a023114ce19146cb2b82/gistfile1.txt models tokenizer.model
-# There will be a new model with the ".tmp" extension, you have to use that one!
-```
+Here are answers to some of the most common questions.

-### Windows compatibility

-It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/llama-cli/issues/2
+### How do I get models? 

-### Kubernetes
+<details>

-You can run the API directly in Kubernetes:
+Most ggml-based models should work, but newer models may require additions to the API. If a model doesn't work, please feel free to open up issues. However, be cautious about downloading models from the internet and directly onto your machine, as there may be security vulnerabilities in llama.cpp or ggml that could be maliciously exploited. Some models can be found on Hugging Face: https://huggingface.co/models?search=ggml, or models from gpt4all should also work: https://github.com/nomic-ai/gpt4all.

-```bash
-kubectl apply -f https://raw.githubusercontent.com/go-skynet/llama-cli/master/kubernetes/deployment.yaml
-```
+</details>

-### Build locally
+### What's the difference with Serge, or XXX?

-Pre-built images might fit well for most of the modern hardware, however you can and might need to build the images manually.

-In order to build the `llama-cli` container image locally you can use `docker`:
+<details>

-```
-# build the image as "alpaca-image"
-docker build -t llama-cli .
-docker run llama-cli --instruction "What's an alpaca?"
-```
+LocalAI is a multi-model solution that doesn't focus on a specific model type (e.g., llama.cpp or alpaca.cpp). It handles all of these internally for faster inference, and it is easy to set up locally and to deploy to Kubernetes.

-Or build the binary with:
+</details>

-```
-# build the image as "alpaca-image"
-docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v "$(pwd)":/workspace -v earthly-tmp:/tmp/earthly:rw earthly/earthly:v0.7.2 +build
-# run the binary
-./llama-cli --instruction "What's an alpaca?"
-```
+
+### Can I use it with a Discord bot, or XXX?
+
+<details>
+
+Yes! If the client uses OpenAI and supports setting a different base URL to send requests to, you can use the LocalAI endpoint. This allows you to use it with any application that was designed to work with OpenAI, without changing the application!
+
+</details>
+
+
+### Can this leverage GPUs? 
+
+<details>
+
+Not currently, as ggml doesn't support GPUs yet: https://github.com/ggerganov/llama.cpp/discussions/915.
+
+</details>
+
+### Where is the webUI? 
+
+<details> 
+We are working on providing a good out-of-the-box experience. However, since LocalAI is an API, you can already plug it into existing projects that provide UI interfaces to OpenAI's APIs. There are several already on GitHub, and they should be compatible with LocalAI already (as it mimics the OpenAI API).
+
+</details>
+
+### Does it work with AutoGPT? 
+
+<details>
+
+AutoGPT currently doesn't allow setting a different API URL, but there is a PR open for it, so this should be possible soon!
+
+</details>
+
+## Projects already using LocalAI to run local models
+
+Feel free to open up a PR to get your project listed!
+
+- [Kairos](https://github.com/kairos-io/kairos)
+- [k8sgpt](https://github.com/k8sgpt-ai/k8sgpt#running-local-models)
+
+## Blog posts and other articles
+
+- https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65
+- https://kairos.io/docs/examples/localai/

 ## Short-term roadmap

- [x] Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
- Binary releases (https://github.com/go-skynet/llama-cli/issues/6)
- Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
+- [x] Mimic OpenAI API (https://github.com/go-skynet/LocalAI/issues/10)
+- [ ] Binary releases (https://github.com/go-skynet/LocalAI/issues/6)
+- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351) and [gpt4all](https://github.com/go-skynet/LocalAI/issues/85)
 - [x] Multi-model support
- Have a webUI!
+- [x] Have a webUI!
+- [x] Allow configuration of defaults for models.
+- [ ] Enable automatic downloading of models from a curated gallery, with only free-licensed models, directly from the webui.
+
+## Star history
+
+[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)

 ## License

+LocalAI is a community-driven project. It was initially created by [mudler](https://github.com/mudler/) at the [SpectroCloud OSS Office](https://github.com/spectrocloud).
+
 MIT

+## Golang bindings used
+
+- [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- [go-skynet/go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp)
+- [go-skynet/go-gpt2.cpp](https://github.com/go-skynet/go-gpt2.cpp)
+- [donomii/go-rwkv.cpp](https://github.com/donomii/go-rwkv.cpp)
+
 ## Acknowledgements

 - [llama.cpp](https://github.com/ggerganov/llama.cpp)
 - https://github.com/tatsu-lab/stanford_alpaca
 - https://github.com/cornelk/llama-go for the initial ideas
 - https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!)
+
+## Contributors
+
+<a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
+</a>
--- a/api/api.go
+++ b/api/api.go
@@ -1,234 +1,77 @@
 package api

 import (
-	"fmt"
-	"strings"
-	"sync"
+	"errors"

-	model "github.com/go-skynet/llama-cli/pkg/model"
-
-	llama "github.com/go-skynet/go-llama.cpp"
+	model "github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/cors"
 	"github.com/gofiber/fiber/v2/middleware/recover"
+	"github.com/rs/zerolog"
+	"github.com/rs/zerolog/log"
 )

-type OpenAIResponse struct {
-	Created int      `json:"created,omitempty"`
-	Object  string   `json:"chat.completion,omitempty"`
-	ID      string   `json:"id,omitempty"`
-	Model   string   `json:"model,omitempty"`
-	Choices []Choice `json:"choices,omitempty"`
-}
-
-type Choice struct {
-	Index        int      `json:"index,omitempty"`
-	FinishReason string   `json:"finish_reason,omitempty"`
-	Message      *Message `json:"message,omitempty"`
-	Text         string   `json:"text,omitempty"`
-}
-
-type Message struct {
-	Role    string `json:"role,omitempty"`
-	Content string `json:"content,omitempty"`
-}
-
-type OpenAIModel struct {
-	ID     string `json:"id"`
-	Object string `json:"object"`
-}
-
-type OpenAIRequest struct {
-	Model string `json:"model"`
-
-	// Prompt is read only by completion API calls
-	Prompt string `json:"prompt"`
-
-	// Messages is read only by chat/completion API calls
-	Messages []Message `json:"messages"`
-
-	Echo bool `json:"echo"`
-	// Common options between all the API calls
-	TopP        float64 `json:"top_p"`
-	TopK        int     `json:"top_k"`
-	Temperature float64 `json:"temperature"`
-	Maxtokens   int     `json:"max_tokens"`
-
-	N int `json:"n"`
-
-	// Custom parameters - not present in the OpenAI API
-	Batch     int  `json:"batch"`
-	F16       bool `json:"f16kv"`
-	IgnoreEOS bool `json:"ignore_eos"`
-}
-
-// https://platform.openai.com/docs/api-reference/completions
-func openAIEndpoint(chat bool, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		var err error
-		var model *llama.LLama
-
-		input := new(OpenAIRequest)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		if input.Model == "" {
-			return fmt.Errorf("no model specified")
-		} else {
-			model, err = loader.LoadModel(input.Model)
-			if err != nil {
-				return err
-			}
-		}
-
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		if input.Model != "" {
-			mutexMap.Lock()
-			l, ok := mutexes[input.Model]
-			if !ok {
-				m := &sync.Mutex{}
-				mutexes[input.Model] = m
-				l = m
-			}
-			mutexMap.Unlock()
-			l.Lock()
-			defer l.Unlock()
-		} else {
-			defaultMutex.Lock()
-			defer defaultMutex.Unlock()
-		}
-
-		// Set the parameters for the language model prediction
-		topP := input.TopP
-		if topP == 0 {
-			topP = 0.7
-		}
-		topK := input.TopK
-		if topK == 0 {
-			topK = 80
-		}
-
-		temperature := input.Temperature
-		if temperature == 0 {
-			temperature = 0.9
-		}
-
-		tokens := input.Maxtokens
-		if tokens == 0 {
-			tokens = 512
-		}
-
-		predInput := input.Prompt
-		if chat {
-			mess := []string{}
-			for _, i := range input.Messages {
-				mess = append(mess, i.Content)
-			}
-
-			predInput = strings.Join(mess, "\n")
-		}
-
-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		templatedInput, err := loader.TemplatePrefix(input.Model, struct {
-			Input string
-		}{Input: predInput})
-		if err == nil {
-			predInput = templatedInput
-		}
-
-		result := []Choice{}
-
-		n := input.N
-
-		if input.N == 0 {
-			n = 1
-		}
-
-		for i := 0; i < n; i++ {
-			// Generate the prediction using the language model
-			predictOptions := []llama.PredictOption{
-				llama.SetTemperature(temperature),
-				llama.SetTopP(topP),
-				llama.SetTopK(topK),
-				llama.SetTokens(tokens),
-				llama.SetThreads(threads),
-			}
-
-			if input.Batch != 0 {
-				predictOptions = append(predictOptions, llama.SetBatch(input.Batch))
-			}
-
-			if input.F16 {
-				predictOptions = append(predictOptions, llama.EnableF16KV)
-			}
-
-			if input.IgnoreEOS {
-				predictOptions = append(predictOptions, llama.IgnoreEOS)
-			}
-
-			prediction, err := model.Predict(
-				predInput,
-				predictOptions...,
-			)
-			if err != nil {
-				return err
-			}
-
-			if input.Echo {
-				prediction = predInput + prediction
-			}
-			if chat {
-				result = append(result, Choice{Message: &Message{Role: "assistant", Content: prediction}})
-			} else {
-				result = append(result, Choice{Text: prediction})
-			}
-		}
-
-		// Return the prediction in the response body
-		return c.JSON(OpenAIResponse{
-			Model:   input.Model,
-			Choices: result,
-		})
+func App(configFile string, loader *model.ModelLoader, threads, ctxSize int, f16 bool, debug, disableMessage bool) *fiber.App {
+	zerolog.SetGlobalLevel(zerolog.InfoLevel)
+	if debug {
+		zerolog.SetGlobalLevel(zerolog.DebugLevel)
 	}
-}

-func Start(loader *model.ModelLoader, listenAddr string, threads int) error {
-	app := fiber.New()
+	// Return errors as JSON responses
+	app := fiber.New(fiber.Config{
+		DisableStartupMessage: disableMessage,
+		// Override default error handler
+		ErrorHandler: func(ctx *fiber.Ctx, err error) error {
+			// Status code defaults to 500
+			code := fiber.StatusInternalServerError

+			// Retrieve the custom status code if it's a *fiber.Error
+			var e *fiber.Error
+			if errors.As(err, &e) {
+				code = e.Code
+			}
+
+			// Send the error back as a JSON ErrorResponse body
+			return ctx.Status(code).JSON(
+				ErrorResponse{
+					Error: &APIError{Message: err.Error(), Code: code},
+				},
+			)
+		},
+	})
+
+	cm := make(ConfigMerger)
+	if err := cm.LoadConfigs(loader.ModelPath); err != nil {
+		log.Error().Msgf("error loading config files: %s", err.Error())
+	}
+
+	if configFile != "" {
+		if err := cm.LoadConfigFile(configFile); err != nil {
+			log.Error().Msgf("error loading config file: %s", err.Error())
+		}
+	}
+
+	if debug {
+		for k, v := range cm {
+			log.Debug().Msgf("Model: %s (config: %+v)", k, v)
+		}
+	}
 	// Default middleware config
 	app.Use(recover.New())
 	app.Use(cors.New())

-	// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-	var mutex = &sync.Mutex{}
-	mu := map[string]*sync.Mutex{}
-	var mumutex = &sync.Mutex{}
-
 	// openAI compatible API endpoint
-	app.Post("/v1/chat/completions", openAIEndpoint(true, loader, threads, mutex, mumutex, mu))
-	app.Post("/v1/completions", openAIEndpoint(false, loader, threads, mutex, mumutex, mu))
-	app.Get("/v1/models", func(c *fiber.Ctx) error {
-		models, err := loader.ListModels()
-		if err != nil {
-			return err
-		}
+	app.Post("/v1/chat/completions", chatEndpoint(cm, debug, loader, threads, ctxSize, f16))
+	app.Post("/chat/completions", chatEndpoint(cm, debug, loader, threads, ctxSize, f16))

-		dataModels := []OpenAIModel{}
-		for _, m := range models {
-			dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
-		}
-		return c.JSON(struct {
-			Object string        `json:"object"`
-			Data   []OpenAIModel `json:"data"`
-		}{
-			Object: "list",
-			Data:   dataModels,
-		})
-	})
+	app.Post("/v1/edits", editEndpoint(cm, debug, loader, threads, ctxSize, f16))
+	app.Post("/edits", editEndpoint(cm, debug, loader, threads, ctxSize, f16))

-	// Start the server
-	app.Listen(listenAddr)
-	return nil
+	app.Post("/v1/completions", completionEndpoint(cm, debug, loader, threads, ctxSize, f16))
+	app.Post("/completions", completionEndpoint(cm, debug, loader, threads, ctxSize, f16))
+
+	app.Get("/v1/models", listModels(loader, cm))
+	app.Get("/models", listModels(loader, cm))
+
+	return app
 }
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -0,0 +1,138 @@
+package api_test
+
+import (
+	"context"
+	"os"
+
+	. "github.com/go-skynet/LocalAI/api"
+	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/gofiber/fiber/v2"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	openaigo "github.com/otiai10/openaigo"
+	"github.com/sashabaranov/go-openai"
+)
+
+var _ = Describe("API test", func() {
+
+	var app *fiber.App
+	var modelLoader *model.ModelLoader
+	var client *openai.Client
+	var client2 *openaigo.Client
+	Context("API query", func() {
+		BeforeEach(func() {
+			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
+			app = App("", modelLoader, 1, 512, false, true, true)
+			go app.Listen("127.0.0.1:9090")
+
+			defaultConfig := openai.DefaultConfig("")
+			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
+
+			client2 = openaigo.NewClient("")
+			client2.BaseURL = defaultConfig.BaseURL
+
+			// Wait for API to be ready
+			client = openai.NewClientWithConfig(defaultConfig)
+			Eventually(func() error {
+				_, err := client.ListModels(context.TODO())
+				return err
+			}, "2m").ShouldNot(HaveOccurred())
+		})
+		AfterEach(func() {
+			app.Shutdown()
+		})
+		It("returns the models list", func() {
+			models, err := client.ListModels(context.TODO())
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(models.Models)).To(Equal(3))
+			Expect(models.Models[0].ID).To(Equal("testmodel"))
+		})
+		It("can generate completions", func() {
+			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: "abcdedfghikl"})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
+		})
+
+		It("can generate chat completions ", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
+		})
+
+		It("can generate completions from model configs", func() {
+			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "gpt4all", Prompt: "abcdedfghikl"})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
+		})
+
+		It("can generate chat completions from model configs", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
+		})
+
+		It("returns errors", func() {
+			_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
+			Expect(err).To(HaveOccurred())
+			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 5 errors occurred:"))
+		})
+
+	})
+
+	Context("Config file", func() {
+		BeforeEach(func() {
+			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
+			app = App(os.Getenv("CONFIG_FILE"), modelLoader, 1, 512, false, true, true)
+			go app.Listen("127.0.0.1:9090")
+
+			defaultConfig := openai.DefaultConfig("")
+			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
+			client2 = openaigo.NewClient("")
+			client2.BaseURL = defaultConfig.BaseURL
+			// Wait for API to be ready
+			client = openai.NewClientWithConfig(defaultConfig)
+			Eventually(func() error {
+				_, err := client.ListModels(context.TODO())
+				return err
+			}, "2m").ShouldNot(HaveOccurred())
+		})
+		AfterEach(func() {
+			app.Shutdown()
+		})
+		It("returns the models list from config file", func() {
+
+			models, err := client.ListModels(context.TODO())
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(models.Models)).To(Equal(5))
+			Expect(models.Models[0].ID).To(Equal("testmodel"))
+		})
+		It("can generate chat completions from config file", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
+		})
+		It("can generate chat completions from config file", func() {
+			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
+		})
+		It("can generate edit completions from config file", func() {
+			request := openaigo.EditCreateRequestBody{
+				Model:       "list2",
+				Instruction: "foo",
+				Input:       "bar",
+			}
+			resp, err := client2.CreateEdit(context.Background(), request)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
+		})
+	})
+})
--- a/api/apt_suite_test.go
+++ b/api/apt_suite_test.go
@@ -0,0 +1,13 @@
+package api_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestLocalAI(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "LocalAI test suite")
+}
--- a/api/config.go
+++ b/api/config.go
@@ -0,0 +1,102 @@
+package api
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"gopkg.in/yaml.v3"
+)
+
+type Config struct {
+	OpenAIRequest  `yaml:"parameters"`
+	Name           string            `yaml:"name"`
+	StopWords      []string          `yaml:"stopwords"`
+	Cutstrings     []string          `yaml:"cutstrings"`
+	TrimSpace      []string          `yaml:"trimspace"`
+	ContextSize    int               `yaml:"context_size"`
+	F16            bool              `yaml:"f16"`
+	Threads        int               `yaml:"threads"`
+	Debug          bool              `yaml:"debug"`
+	Roles          map[string]string `yaml:"roles"`
+	Backend        string            `yaml:"backend"`
+	TemplateConfig TemplateConfig    `yaml:"template"`
+}
+
+type TemplateConfig struct {
+	Completion string `yaml:"completion"`
+	Chat       string `yaml:"chat"`
+	Edit       string `yaml:"edit"`
+}
+
+type ConfigMerger map[string]Config
+
+func ReadConfigFile(file string) ([]*Config, error) {
+	c := &[]*Config{}
+	f, err := os.ReadFile(file)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read config file: %w", err)
+	}
+	if err := yaml.Unmarshal(f, c); err != nil {
+		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
+	}
+
+	return *c, nil
+}
+
+func ReadConfig(file string) (*Config, error) {
+	c := &Config{}
+	f, err := os.ReadFile(file)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read config file: %w", err)
+	}
+	if err := yaml.Unmarshal(f, c); err != nil {
+		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
+	}
+
+	return c, nil
+}
+
+func (cm ConfigMerger) LoadConfigFile(file string) error {
+	c, err := ReadConfigFile(file)
+	if err != nil {
+		return fmt.Errorf("cannot load config file: %w", err)
+	}
+
+	for _, cc := range c {
+		cm[cc.Name] = *cc
+	}
+	return nil
+}
+
+func (cm ConfigMerger) LoadConfig(file string) error {
+	c, err := ReadConfig(file)
+	if err != nil {
+		return fmt.Errorf("cannot read config file: %w", err)
+	}
+
+	cm[c.Name] = *c
+	return nil
+}
+
+func (cm ConfigMerger) LoadConfigs(path string) error {
+	files, err := ioutil.ReadDir(path)
+	if err != nil {
+		return err
+	}
+
+	for _, file := range files {
+		// Only load files ending in .yaml; templates, model binaries and .keep files are skipped
+		if !strings.HasSuffix(file.Name(), ".yaml") {
+			continue
+		}
+		c, err := ReadConfig(filepath.Join(path, file.Name()))
+		if err == nil { // best effort: files that cannot be read or parsed are skipped silently
+			cm[c.Name] = *c
+		}
+	}
+
+	return nil
+}
--- a/api/openai.go
+++ b/api/openai.go
@@ -0,0 +1,479 @@
+package api
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+	"github.com/valyala/fasthttp"
+)
+
+// APIError provides error information returned by the OpenAI API.
+type APIError struct {
+	// Code is left out of the JSON output when it is nil.
+	Code    any     `json:"code,omitempty"`
+	// Message is always present in the JSON output.
+	Message string  `json:"message"`
+	// Param is left out of the JSON output when it is nil.
+	Param   *string `json:"param,omitempty"`
+	// Type is always present in the JSON output.
+	Type    string  `json:"type"`
+}
+
+// ErrorResponse is the JSON envelope that carries an APIError under the
+// "error" key; the key is omitted when Error is nil.
+type ErrorResponse struct {
+	Error *APIError `json:"error,omitempty"`
+}
+
+// OpenAIResponse is the JSON body returned by the completion, chat and edit
+// endpoints. Every field is omitted from the output when it is empty.
+type OpenAIResponse struct {
+	Created int      `json:"created,omitempty"`
+	// Object names the kind of response, e.g. "text_completion",
+	// "chat.completion", "chat.completion.chunk" or "edit".
+	Object  string   `json:"object,omitempty"`
+	ID      string   `json:"id,omitempty"`
+	// Model echoes the model name sent by the client.
+	Model   string   `json:"model,omitempty"`
+	// Choices holds the generated results.
+	Choices []Choice `json:"choices,omitempty"`
+}
+
+// Choice is one generated result inside an OpenAIResponse.
+// Every field is omitted from the JSON output when it is empty.
+type Choice struct {
+	Index        int      `json:"index,omitempty"`
+	// FinishReason is set to "stop" on the final chunk of a chat stream.
+	FinishReason string   `json:"finish_reason,omitempty"`
+	// Message carries the reply of a non-streamed chat request.
+	Message      *Message `json:"message,omitempty"`
+	// Delta carries one streamed piece of a chat reply.
+	Delta        *Message `json:"delta,omitempty"`
+	// Text carries the output of completion and edit requests.
+	Text         string   `json:"text,omitempty"`
+}
+
+// Message is a single chat message: who said it (Role) and what was said
+// (Content). It is used both in requests and in responses.
+type Message struct {
+	Role    string `json:"role,omitempty" yaml:"role"`
+	Content string `json:"content,omitempty" yaml:"content"`
+}
+
+// OpenAIModel is one entry of the model listing returned by listModels.
+type OpenAIModel struct {
+	// ID is the model name.
+	ID     string `json:"id"`
+	// Object is set to "model" by listModels.
+	Object string `json:"object"`
+}
+
+// OpenAIRequest is the request body shared by the completion, chat and edit
+// endpoints. Numeric fields left at zero and boolean fields left false are
+// treated as "not provided" by updateConfig.
+type OpenAIRequest struct {
+	// Model is the name of the model to use.
+	Model string `json:"model" yaml:"model"`
+
+	// Prompt is read only by completion API calls
+	// It may be a single string or a list of strings.
+	Prompt interface{} `json:"prompt" yaml:"prompt"`
+
+	// Edit endpoint
+	Instruction string `json:"instruction" yaml:"instruction"`
+	Input       string `json:"input" yaml:"input"`
+
+	// Stop may be a single string or a list of strings; each one is added to
+	// the stop words of the configuration.
+	Stop interface{} `json:"stop" yaml:"stop"`
+
+	// Messages is read only by chat/completion API calls
+	Messages []Message `json:"messages" yaml:"messages"`
+
+	// Stream requests a server-sent event stream from the chat endpoint.
+	Stream bool `json:"stream"`
+	// Echo prepends the input to the prediction.
+	Echo   bool `json:"echo"`
+	// Common options between all the API calls
+	TopP        float64 `json:"top_p" yaml:"top_p"`
+	TopK        int     `json:"top_k" yaml:"top_k"`
+	Temperature float64 `json:"temperature" yaml:"temperature"`
+	Maxtokens   int     `json:"max_tokens" yaml:"max_tokens"`
+
+	// N is the number of predictions to compute; zero means one.
+	N int `json:"n"`
+
+	// Custom parameters - not present in the OpenAI API
+	Batch         int     `json:"batch" yaml:"batch"`
+	F16           bool    `json:"f16" yaml:"f16"`
+	IgnoreEOS     bool    `json:"ignore_eos" yaml:"ignore_eos"`
+	RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
+	Keep          int     `json:"n_keep" yaml:"n_keep"`
+
+	Seed int `json:"seed" yaml:"seed"`
+}
+
+// defaultRequest returns the sampling parameters used for a model that has
+// no configuration of its own, with Model set to modelFile.
+func defaultRequest(modelFile string) OpenAIRequest {
+	defaults := OpenAIRequest{Model: modelFile}
+	defaults.Temperature = 0.9
+	defaults.TopP = 0.7
+	defaults.TopK = 80
+	defaults.Maxtokens = 512
+	return defaults
+}
+
+func updateConfig(config *Config, input *OpenAIRequest) {
+	if input.Echo {
+		config.Echo = input.Echo
+	}
+	if input.TopK != 0 {
+		config.TopK = input.TopK
+	}
+	if input.TopP != 0 {
+		config.TopP = input.TopP
+	}
+
+	if input.Temperature != 0 {
+		config.Temperature = input.Temperature
+	}
+
+	if input.Maxtokens != 0 {
+		config.Maxtokens = input.Maxtokens
+	}
+
+	switch stop := input.Stop.(type) {
+	case string:
+		if stop != "" {
+			config.StopWords = append(config.StopWords, stop)
+		}
+	case []interface{}:
+		for _, pp := range stop {
+			if s, ok := pp.(string); ok {
+				config.StopWords = append(config.StopWords, s)
+			}
+		}
+	}
+
+	if input.RepeatPenalty != 0 {
+		config.RepeatPenalty = input.RepeatPenalty
+	}
+
+	if input.Keep != 0 {
+		config.Keep = input.Keep
+	}
+
+	if input.Batch != 0 {
+		config.Batch = input.Batch
+	}
+
+	if input.F16 {
+		config.F16 = input.F16
+	}
+
+	if input.IgnoreEOS {
+		config.IgnoreEOS = input.IgnoreEOS
+	}
+
+	if input.Seed != 0 {
+		config.Seed = input.Seed
+	}
+}
+
+// readConfig parses the request body and resolves the configuration to use
+// for it.
+//
+// The model is chosen in this order: the bearer token of the Authorization
+// header when it names a model present in the model path, then the "model"
+// field of the request, then the first model returned by the loader. When a
+// "<model>.yaml" file exists in the model path it is (re)loaded into cm.
+// The resulting configuration is the cm entry for the model, or defaults when
+// there is none, overlaid with the request parameters and finally with the
+// non-zero threads/ctx values and the f16/debug flags passed in.
+//
+// It returns the configuration, the parsed request, and an error when the
+// body cannot be parsed, no model can be determined, or the model config
+// file fails to load.
+func readConfig(cm ConfigMerger, c *fiber.Ctx, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*Config, *OpenAIRequest, error) {
+	input := new(OpenAIRequest)
+	// Get input data from the request body
+	if err := c.BodyParser(input); err != nil {
+		return nil, nil, err
+	}
+
+	modelFile := input.Model
+	received, _ := json.Marshal(input)
+
+	log.Debug().Msgf("Request received: %s", string(received))
+
+	// Set model from bearer token, if available.
+	// TrimPrefix removes the literal "Bearer " scheme only. TrimLeft would
+	// treat its argument as a set of characters and also strip leading
+	// 'B', 'e', 'a', 'r' and ' ' characters from the token itself.
+	bearer := strings.TrimPrefix(c.Get("authorization"), "Bearer ")
+	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
+
+	// If no model was specified, take the first available
+	if modelFile == "" && !bearerExists {
+		models, _ := loader.ListModels()
+		if len(models) > 0 {
+			modelFile = models[0]
+			log.Debug().Msgf("No model specified, using: %s", modelFile)
+		} else {
+			log.Debug().Msgf("No model specified, returning error")
+			return nil, nil, fmt.Errorf("no model specified")
+		}
+	}
+
+	// If a model is found in bearer token takes precedence
+	if bearerExists {
+		log.Debug().Msgf("Using model from bearer token: %s", bearer)
+		modelFile = bearer
+	}
+
+	// Load a config file if present after the model name
+	modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
+	if _, err := os.Stat(modelConfig); err == nil {
+		if err := cm.LoadConfig(modelConfig); err != nil {
+			return nil, nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
+		}
+	}
+
+	var config *Config
+	cfg, exists := cm[modelFile]
+	if !exists {
+		config = &Config{
+			OpenAIRequest: defaultRequest(modelFile),
+		}
+	} else {
+		// cfg is a copy of the map entry, so per-request changes below do not
+		// replace the stored configuration.
+		config = &cfg
+	}
+
+	// Set the parameters for the language model prediction
+	updateConfig(config, input)
+
+	// Values passed in by the caller override both config file and request.
+	if threads != 0 {
+		config.Threads = threads
+	}
+	if ctx != 0 {
+		config.ContextSize = ctx
+	}
+	if f16 {
+		config.F16 = true
+	}
+
+	if debug {
+		config.Debug = true
+	}
+
+	return config, input, nil
+}
+
+// completionEndpoint returns the handler for text completion requests.
+// Each prompt in the request is run through the completion template (when
+// one is found) and predicted separately; all resulting choices are returned
+// in a single "text_completion" response.
+// https://platform.openai.com/docs/api-reference/completions
+func completionEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, threads, ctx int, f16 bool) func(c *fiber.Ctx) error {
+	// Completion results are reported through the Text field of a choice.
+	collect := func(text string, choices *[]Choice) {
+		*choices = append(*choices, Choice{Text: text})
+	}
+
+	return func(c *fiber.Ctx) error {
+		config, input, err := readConfig(cm, c, loader, debug, threads, ctx, f16)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		log.Debug().Msgf("Parameter Config: %+v", config)
+
+		// The prompt is either one string or a list; non-string items are ignored.
+		prompts := []string{}
+		if single, isString := input.Prompt.(string); isString {
+			prompts = append(prompts, single)
+		} else if list, isList := input.Prompt.([]interface{}); isList {
+			for _, item := range list {
+				if text, ok := item.(string); ok {
+					prompts = append(prompts, text)
+				}
+			}
+		}
+
+		// An explicitly configured template wins over the model name.
+		templateFile := config.TemplateConfig.Completion
+		if templateFile == "" {
+			templateFile = config.Model
+		}
+
+		var result []Choice
+		for _, prompt := range prompts {
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templated, tmplErr := loader.TemplatePrefix(templateFile, struct {
+				Input string
+			}{Input: prompt})
+			if tmplErr == nil {
+				prompt = templated
+				log.Debug().Msgf("Template found, input modified to: %s", prompt)
+			}
+
+			choices, predErr := ComputeChoices(prompt, input, config, loader, collect, nil)
+			if predErr != nil {
+				return predErr
+			}
+
+			result = append(result, choices...)
+		}
+
+		resp := &OpenAIResponse{
+			Object:  "text_completion",
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Choices: result,
+		}
+
+		jsonResult, _ := json.Marshal(resp)
+		log.Debug().Msgf("Response: %s", jsonResult)
+
+		// Return the prediction in the response body
+		return c.JSON(resp)
+	}
+}
+
+// chatEndpoint returns the handler for chat completion requests.
+//
+// The messages of the request are flattened into a single prompt, one line
+// per message in the form "<role> <content>", where the role is replaced by
+// its mapping in the configured roles when one exists. The prompt is then run
+// through the chat template when one is found.
+//
+// With "stream" set, the reply is sent as server-sent events: one
+// "chat.completion.chunk" per token reported by the backend, followed by a
+// final chunk whose finish reason is "stop". Otherwise a single
+// "chat.completion" response is returned.
+func chatEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, threads, ctx int, f16 bool) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		config, input, err := readConfig(cm, c, loader, debug, threads, ctx, f16)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		log.Debug().Msgf("Parameter Config: %+v", config)
+
+		var predInput string
+
+		mess := []string{}
+		for _, i := range input.Messages {
+			// Use the configured label for this role, falling back to the
+			// role name itself when none is configured.
+			r := config.Roles[i.Role]
+			if r == "" {
+				r = i.Role
+			}
+
+			content := fmt.Sprint(r, " ", i.Content)
+			mess = append(mess, content)
+		}
+
+		predInput = strings.Join(mess, "\n")
+
+		if input.Stream {
+			log.Debug().Msgf("Stream request received")
+			c.Context().SetContentType("text/event-stream")
+			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
+			//	c.Set("Content-Type", "text/event-stream")
+			c.Set("Cache-Control", "no-cache")
+			c.Set("Connection", "keep-alive")
+			c.Set("Transfer-Encoding", "chunked")
+		}
+
+		templateFile := config.Model
+
+		if config.TemplateConfig.Chat != "" {
+			templateFile = config.TemplateConfig.Chat
+		}
+
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templatedInput, err := loader.TemplatePrefix(templateFile, struct {
+			Input string
+		}{Input: predInput})
+		if err == nil {
+			predInput = templatedInput
+			log.Debug().Msgf("Template found, input modified to: %s", predInput)
+		}
+
+		if input.Stream {
+			responses := make(chan OpenAIResponse)
+
+			// Producer: runs the prediction and forwards every token as a chunk.
+			go func() {
+				_, streamErr := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {}, func(s string) bool {
+					resp := OpenAIResponse{
+						Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+						Choices: []Choice{{Delta: &Message{Role: "assistant", Content: s}}},
+						Object:  "chat.completion.chunk",
+					}
+
+					responses <- resp
+					return true
+				})
+				// The response has already started, so the failure cannot be
+				// returned to the client; record it instead of dropping it.
+				if streamErr != nil {
+					log.Error().Msgf("Stream prediction failed: %s", streamErr.Error())
+				}
+				// Closing the channel ends the writer loop below.
+				close(responses)
+			}()
+
+			// Consumer: writes each chunk to the client as soon as it arrives.
+			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
+
+				for ev := range responses {
+					var buf bytes.Buffer
+					enc := json.NewEncoder(&buf)
+					enc.Encode(ev)
+
+					fmt.Fprintf(w, "event: data\n\n")
+					fmt.Fprintf(w, "data: %v\n\n", buf.String())
+					log.Debug().Msgf("Sending chunk: %s", buf.String())
+					w.Flush()
+				}
+
+				// Final chunk telling the client that generation has stopped.
+				w.WriteString("event: data\n\n")
+				resp := &OpenAIResponse{
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					Choices: []Choice{{FinishReason: "stop"}},
+				}
+				respData, _ := json.Marshal(resp)
+
+				w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
+				w.Flush()
+			}))
+			return nil
+		}
+
+		result, err := ComputeChoices(predInput, input, config, loader, func(s string, c *[]Choice) {
+			*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: s}})
+		}, nil)
+		if err != nil {
+			return err
+		}
+
+		resp := &OpenAIResponse{
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Choices: result,
+			Object:  "chat.completion",
+		}
+
+		// Return the prediction in the response body
+		return c.JSON(resp)
+	}
+}
+
+// editEndpoint returns the handler for edit requests.
+// The request's input and instruction are handed to the edit template (when
+// one is found); the prediction is returned in an "edit" response.
+func editEndpoint(cm ConfigMerger, debug bool, loader *model.ModelLoader, threads, ctx int, f16 bool) func(c *fiber.Ctx) error {
+	// Edit results are reported through the Text field of a choice.
+	collect := func(text string, choices *[]Choice) {
+		*choices = append(*choices, Choice{Text: text})
+	}
+
+	return func(c *fiber.Ctx) error {
+		config, input, err := readConfig(cm, c, loader, debug, threads, ctx, f16)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		log.Debug().Msgf("Parameter Config: %+v", config)
+
+		// An explicitly configured template wins over the model name.
+		templateFile := config.TemplateConfig.Edit
+		if templateFile == "" {
+			templateFile = config.Model
+		}
+
+		prompt := input.Input
+
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templated, tmplErr := loader.TemplatePrefix(templateFile, struct {
+			Input       string
+			Instruction string
+		}{Input: prompt, Instruction: input.Instruction})
+		if tmplErr == nil {
+			prompt = templated
+			log.Debug().Msgf("Template found, input modified to: %s", prompt)
+		}
+
+		choices, predErr := ComputeChoices(prompt, input, config, loader, collect, nil)
+		if predErr != nil {
+			return predErr
+		}
+
+		resp := &OpenAIResponse{
+			Object:  "edit",
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Choices: choices,
+		}
+
+		jsonResult, _ := json.Marshal(resp)
+		log.Debug().Msgf("Response: %s", jsonResult)
+
+		// Return the prediction in the response body
+		return c.JSON(resp)
+	}
+}
+
+func listModels(loader *model.ModelLoader, cm ConfigMerger) func(ctx *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		models, err := loader.ListModels()
+		if err != nil {
+			return err
+		}
+		var mm map[string]interface{} = map[string]interface{}{}
+
+		dataModels := []OpenAIModel{}
+		for _, m := range models {
+			mm[m] = nil
+			dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
+		}
+
+		for k := range cm {
+			if _, exists := mm[k]; !exists {
+				dataModels = append(dataModels, OpenAIModel{ID: k, Object: "model"})
+			}
+		}
+
+		return c.JSON(struct {
+			Object string        `json:"object"`
+			Data   []OpenAIModel `json:"data"`
+		}{
+			Object: "list",
+			Data:   dataModels,
+		})
+	}
+}
--- a/api/prediction.go
+++ b/api/prediction.go
@@ -0,0 +1,343 @@
+package api
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+	"sync"
+
+	"github.com/donomii/go-rwkv.cpp"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	gpt2 "github.com/go-skynet/go-gpt2.cpp"
+	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
+	llama "github.com/go-skynet/go-llama.cpp"
+	"github.com/hashicorp/go-multierror"
+)
+
+// tokenizerSuffix is appended to a model file name to form the tokenizer
+// file name passed to the RWKV loader.
+const tokenizerSuffix = ".tokenizer.json"
+
+// mutex still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
+// mutexMap guards the mutexes map; mutexes holds one lock per model file so
+// that predictions on the same model run one at a time.
+var mutexMap sync.Mutex
+var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)
+
+// loadedModels remembers the model found by greedyLoader for each model
+// file; muModels guards it.
+var loadedModels map[string]interface{} = map[string]interface{}{}
+var muModels sync.Mutex
+
+// backendLoader loads modelFile with the backend named by backendString
+// (matched case-insensitively). llamaOpts is used only by the llama backend
+// and threads only by the rwkv backend. An unknown backend name is an error.
+func backendLoader(backendString string, loader *model.ModelLoader, modelFile string, llamaOpts []llama.ModelOption, threads uint32) (model interface{}, err error) {
+	backend := strings.ToLower(backendString)
+
+	if backend == "llama" {
+		return loader.LoadLLaMAModel(modelFile, llamaOpts...)
+	}
+	if backend == "stablelm" {
+		return loader.LoadStableLMModel(modelFile)
+	}
+	if backend == "gpt2" {
+		return loader.LoadGPT2Model(modelFile)
+	}
+	if backend == "gptj" {
+		return loader.LoadGPTJModel(modelFile)
+	}
+	if backend == "rwkv" {
+		// The tokenizer is expected next to the model file.
+		return loader.LoadRWKV(modelFile, modelFile+tokenizerSuffix, threads)
+	}
+
+	return nil, fmt.Errorf("backend unsupported: %s", backendString)
+}
+
+// greedyLoader loads modelFile without knowing its backend: it tries llama,
+// gptj, gpt2, stablelm and rwkv in that order and returns the first model
+// that loads. The result is remembered per model file, so later calls return
+// it directly. When every backend fails, the returned error lists all the
+// individual failures.
+func greedyLoader(loader *model.ModelLoader, modelFile string, llamaOpts []llama.ModelOption, threads uint32) (model interface{}, err error) {
+	// Fast path: the model was already resolved by an earlier call.
+	muModels.Lock()
+	cached, found := loadedModels[modelFile]
+	muModels.Unlock()
+	if found {
+		return cached, nil
+	}
+
+	// Backends to try, in order of preference.
+	attempts := []func() (interface{}, error){
+		func() (interface{}, error) { return loader.LoadLLaMAModel(modelFile, llamaOpts...) },
+		func() (interface{}, error) { return loader.LoadGPTJModel(modelFile) },
+		func() (interface{}, error) { return loader.LoadGPT2Model(modelFile) },
+		func() (interface{}, error) { return loader.LoadStableLMModel(modelFile) },
+		func() (interface{}, error) { return loader.LoadRWKV(modelFile, modelFile+tokenizerSuffix, threads) },
+	}
+
+	for _, attempt := range attempts {
+		candidate, loadErr := attempt()
+		if loadErr != nil {
+			// Keep every failure so the final error explains all of them.
+			err = multierror.Append(err, loadErr)
+			continue
+		}
+
+		muModels.Lock()
+		loadedModels[modelFile] = candidate
+		muModels.Unlock()
+		return candidate, nil
+	}
+
+	return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
+}
+
+// ModelInference loads the model described by c and returns a function that
+// runs one prediction for the prompt s each time it is called.
+//
+// The model is loaded with the backend named in c.Backend, or by trying all
+// backends when none is named. tokenCallback, when not nil, receives the
+// generated text: token by token for backends that support streaming (llama,
+// rwkv), or the whole result in one call for the others.
+//
+// Calls to the returned function are serialized per model file.
+// An error is returned when the model cannot be loaded or when the loaded
+// model is of a type this function does not know how to drive.
+func ModelInference(s string, loader *model.ModelLoader, c Config, tokenCallback func(string) bool) (func() (string, error), error) {
+	supportStreams := false
+	modelFile := c.Model
+
+	// Try to load the model
+	llamaOpts := []llama.ModelOption{}
+	if c.ContextSize != 0 {
+		llamaOpts = append(llamaOpts, llama.SetContext(c.ContextSize))
+	}
+	if c.F16 {
+		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
+	}
+
+	var inferenceModel interface{}
+	var err error
+	if c.Backend == "" {
+		inferenceModel, err = greedyLoader(loader, modelFile, llamaOpts, uint32(c.Threads))
+	} else {
+		inferenceModel, err = backendLoader(c.Backend, loader, modelFile, llamaOpts, uint32(c.Threads))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// fn runs a single prediction with the backend-specific options.
+	var fn func() (string, error)
+
+	switch model := inferenceModel.(type) {
+	case *rwkv.RwkvState:
+		supportStreams = true
+
+		fn = func() (string, error) {
+			// NOTE(review): the prompt s is not handed to the model in this
+			// branch; it looks like a ProcessInput call is missing - confirm.
+			//model.ProcessInput("You are a chatbot that is very good at chatting.  blah blah blah")
+			// Only the first configured stop word is used; newline otherwise.
+			stopWord := "\n"
+			if len(c.StopWords) > 0 {
+				stopWord = c.StopWords[0]
+			}
+
+			response := model.GenerateResponse(c.Maxtokens, stopWord, float32(c.Temperature), float32(c.TopP), tokenCallback)
+
+			return response, nil
+		}
+	case *gpt2.StableLM:
+		fn = func() (string, error) {
+			// Generate the prediction using the language model
+			predictOptions := []gpt2.PredictOption{
+				gpt2.SetTemperature(c.Temperature),
+				gpt2.SetTopP(c.TopP),
+				gpt2.SetTopK(c.TopK),
+				gpt2.SetTokens(c.Maxtokens),
+				gpt2.SetThreads(c.Threads),
+			}
+
+			if c.Batch != 0 {
+				predictOptions = append(predictOptions, gpt2.SetBatch(c.Batch))
+			}
+
+			if c.Seed != 0 {
+				predictOptions = append(predictOptions, gpt2.SetSeed(c.Seed))
+			}
+
+			return model.Predict(
+				s,
+				predictOptions...,
+			)
+		}
+	case *gpt2.GPT2:
+		fn = func() (string, error) {
+			// Generate the prediction using the language model
+			predictOptions := []gpt2.PredictOption{
+				gpt2.SetTemperature(c.Temperature),
+				gpt2.SetTopP(c.TopP),
+				gpt2.SetTopK(c.TopK),
+				gpt2.SetTokens(c.Maxtokens),
+				gpt2.SetThreads(c.Threads),
+			}
+
+			if c.Batch != 0 {
+				predictOptions = append(predictOptions, gpt2.SetBatch(c.Batch))
+			}
+
+			if c.Seed != 0 {
+				predictOptions = append(predictOptions, gpt2.SetSeed(c.Seed))
+			}
+
+			return model.Predict(
+				s,
+				predictOptions...,
+			)
+		}
+	case *gptj.GPTJ:
+		fn = func() (string, error) {
+			// Generate the prediction using the language model
+			predictOptions := []gptj.PredictOption{
+				gptj.SetTemperature(c.Temperature),
+				gptj.SetTopP(c.TopP),
+				gptj.SetTopK(c.TopK),
+				gptj.SetTokens(c.Maxtokens),
+				gptj.SetThreads(c.Threads),
+			}
+
+			if c.Batch != 0 {
+				predictOptions = append(predictOptions, gptj.SetBatch(c.Batch))
+			}
+
+			if c.Seed != 0 {
+				predictOptions = append(predictOptions, gptj.SetSeed(c.Seed))
+			}
+
+			return model.Predict(
+				s,
+				predictOptions...,
+			)
+		}
+	case *llama.LLama:
+		supportStreams = true
+		fn = func() (string, error) {
+
+			if tokenCallback != nil {
+				model.SetTokenCallback(tokenCallback)
+			}
+
+			// Generate the prediction using the language model
+			predictOptions := []llama.PredictOption{
+				llama.SetTemperature(c.Temperature),
+				llama.SetTopP(c.TopP),
+				llama.SetTopK(c.TopK),
+				llama.SetTokens(c.Maxtokens),
+				llama.SetThreads(c.Threads),
+			}
+
+			if c.Debug {
+				predictOptions = append(predictOptions, llama.Debug)
+			}
+
+			predictOptions = append(predictOptions, llama.SetStopWords(c.StopWords...))
+
+			if c.RepeatPenalty != 0 {
+				predictOptions = append(predictOptions, llama.SetPenalty(c.RepeatPenalty))
+			}
+
+			if c.Keep != 0 {
+				predictOptions = append(predictOptions, llama.SetNKeep(c.Keep))
+			}
+
+			if c.Batch != 0 {
+				predictOptions = append(predictOptions, llama.SetBatch(c.Batch))
+			}
+
+			if c.F16 {
+				predictOptions = append(predictOptions, llama.EnableF16KV)
+			}
+
+			if c.IgnoreEOS {
+				predictOptions = append(predictOptions, llama.IgnoreEOS)
+			}
+
+			if c.Seed != 0 {
+				predictOptions = append(predictOptions, llama.SetSeed(c.Seed))
+			}
+
+			return model.Predict(
+				s,
+				predictOptions...,
+			)
+		}
+	default:
+		// Without this, fn would stay nil and the returned function would
+		// panic on its first call.
+		return nil, fmt.Errorf("unsupported model type %T for model %s", inferenceModel, modelFile)
+	}
+
+	return func() (string, error) {
+		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
+		// Look up (or create) the lock for this model file, then hold it for
+		// the duration of the prediction.
+		mutexMap.Lock()
+		l, ok := mutexes[modelFile]
+		if !ok {
+			m := &sync.Mutex{}
+			mutexes[modelFile] = m
+			l = m
+		}
+		mutexMap.Unlock()
+		l.Lock()
+		defer l.Unlock()
+
+		res, err := fn()
+		// Backends without token streaming report the whole result at once.
+		if tokenCallback != nil && !supportStreams {
+			tokenCallback(res)
+		}
+		return res, err
+	}, nil
+}
+
+// ComputeChoices runs the prediction for predInput input.N times (once when
+// N is zero), post-processes each result with Finetune and hands it to cb,
+// which is responsible for adding it to the list of choices.
+// tokenCallback is forwarded to ModelInference.
+// On failure the choices collected so far are returned with the error.
+func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
+	choices := []Choice{}
+
+	rounds := input.N
+	if rounds == 0 {
+		rounds = 1
+	}
+
+	// get the model function to call for the result
+	predict, loadErr := ModelInference(predInput, loader, *config, tokenCallback)
+	if loadErr != nil {
+		return choices, loadErr
+	}
+
+	for done := 0; done < rounds; done++ {
+		text, predErr := predict()
+		if predErr != nil {
+			return choices, predErr
+		}
+
+		cb(Finetune(*config, predInput, text), &choices)
+	}
+
+	return choices, nil
+}
+
+// cutstrings caches the compiled form of each cut-string pattern, keyed by
+// the pattern text; mu guards it.
+var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
+var mu sync.Mutex = sync.Mutex{}
+
+// cutstringRegexp returns the compiled regular expression for pattern,
+// compiling and caching it on first use. It panics when pattern is not a
+// valid regular expression.
+func cutstringRegexp(pattern string) *regexp.Regexp {
+	mu.Lock()
+	// Deferred so the lock is released even when MustCompile panics;
+	// otherwise one bad pattern would block every later call forever.
+	defer mu.Unlock()
+
+	reg, ok := cutstrings[pattern]
+	if !ok {
+		reg = regexp.MustCompile(pattern)
+		cutstrings[pattern] = reg
+	}
+	return reg
+}
+
+// Finetune post-processes a prediction according to config:
+//   - with Echo set, input is prepended to the prediction;
+//   - every match of each Cutstrings pattern (a regular expression) is removed;
+//   - for each TrimSpace entry, that prefix is removed and the surrounding
+//     whitespace is trimmed.
+//
+// It returns the processed prediction.
+func Finetune(config Config, input, prediction string) string {
+	if config.Echo {
+		prediction = input + prediction
+	}
+
+	for _, c := range config.Cutstrings {
+		prediction = cutstringRegexp(c).ReplaceAllString(prediction, "")
+	}
+
+	for _, c := range config.TrimSpace {
+		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
+	}
+	return prediction
+
+}
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,28 +1,15 @@
 version: '3.6'

 services:
-
-  # chatgpt:
-  #   image: ghcr.io/mckaywrigley/chatbot-ui:main
-  #   # platform: linux/amd64
-  #   ports:
-  #     - 3000:3000
-  #   environment:
-  #     - 'OPENAI_API_KEY=sk-000000000000000'
-  #     - 'OPENAI_API_HOST=http://api:8080'
-
  api:
-    image: quay.io/go-skynet/llama-cli:latest
+    image: quay.io/go-skynet/local-ai:latest
    build:
      context: .
-      dockerfile: Dockerfile
+      dockerfile: Dockerfile.dev
    ports:
      - 8080:8080
-    environment:
-      - MODELS_PATH=$MODELS_PATH
-      - CONTEXT_SIZE=$CONTEXT_SIZE
-      - THREADS=$THREADS
+    env_file:
+      - .env
    volumes:
      - ./models:/models:cached
-    command: api
-    
+    command: ["/usr/bin/local-ai" ]
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+cd /build
+
+make build
+
+./local-ai "$@"
--- a/examples/README.md
+++ b/examples/README.md
@@ -0,0 +1,14 @@
+# Examples
+
+Here is a list of projects that can easily be integrated with the LocalAI backend. 
+
+## Projects
+
+- [chatbot-ui](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui/) (by [@mkellerman](https://github.com/mkellerman))
+- [discord-bot](https://github.com/go-skynet/LocalAI/tree/master/examples/discord-bot/) (by [@mudler](https://github.com/mudler))
+- [langchain](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain/) (by [@dave-gray101](https://github.com/dave-gray101))
+- [slack-bot](https://github.com/go-skynet/LocalAI/tree/master/examples/slack-bot/) (by [@mudler](https://github.com/mudler))
+
+## Want to contribute?
+
+Create an issue, and put `Example: <description>` in the title! We will post your examples here.
--- a/examples/chatbot-ui/README.md
+++ b/examples/chatbot-ui/README.md
@@ -0,0 +1,46 @@
+# chatbot-ui
+
+Example of integration with [mckaywrigley/chatbot-ui](https://github.com/mckaywrigley/chatbot-ui).
+
+![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/chatbot-ui
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# start with docker-compose
+docker-compose up -d --build
+```
+
+## Pointing chatbot-ui to a separately managed LocalAI service
+
+If you want to use the [chatbot-ui example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui) with an externally managed LocalAI service, you can alter the `docker-compose` file so that it looks like the below. You will notice the file is smaller, because we have removed the section that would normally start the LocalAI service. Take care to update the IP address (or FQDN) that the chatbot-ui service tries to access (marked `<<LOCALAI_IP>>` below):
+```yaml
+version: '3.6'
+
+services:
+  chatgpt:
+    image: ghcr.io/mckaywrigley/chatbot-ui:main
+    ports:
+      - 3000:3000
+    environment:
+      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
+      - 'OPENAI_API_HOST=http://<<LOCALAI_IP>>:8080'
+```
+
+Once you've edited the `docker-compose` file, you can start it with `docker compose up`, then browse to `http://localhost:3000`.
+
+## Accessing chatbot-ui
+
+Open http://localhost:3000 for the Web UI.
+
--- a/examples/chatbot-ui/docker-compose.yaml
+++ b/examples/chatbot-ui/docker-compose.yaml
@@ -0,0 +1,24 @@
+version: '3.6'
+
+services:
+  # LocalAI API server, built from the repository root.
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - "8080:8080"
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai"]
+
+  # chatbot-ui web front end, pointed at the api service above.
+  chatgpt:
+    image: ghcr.io/mckaywrigley/chatbot-ui:main
+    ports:
+      - "3000:3000"
+    environment:
+      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
+      - 'OPENAI_API_HOST=http://api:8080'
--- a/examples/chatbot-ui/models/completion.tmpl
+++ b/examples/chatbot-ui/models/completion.tmpl
@@ -0,0 +1 @@
+{{.Input}}
--- a/examples/chatbot-ui/models/gpt-3.5-turbo.yaml
+++ b/examples/chatbot-ui/models/gpt-3.5-turbo.yaml
@@ -0,0 +1,17 @@
+# Name under which this configuration is registered; clients request it as
+# the model name.
+name: gpt-3.5-turbo
+parameters:
+  model: ggml-gpt4all-j
+  top_k: 80
+  temperature: 0.2
+  top_p: 0.7
+context_size: 1024
+threads: 14
+stopwords:
+- "HUMAN:"
+- "GPT:"
+# Text put in front of each chat message in place of its role name.
+roles:
+  user: " "
+  system: " "
+# Prompt templates per endpoint; see completion.tmpl and gpt4all.tmpl in this
+# directory.
+template:
+  completion: completion
+  chat: gpt4all
--- a/examples/chatbot-ui/models/gpt4all.tmpl
+++ b/examples/chatbot-ui/models/gpt4all.tmpl
@@ -0,0 +1,4 @@
+The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
+### Prompt:
+{{.Input}}
+### Response:
--- a/examples/discord-bot/.env.example
+++ b/examples/discord-bot/.env.example
@@ -0,0 +1,6 @@
+OPENAI_API_KEY=x
+DISCORD_BOT_TOKEN=x
+DISCORD_CLIENT_ID=x
+OPENAI_API_BASE=http://api:8080
+ALLOWED_SERVER_IDS=x
+SERVER_TO_MODERATION_CHANNEL=1:1
--- a/examples/discord-bot/README.md
+++ b/examples/discord-bot/README.md
@@ -0,0 +1,76 @@
+# discord-bot
+
+![Screenshot from 2023-05-01 07-58-19](https://user-images.githubusercontent.com/2420543/235413924-0cb2e75b-f2d6-4119-8610-44386e44afb8.png)
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/discord-bot
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Set the discord bot options (see: https://github.com/go-skynet/gpt-discord-bot#setup)
+cp -rfv .env.example .env
+vim .env
+
+# start with docker-compose
+docker-compose up -d --build
+```
+
+Note: see setup options here: https://github.com/go-skynet/gpt-discord-bot#setup
+
+Open up the URL in the console and give permission to the bot in your server. Start a thread with `/chat ..`
+
+## Kubernetes
+
+- install the local-ai chart first
+- change OPENAI_API_BASE to point to the API address and apply the discord-bot manifest:
+
+```yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: discord-bot
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: localai
+  namespace: discord-bot
+  labels:
+    app: localai
+spec:
+  selector:
+    matchLabels:
+      app: localai
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: localai
+      name: localai
+    spec:
+      containers:
+        - name: localai-discord
+          env:
+          - name: OPENAI_API_KEY
+            value: "x"
+          - name: DISCORD_BOT_TOKEN
+            value: ""
+          - name: DISCORD_CLIENT_ID
+            value: ""
+          - name: OPENAI_API_BASE
+            value: "http://local-ai.default.svc.cluster.local:8080"
+          - name: ALLOWED_SERVER_IDS
+            value: "xx"
+          - name: SERVER_TO_MODERATION_CHANNEL
+            value: "1:1"
+          image: quay.io/go-skynet/gpt-discord-bot:main
+```
--- a/examples/discord-bot/docker-compose.yaml
+++ b/examples/discord-bot/docker-compose.yaml
@@ -0,0 +1,21 @@
+version: '3.6'
+
+services:
+  # LocalAI API server, built from the repository root.
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - "8080:8080"
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai"]
+
+  # Discord bot; its settings are read from the .env file.
+  bot:
+    image: quay.io/go-skynet/gpt-discord-bot:main
+    env_file:
+      - .env
--- a/examples/discord-bot/models
+++ b/examples/discord-bot/models
@@ -0,0 +1 @@
+../chatbot-ui/models/
--- a/examples/langchain/.gitignore
+++ b/examples/langchain/.gitignore
@@ -0,0 +1,2 @@
+models/ggml-koala-13B-4bit-128g
+models/ggml-gpt4all-j
--- a/examples/langchain/JS.Dockerfile
+++ b/examples/langchain/JS.Dockerfile
@@ -0,0 +1,6 @@
+FROM node:latest
+COPY ./langchainjs-localai-example /app
+WORKDIR /app
+RUN npm install
+RUN npm run build
+ENTRYPOINT [ "npm", "run", "start" ]
--- a/examples/langchain/README.md
+++ b/examples/langchain/README.md
@@ -0,0 +1,31 @@
+# langchain
+
+Example of using langchain in TypeScript, with the standard OpenAI llm module, and LocalAI.
+
+An example for Python langchain will follow at a later date.
+
+The example is set up so that the `index.mts` file can easily be modified to look like any langchain example file.
+
+**Please Note** - This is a tech demo example at this time. ggml-gpt4all-j has pretty terrible results for most langchain applications with the settings used in this example.
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/langchain
+
+# (optional) - Edit the example code in typescript.
+# vi ./langchainjs-localai-example/src/index.mts
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# start with docker-compose
+docker-compose up --build
+```
+
+## Copyright
+
+Some of the example code in index.mts is adapted from the langchainjs project and is Copyright (c) Harrison Chase. Used under the terms of the MIT license, as is the remainder of this code.
--- a/examples/langchain/docker-compose.yaml
+++ b/examples/langchain/docker-compose.yaml
@@ -0,0 +1,25 @@
+version: '3.6'
+
+services:
+  # LocalAI API server, built from the repository root.
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - "8080:8080"
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai"]
+
+  # langchainjs example program, pointed at the api service above.
+  langchainjs:
+    build:
+      context: .
+      dockerfile: JS.Dockerfile
+    environment:
+      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
+      - 'OPENAI_API_HOST=http://api:8080/v1'
+      # Other model names to try: ggml-gpt4all-j, ggml-koala-13B-4bit-128g
+      - 'MODEL_NAME=gpt-3.5-turbo'
--- a/examples/langchain/langchainjs-localai-example/.gitignore
+++ b/examples/langchain/langchainjs-localai-example/.gitignore
@@ -0,0 +1,2 @@
+node_modules/
+dist/
--- a/examples/langchain/langchainjs-localai-example/.vscode/launch.json
+++ b/examples/langchain/langchainjs-localai-example/.vscode/launch.json
@@ -0,0 +1,20 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "type": "node",
+            "request": "launch",
+            "name": "Launch Program",
+            // "skipFiles": [
+            //     "<node_internals>/**"
+            // ],
+            "program": "${workspaceFolder}/dist/index.mjs", // compiled entry point; forward slashes resolve on every OS
+            "outFiles": [
+                "${workspaceFolder}/dist/**/*.mjs" // tsc emits .mjs for .mts sources into dist/
+            ]
+        }
+    ]
+}
--- a/examples/langchain/langchainjs-localai-example/package-lock.json
+++ b/examples/langchain/langchainjs-localai-example/package-lock.json
--- a/examples/langchain/langchainjs-localai-example/package.json
+++ b/examples/langchain/langchainjs-localai-example/package.json
@@ -0,0 +1,21 @@
+{
+  "name": "langchainjs-localai-example",
+  "version": "0.1.0",
+  "description": "Trivial Example of using langchain + the OpenAI API + LocalAI together",
+  "main": "index.mjs",
+  "scripts": {
+    "build": "tsc --build",
+    "clean": "tsc --build --clean",
+    "start": "node --trace-warnings dist/index.mjs"
+  },
+  "author": "dave@gray101.com",
+  "license": "MIT",
+  "devDependencies": {
+    "@types/node": "^18.16.3",
+    "typescript": "^5.0.4"
+  },
+  "dependencies": {
+    "langchain": "^0.0.67",
+    "typeorm": "^0.3.15"
+  }
+}
--- a/examples/langchain/langchainjs-localai-example/src/index.mts
+++ b/examples/langchain/langchainjs-localai-example/src/index.mts
@@ -0,0 +1,79 @@
+import { OpenAIChat } from "langchain/llms/openai";
+import { loadQAStuffChain } from "langchain/chains";
+import { Document } from "langchain/document";
+import { initializeAgentExecutorWithOptions } from "langchain/agents";
+import {Calculator} from "langchain/tools/calculator";
+
+const pathToLocalAi = process.env['OPENAI_API_HOST'] || 'http://api:8080/v1';
+const fakeApiKey = process.env['OPENAI_API_KEY'] || '-';
+const modelName = process.env['MODEL_NAME'] || 'gpt-3.5-turbo';
+
+function getModel(): OpenAIChat { // Builds a fresh OpenAIChat client; each example below calls this.
+  return new OpenAIChat({
+    prefixMessages: [
+      {
+        role: "system",
+        content: "You are a helpful assistant that answers in pirate language",
+      },
+    ],
+    modelName: modelName, // from MODEL_NAME, falling back to 'gpt-3.5-turbo'
+    maxTokens: 50,
+    openAIApiKey: fakeApiKey, // from OPENAI_API_KEY, falling back to '-'
+    maxRetries: 2
+  }, { // second constructor argument: carries the API base path and key
+    basePath: pathToLocalAi, // from OPENAI_API_HOST, falling back to 'http://api:8080/v1'
+    apiKey: fakeApiKey,
+  });
+}
+
+// Minimal example: send a single prompt with model.call, logging a UTC timestamp before and after.
+export const run = async () => {
+  const model = getModel();
+  console.log(`about to model.call at ${new Date().toUTCString()}`);
+  const res = await model.call(
+    "What would be a good company name a company that makes colorful socks?"
+  );
+  console.log(`${new Date().toUTCString()}`); // completion time, for eyeballing how long the call took
+  console.log({ res });
+};
+
+await run();
+
+// This example uses the `StuffDocumentsChain` (obtained via loadQAStuffChain) to answer a question from two in-memory documents.
+export const run2 = async () => {
+  const model = getModel();
+  const chainA = loadQAStuffChain(model);
+  const docs = [
+    new Document({ pageContent: "Harrison went to Harvard." }),
+    new Document({ pageContent: "Ankush went to Princeton." }),
+  ];
+  const resA = await chainA.call({
+    input_documents: docs, // the documents the chain is given alongside the question
+    question: "Where did Harrison go to college?",
+  });
+  console.log({ resA });
+};
+
+await run2();
+
+// Quickly thrown-together example of using tools + agents: an agent executor given a single Calculator tool.
+// This seems like it should work, but it doesn't yet. Note that it is still awaited at the bottom of this file.
+export const temporarilyBrokenToolTest = async () => {
+  const model = getModel();
+
+  const executor = await initializeAgentExecutorWithOptions([new Calculator(true)], model, {
+    agentType: "zero-shot-react-description",
+  });
+
+  console.log("Loaded agent.");
+
+  const input = `What is the value of (500 *2) + 350 - 13?`; // arithmetic answer: 1337
+
+  console.log(`Executing with input "${input}"...`);
+
+  const result = await executor.call({ input });
+
+  console.log(`Got output ${result.output}`);
+}
+
+await temporarilyBrokenToolTest();
--- a/examples/langchain/langchainjs-localai-example/tsconfig.json
+++ b/examples/langchain/langchainjs-localai-example/tsconfig.json
@@ -0,0 +1,15 @@
+{
+  "compilerOptions": {
+    "target": "es2022",
+    "lib": ["ES2022", "DOM"],
+    "module": "ES2022", // ES module output; src/index.mts uses top-level await
+    "moduleResolution": "node",
+    "strict": true,
+    "esModuleInterop": true,
+    "allowSyntheticDefaultImports": true,
+    "isolatedModules": true,
+    "outDir": "./dist" // package.json's "start" script runs dist/index.mjs
+  },
+  "include": ["src", "test"],
+  "exclude": ["node_modules", "dist"]
+}
--- a/examples/langchain/models/completion.tmpl
+++ b/examples/langchain/models/completion.tmpl
@@ -0,0 +1 @@
+{{.Input}}
--- a/examples/langchain/models/gpt-3.5-turbo.yaml
+++ b/examples/langchain/models/gpt-3.5-turbo.yaml
@@ -0,0 +1,17 @@
+name: gpt-3.5-turbo  # matches MODEL_NAME passed to the langchainjs service in docker-compose.yaml
+parameters:
+  model: ggml-gpt4all-j  # file the README downloads into models/; alternative tried: ggml-koala-13B-4bit-128g
+  top_k: 80
+  temperature: 0.2
+  top_p: 0.7
+context_size: 1024
+threads: 4
+stopwords:
+- "HUMAN:"
+- "GPT:"
+roles:
+  user: " "  # presumably a blank role prefix so no role label is injected - TODO confirm
+  system: " "
+template:
+  completion: completion  # presumably completion.tmpl, which is just {{.Input}} - TODO confirm lookup rule
+  chat: completion  # alternative: gpt4all (gpt4all.tmpl wraps the input in a Prompt/Response frame)
--- a/examples/langchain/models/gpt4all.tmpl
+++ b/examples/langchain/models/gpt4all.tmpl
@@ -0,0 +1,4 @@
+The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
+### Prompt:
+{{.Input}}
+### Response:
--- a/examples/slack-bot/.env.example
+++ b/examples/slack-bot/.env.example
@@ -0,0 +1,11 @@
+SLACK_APP_TOKEN=xapp-1-...
+SLACK_BOT_TOKEN=xoxb-...
+OPENAI_API_KEY=sk-...
+OPENAI_API_BASE=http://api:8080
+OPENAI_MODEL=gpt-3.5-turbo
+OPENAI_TIMEOUT_SECONDS=60
+#OPENAI_SYSTEM_TEXT="You proofread text. When you receive a message, you will check
+#for mistakes and make suggestion to improve the language of the given text"
+USE_SLACK_LANGUAGE=true
+SLACK_APP_LOG_LEVEL=INFO
+TRANSLATE_MARKDOWN=true
--- a/examples/slack-bot/README.md
+++ b/examples/slack-bot/README.md
@@ -0,0 +1,27 @@
+# Slack bot
+
+Slackbot using: https://github.com/seratch/ChatGPT-in-Slack
+
+## Setup
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/slack-bot
+
+git clone https://github.com/seratch/ChatGPT-in-Slack
+
+# (optional) Checkout a specific LocalAI tag
+# git checkout -b build <TAG>
+
+# Download gpt4all-j to models/
+wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+
+# Set the Slack bot options (see: https://github.com/seratch/ChatGPT-in-Slack)
+cp -rfv .env.example .env
+vim .env
+
+# start with docker-compose
+docker-compose up -d --build
+```
--- a/examples/slack-bot/docker-compose.yaml
+++ b/examples/slack-bot/docker-compose.yaml
@@ -0,0 +1,23 @@
+version: '3.6'
+
+services:
+  api:  # LocalAI server; .env.example points the bot at http://api:8080
+    image: quay.io/go-skynet/local-ai:latest
+    build:
+      context: ../../
+      dockerfile: Dockerfile.dev
+    ports:
+      - '8080:8080'  # quoted so the HOST:CONTAINER pair is always read as a string
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached  # host ./models backs MODELS_PATH
+    command: ["/usr/bin/local-ai"]
+
+  bot:
+    build:
+      context: ./ChatGPT-in-Slack  # checkout created by the git clone step in README.md
+      dockerfile: Dockerfile
+    env_file:
+      - .env  # copied from .env.example per README.md
--- a/examples/slack-bot/models
+++ b/examples/slack-bot/models
@@ -0,0 +1 @@
+../chatbot-ui/models
--- a/go.mod
+++ b/go.mod
@@ -1,30 +1,58 @@
-module github.com/go-skynet/llama-cli
+module github.com/go-skynet/LocalAI

 go 1.19

 require (
-	github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640
-	github.com/gofiber/fiber/v2 v2.42.0
-	github.com/urfave/cli/v2 v2.25.0
+	github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708
+	github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c
+	github.com/go-skynet/go-llama.cpp v0.0.0-20230502121737-8ceb6167e405
+	github.com/gofiber/fiber/v2 v2.44.0
+	github.com/hashicorp/go-multierror v1.1.1
+	github.com/jaypipes/ghw v0.10.0
+	github.com/onsi/ginkgo/v2 v2.9.3
+	github.com/onsi/gomega v1.27.6
+	github.com/otiai10/openaigo v1.1.0
+	github.com/rs/zerolog v1.29.1
+	github.com/sashabaranov/go-openai v1.9.1
+	github.com/urfave/cli/v2 v2.25.3
+	github.com/valyala/fasthttp v1.47.0
+	gopkg.in/yaml.v3 v3.0.1
 )

 require (
-	github.com/andybalholm/brotli v1.0.4 // indirect
+	github.com/StackExchange/wmi v1.2.1 // indirect
+	github.com/andybalholm/brotli v1.0.5 // indirect
 	github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
+	github.com/donomii/go-rwkv.cpp v0.0.0-20230502223004-0a3db3d72e7d // indirect
+	github.com/ghodss/yaml v1.0.0 // indirect
+	github.com/go-logr/logr v1.2.4 // indirect
+	github.com/go-ole/go-ole v1.2.6 // indirect
+	github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
+	github.com/google/go-cmp v0.5.9 // indirect
+	github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect
 	github.com/google/uuid v1.3.0 // indirect
-	github.com/klauspost/compress v1.15.9 // indirect
+	github.com/hashicorp/errwrap v1.0.0 // indirect
+	github.com/jaypipes/pcidb v1.0.0 // indirect
+	github.com/klauspost/compress v1.16.3 // indirect
+	github.com/kr/text v0.2.0 // indirect
 	github.com/mattn/go-colorable v0.1.13 // indirect
-	github.com/mattn/go-isatty v0.0.17 // indirect
+	github.com/mattn/go-isatty v0.0.18 // indirect
 	github.com/mattn/go-runewidth v0.0.14 // indirect
-	github.com/philhofer/fwd v1.1.1 // indirect
+	github.com/mitchellh/go-homedir v1.1.0 // indirect
+	github.com/philhofer/fwd v1.1.2 // indirect
+	github.com/pkg/errors v0.9.1 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
 	github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 // indirect
-	github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d // indirect
-	github.com/tinylib/msgp v1.1.6 // indirect
+	github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee // indirect
+	github.com/tinylib/msgp v1.1.8 // indirect
 	github.com/valyala/bytebufferpool v1.0.0 // indirect
-	github.com/valyala/fasthttp v1.44.0 // indirect
 	github.com/valyala/tcplisten v1.0.0 // indirect
 	github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
-	golang.org/x/sys v0.6.0 // indirect
+	golang.org/x/net v0.9.0 // indirect
+	golang.org/x/sys v0.7.0 // indirect
+	golang.org/x/text v0.9.0 // indirect
+	golang.org/x/tools v0.8.0 // indirect
+	gopkg.in/yaml.v2 v2.4.0 // indirect
+	howett.net/plist v1.0.0 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -1,86 +1,190 @@
-github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
-github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
+github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
+github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8=
+github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
+github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
+github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
+github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
+github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
+github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
 github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
+github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230502223004-0a3db3d72e7d h1:lSHwlYf1H4WAWYgf7rjEVTGen1qmigUq2Egpu8mnQiY=
+github.com/donomii/go-rwkv.cpp v0.0.0-20230502223004-0a3db3d72e7d/go.mod h1:H6QBF7/Tz6DAEBDXQged4H1BvsmqY/K5FG9wQRGa01g=
+github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
+github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
 github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640 h1:8SSVbQ3yvq7JnfLCLF4USV0PkQnnduUkaNCv/hHDa3E=
-github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
+github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
+github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
+github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
+github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
+github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708 h1:cfOi4TWvQ6JsAm9Q1A8I8j9YfNy10bmIfwOiyGyU5wQ=
+github.com/go-skynet/go-gpt2.cpp v0.0.0-20230422085954-245a5bfe6708/go.mod h1:1Wj/xbkMfwQSOrhNYK178IzqQHstZbRfhx4s8p1M5VM=
+github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c h1:48I7jpLNGiQeBmF0SFVVbREh8vlG0zN13v9LH5ctXis=
+github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230422090028-1f7bff57f66c/go.mod h1:5VZ9XbcINI0XcHhkcX8GPK8TplFGAzu1Hrg4tNiMCtI=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230430075552-377fd245eae2 h1:CYQRCbOfYtC77OxweAyrdxSVwoLIM/EdZ6Ij+xBzta8=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230430075552-377fd245eae2/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230502121737-8ceb6167e405 h1:pbIxJ/eiL1Irdprxk/mquaxjR1XDGCE+7CT9BGJNRaY=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230502121737-8ceb6167e405/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
 github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
-github.com/gofiber/fiber/v2 v2.42.0 h1:Fnp7ybWvS+sjNQsFvkhf4G8OhXswvB6Vee8hM/LyS+8=
-github.com/gofiber/fiber/v2 v2.42.0/go.mod h1:3+SGNjqMh5VQH5Vz2Wdi43zTIV16ktlFd3x3R6O1Zlc=
+github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls=
+github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
+github.com/gofiber/fiber/v2 v2.44.0 h1:Z90bEvPcJM5GFJnu1py0E1ojoerkyew3iiNJ78MQCM8=
+github.com/gofiber/fiber/v2 v2.44.0/go.mod h1:VTMtb/au8g01iqvHyaCzftuM/xmZgKOZCtFzz6CdV9w=
+github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
 github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
+github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
+github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
 github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
 github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
-github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
+github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA=
+github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
+github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
+github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
+github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
+github.com/jaypipes/ghw v0.10.0 h1:UHu9UX08Py315iPojADFPOkmjTsNzHj4g4adsNKKteY=
+github.com/jaypipes/ghw v0.10.0/go.mod h1:jeJGbkRB2lL3/gxYzNYzEDETV1ZJ56OKr+CSeSEym+g=
+github.com/jaypipes/pcidb v1.0.0 h1:vtZIfkiCUE42oYbJS0TAq9XSfSmcsgo9IdxSm9qzYU8=
+github.com/jaypipes/pcidb v1.0.0/go.mod h1:TnYUvqhPBzCKnH34KrIX22kAeEbDCSRJ9cqLRCuNDfk=
+github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
+github.com/klauspost/compress v1.16.3 h1:XuJt9zzcnaz6a16/OU53ZjWp/v7/42WcR5t2a0PcNQY=
+github.com/klauspost/compress v1.16.3/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
+github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
+github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
 github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
-github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng=
-github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
+github.com/mattn/go-isatty v0.0.18 h1:DOKFKCQ7FNG2L1rbrmstDN4QVRdS89Nkh85u68Uwp98=
+github.com/mattn/go-isatty v0.0.18/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
 github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
+github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
+github.com/onsi/ginkgo/v2 v2.9.2/go.mod h1:WHcJJG2dIlcCqVfBAwUCrJxSPFb6v4azBwgxeMeDuts=
+github.com/onsi/ginkgo/v2 v2.9.3 h1:5X2vl/isiKqkrOYjiaGgp3JQOcLV59g5o5SuTMqCcxU=
+github.com/onsi/ginkgo/v2 v2.9.3/go.mod h1:gCQYp2Q+kSoIj7ykSVb9nskRSsR6PUj4AiLywzIhbKM=
 github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
-github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ=
+github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg=
+github.com/otiai10/mint v1.4.1 h1:HOVBfKP1oXIc0wWo9hZ8JLdZtyCPWqjvmFDuVZ0yv2Y=
+github.com/otiai10/openaigo v1.1.0 h1:zRvGBqZUW5PCMgdkJNsPVTBd8tOLCMTipXE5wD2pdTg=
+github.com/otiai10/openaigo v1.1.0/go.mod h1:792bx6AWTS61weDi2EzKpHHnTF4eDMAlJ5GvAk/mgPg=
 github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
+github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw=
+github.com/philhofer/fwd v1.1.2/go.mod h1:qkPdfjR2SIEbspLqpe1tO4n5yICnr2DY7mqEx2tUTP0=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
+github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
+github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
 github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/sashabaranov/go-openai v1.9.1 h1:3N52HkJKo9Zlo/oe1AVv5ZkCOny0ra58/ACvAxkN3MM=
+github.com/sashabaranov/go-openai v1.9.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
 github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 h1:rmMl4fXJhKMNWl+K+r/fq4FbbKI+Ia2m9hYBLm2h4G4=
 github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94/go.mod h1:90zrgN3D/WJsDd1iXHT96alCoN2KJo6/4x1DZC3wZs8=
-github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d h1:Q+gqLBOPkFGHyCJxXMRqtUgUbTjI8/Ze8vu8GGyNFwo=
 github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d/go.mod h1:Gy+0tqhJvgGlqnTF8CVGP0AaGRjwBtXs/a5PA0Y3+A4=
-github.com/tinylib/msgp v1.1.6 h1:i+SbKraHhnrf9M5MYmvQhFnbLhAXSDWF8WWsuyRdocw=
+github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee h1:8Iv5m6xEo1NR1AvpV+7XmhI4r39LGNzwUL4YpMuL5vk=
+github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee/go.mod h1:qwtSXrKuJh/zsFQ12yEE89xfCrGKK63Rr7ctU/uCo4g=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
+github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/tinylib/msgp v1.1.6/go.mod h1:75BAfg2hauQhs3qedfdDZmWAPcFMAvJE5b9rGOMufyw=
-github.com/urfave/cli/v2 v2.25.0 h1:ykdZKuQey2zq0yin/l7JOm9Mh+pg72ngYMeB0ABn6q8=
-github.com/urfave/cli/v2 v2.25.0/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
+github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0=
+github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw=
+github.com/urfave/cli/v2 v2.25.3 h1:VJkt6wvEBOoSjPFQvOkv6iWIrsJyCrKGtCtxXWwmGeY=
+github.com/urfave/cli/v2 v2.25.3/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
 github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
 github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
-github.com/valyala/fasthttp v1.44.0 h1:R+gLUhldIsfg1HokMuQjdQ5bh9nuXHPIfvkYUu9eR5Q=
-github.com/valyala/fasthttp v1.44.0/go.mod h1:f6VbjjoI3z1NDOZOv17o6RvtRSWxC77seBFc2uWtgiY=
+github.com/valyala/fasthttp v1.47.0 h1:y7moDoxYzMooFpT5aHgNgVOQDrS3qlkfiP9mDtGGK9c=
+github.com/valyala/fasthttp v1.47.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA=
 github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
 github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
 github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU=
 github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.0.0-20220214200702-86341886e292/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
 golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
 golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
+golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
+golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM=
+golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
+golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU=
+golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
+golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ=
 golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
+golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s=
+golang.org/x/tools v0.8.0 h1:vSDcovVPld282ceKgDimkRSC8kpaH1dgyc9UMzlt84Y=
+golang.org/x/tools v0.8.0/go.mod h1:JxBZ99ISMI5ViVkT1tr6tdNmXeTrcpVSD3vZ1RsRdN4=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
+gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM=
+howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g=
--- a/kubernetes/deployment.yaml
+++ b/kubernetes/deployment.yaml
@@ -1,42 +0,0 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: llama
---
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llama
-  namespace: llama
-  labels:
-    app: llama
-spec:
-  selector:
-    matchLabels:
-      app: llama
-  replicas: 1
-  template:
-    metadata:
-      labels:
-        app: llama
-      name: llama
-    spec:
-      containers:
-        - name: llama
-          args:
-          - api
-          image: quay.io/go-skynet/llama-cli:latest
---
-apiVersion: v1
-kind: Service
-metadata:
-  name: llama
-  namespace: llama
-spec:
-  selector:
-    app: llama
-  type: LoadBalancer
-  ports:
-    - protocol: TCP
-      port: 8080
-      targetPort: 8080
--- a/main.go
+++ b/main.go
@@ -1,232 +1,97 @@
 package main

 import (
-	"bytes"
-	"fmt"
-	"io/ioutil"
 	"os"
-	"runtime"
-	"text/template"
-
-	llama "github.com/go-skynet/go-llama.cpp"
-	api "github.com/go-skynet/llama-cli/api"
-	model "github.com/go-skynet/llama-cli/pkg/model"

+	api "github.com/go-skynet/LocalAI/api"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/jaypipes/ghw"
+	"github.com/rs/zerolog"
+	"github.com/rs/zerolog/log"
 	"github.com/urfave/cli/v2"
 )

-// Define the template string
-var emptyInput string = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-{{.Instruction}}
-
-### Response:`
-
-var nonEmptyInput string = `Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
-### Instruction:
-{{.Instruction}}
-
-### Input:
-{{.Input}}
-
-### Response:
-`
-
-func llamaFromOptions(ctx *cli.Context) (*llama.LLama, error) {
-	opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
-	return llama.New(ctx.String("model"), opts...)
-}
-
-func templateString(t string, in interface{}) (string, error) {
-	// Parse the template
-	tmpl, err := template.New("prompt").Parse(t)
-	if err != nil {
-		return "", err
-	}
-
-	var buf bytes.Buffer
-	err = tmpl.Execute(&buf, in)
-	if err != nil {
-		return "", err
-	}
-	return buf.String(), nil
-}
-
-var modelFlags = []cli.Flag{
-	&cli.StringFlag{
-		Name:    "model",
-		EnvVars: []string{"MODEL"},
-	},
-	&cli.IntFlag{
-		Name:    "tokens",
-		EnvVars: []string{"TOKENS"},
-		Value:   128,
-	},
-	&cli.IntFlag{
-		Name:    "context-size",
-		EnvVars: []string{"CONTEXT_SIZE"},
-		Value:   512,
-	},
-	&cli.IntFlag{
-		Name:    "threads",
-		EnvVars: []string{"THREADS"},
-		Value:   runtime.NumCPU(),
-	},
-	&cli.Float64Flag{
-		Name:    "temperature",
-		EnvVars: []string{"TEMPERATURE"},
-		Value:   0.95,
-	},
-	&cli.Float64Flag{
-		Name:    "topp",
-		EnvVars: []string{"TOP_P"},
-		Value:   0.85,
-	},
-	&cli.IntFlag{
-		Name:    "topk",
-		EnvVars: []string{"TOP_K"},
-		Value:   20,
-	},
-}
-
 func main() {
+	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
+
+	path, err := os.Getwd()
+	if err != nil {
+		log.Error().Msgf("error: %s", err.Error())
+		os.Exit(1)
+	}
+
+	threads := 4
+	cpu, err := ghw.CPU()
+	if err == nil {
+		threads = int(cpu.TotalCores)
+	}
+
 	app := &cli.App{
-		Name:    "llama-cli",
-		Version: "0.1",
-		Usage:   "llama-cli --model ... --instruction 'What is an alpaca?'",
-		Flags: append(modelFlags,
-			&cli.StringFlag{
-				Name:    "template",
-				EnvVars: []string{"TEMPLATE"},
+		Name:  "LocalAI",
+		Usage: "OpenAI compatible API for running LLaMA/GPT models locally on CPU with consumer grade hardware.",
+		Flags: []cli.Flag{
+			&cli.BoolFlag{
+				Name:    "f16",
+				EnvVars: []string{"F16"},
+			},
+			&cli.BoolFlag{
+				Name:    "debug",
+				EnvVars: []string{"DEBUG"},
+			},
+			&cli.IntFlag{
+				Name:        "threads",
+				DefaultText: "Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested.",
+				EnvVars:     []string{"THREADS"},
+				Value:       threads,
 			},
 			&cli.StringFlag{
-				Name:    "instruction",
-				EnvVars: []string{"INSTRUCTION"},
+				Name:        "models-path",
+				DefaultText: "Path containing models used for inferencing",
+				EnvVars:     []string{"MODELS_PATH"},
+				Value:       path,
 			},
 			&cli.StringFlag{
-				Name:    "input",
-				EnvVars: []string{"INPUT"},
-			}),
-		Description: `Run llama.cpp inference`,
-		UsageText: `
-llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
-
-	An Alpaca (Vicugna pacos) is a domesticated species of South American camelid, related to llamas and originally from Peru but now found throughout much of Andean region. They are bred for their fleeces which can be spun into wool or knitted items such as hats, sweaters, blankets etc
-		
-echo "An Alpaca (Vicugna pacos) is a domesticated species of South American camelid, related to llamas and originally from Peru but now found throughout much of Andean region. They are bred for their fleeces which can be spun into wool or knitted items such as hats, sweaters, blankets etc" | llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "Proofread, improving clarity and flow" --input "-"
-
-	An Alpaca (Vicugna pacos) is a domesticated species from South America that's related to llamas. Originating in Peru but now found throughout the Andean region, they are bred for their fleeces which can be spun into wool or knitted items such as hats and sweaters—blankets too!
-`,
-		Copyright: "go-skynet authors",
-		Commands: []*cli.Command{
-			{
-
-				Name: "api",
-				Flags: []cli.Flag{
-					&cli.IntFlag{
-						Name:    "threads",
-						EnvVars: []string{"THREADS"},
-						Value:   runtime.NumCPU(),
-					},
-					&cli.StringFlag{
-						Name:    "models-path",
-						EnvVars: []string{"MODELS_PATH"},
-					},
-					&cli.StringFlag{
-						Name:    "address",
-						EnvVars: []string{"ADDRESS"},
-						Value:   ":8080",
-					},
-					&cli.IntFlag{
-						Name:    "context-size",
-						EnvVars: []string{"CONTEXT_SIZE"},
-						Value:   512,
-					},
-				},
-				Action: func(ctx *cli.Context) error {
-					return api.Start(model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
-				},
+				Name:        "config-file",
+				DefaultText: "Config file",
+				EnvVars:     []string{"CONFIG_FILE"},
+			},
+			&cli.StringFlag{
+				Name:        "address",
+				DefaultText: "Bind address for the API server.",
+				EnvVars:     []string{"ADDRESS"},
+				Value:       ":8080",
+			},
+			&cli.IntFlag{
+				Name:        "context-size",
+				DefaultText: "Default context size of the model",
+				EnvVars:     []string{"CONTEXT_SIZE"},
+				Value:       512,
 			},
 		},
+		Description: `
+LocalAI is a drop-in replacement OpenAI API which runs inference locally.
+
+Some of the models compatible are:
+- Vicuna
+- Koala
+- GPT4ALL
+- GPT4ALL-J
+- Cerebras
+- Alpaca
+- StableLM (ggml quantized)
+
+It uses llama.cpp, ggml and gpt4all as backend with golang c bindings.
+`,
+		UsageText: `local-ai [options]`,
+		Copyright: "go-skynet authors",
 		Action: func(ctx *cli.Context) error {
-
-			instruction := ctx.String("instruction")
-			input := ctx.String("input")
-			templ := ctx.String("template")
-
-			promptTemplate := ""
-
-			if input != "" {
-				promptTemplate = nonEmptyInput
-			} else {
-				promptTemplate = emptyInput
-			}
-
-			if templ != "" {
-				dat, err := os.ReadFile(templ)
-				if err != nil {
-					fmt.Printf("Failed reading file: %s", err.Error())
-					os.Exit(1)
-				}
-				promptTemplate = string(dat)
-			}
-
-			if instruction == "-" {
-				dat, err := ioutil.ReadAll(os.Stdin)
-				if err != nil {
-					fmt.Printf("reading stdin failed: %s", err)
-					os.Exit(1)
-				}
-				instruction = string(dat)
-			}
-
-			if input == "-" {
-				dat, err := ioutil.ReadAll(os.Stdin)
-				if err != nil {
-					fmt.Printf("reading stdin failed: %s", err)
-					os.Exit(1)
-				}
-				input = string(dat)
-			}
-
-			str, err := templateString(promptTemplate, struct {
-				Instruction string
-				Input       string
-			}{Instruction: instruction, Input: input})
-
-			if err != nil {
-				fmt.Println("Templating the input failed:", err.Error())
-				os.Exit(1)
-			}
-
-			l, err := llamaFromOptions(ctx)
-			if err != nil {
-				fmt.Println("Loading the model failed:", err.Error())
-				os.Exit(1)
-			}
-
-			res, err := l.Predict(
-				str,
-				llama.SetTemperature(ctx.Float64("temperature")),
-				llama.SetTopP(ctx.Float64("topp")),
-				llama.SetTopK(ctx.Int("topk")),
-				llama.SetTokens(ctx.Int("tokens")),
-				llama.SetThreads(ctx.Int("threads")),
-			)
-			if err != nil {
-				fmt.Printf("predicting failed: %s", err)
-				os.Exit(1)
-			}
-			fmt.Println(res)
-			return nil
+			return api.App(ctx.String("config-file"), model.NewModelLoader(ctx.String("models-path")), ctx.Int("threads"), ctx.Int("context-size"), ctx.Bool("f16"), ctx.Bool("debug"), false).Listen(ctx.String("address"))
 		},
 	}

-	err := app.Run(os.Args)
+	err = app.Run(os.Args)
 	if err != nil {
-		fmt.Println(err)
+		log.Error().Msgf("error: %s", err.Error())
 		os.Exit(1)
 	}
 }
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -10,31 +10,57 @@ import (
 	"sync"
 	"text/template"

+	"github.com/rs/zerolog/log"
+
+	rwkv "github.com/donomii/go-rwkv.cpp"
+	gpt2 "github.com/go-skynet/go-gpt2.cpp"
+	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
 	llama "github.com/go-skynet/go-llama.cpp"
 )

 type ModelLoader struct {
-	modelPath        string
-	mu               sync.Mutex
-	models           map[string]*llama.LLama
-	promptsTemplates map[string]*template.Template
+	ModelPath string
+	mu        sync.Mutex
+
+	models            map[string]*llama.LLama
+	gptmodels         map[string]*gptj.GPTJ
+	gpt2models        map[string]*gpt2.GPT2
+	gptstablelmmodels map[string]*gpt2.StableLM
+	rwkv              map[string]*rwkv.RwkvState
+	promptsTemplates  map[string]*template.Template
 }

 func NewModelLoader(modelPath string) *ModelLoader {
-	return &ModelLoader{modelPath: modelPath, models: make(map[string]*llama.LLama), promptsTemplates: make(map[string]*template.Template)}
+	return &ModelLoader{
+		ModelPath:         modelPath,
+		gpt2models:        make(map[string]*gpt2.GPT2),
+		gptmodels:         make(map[string]*gptj.GPTJ),
+		gptstablelmmodels: make(map[string]*gpt2.StableLM),
+		models:            make(map[string]*llama.LLama),
+		rwkv:              make(map[string]*rwkv.RwkvState),
+		promptsTemplates:  make(map[string]*template.Template),
+	}
+}
+
+func (ml *ModelLoader) ExistsInModelPath(s string) bool {
+	_, err := os.Stat(filepath.Join(ml.ModelPath, s))
+	return err == nil
 }

 func (ml *ModelLoader) ListModels() ([]string, error) {
-	files, err := ioutil.ReadDir(ml.modelPath)
+	files, err := ioutil.ReadDir(ml.ModelPath)
 	if err != nil {
 		return []string{}, err
 	}

 	models := []string{}
 	for _, file := range files {
-		if strings.HasSuffix(file.Name(), ".bin") {
-			models = append(models, strings.TrimRight(file.Name(), ".bin"))
+		// Skip templates, YAML and .keep files
+		if strings.HasSuffix(file.Name(), ".tmpl") || strings.HasSuffix(file.Name(), ".keep") || strings.HasSuffix(file.Name(), ".yaml") || strings.HasSuffix(file.Name(), ".yml") {
+			continue
 		}
+
+		models = append(models, file.Name())
 	}

 	return models, nil
@@ -46,12 +72,19 @@ func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string,

 	m, ok := ml.promptsTemplates[modelName]
 	if !ok {
-		// try to find a s.bin
-		modelBin := fmt.Sprintf("%s.bin", modelName)
-		m, ok = ml.promptsTemplates[modelBin]
-		if !ok {
-			return "", fmt.Errorf("no prompt template available")
+		modelFile := filepath.Join(ml.ModelPath, modelName)
+		if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
+			return "", err
 		}
+
+		t, exists := ml.promptsTemplates[modelName]
+		if exists {
+			m = t
+		}
+
+	}
+	if m == nil {
+		return "", nil
 	}

 	var buf bytes.Buffer
@@ -62,53 +95,191 @@ func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string,
 	return buf.String(), nil
 }

-func (ml *ModelLoader) LoadModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
+func (ml *ModelLoader) loadTemplateIfExists(modelName, modelFile string) error {
+	// Check if the template was already loaded
+	if _, ok := ml.promptsTemplates[modelName]; ok {
+		return nil
+	}
+
+	// Check if the model path exists
+	// skip any error here - we run anyway if a template does not exist
+	modelTemplateFile := fmt.Sprintf("%s.tmpl", modelName)
+
+	if !ml.ExistsInModelPath(modelTemplateFile) {
+		return nil
+	}
+
+	dat, err := os.ReadFile(filepath.Join(ml.ModelPath, modelTemplateFile))
+	if err != nil {
+		return err
+	}
+
+	// Parse the template
+	tmpl, err := template.New("prompt").Parse(string(dat))
+	if err != nil {
+		return err
+	}
+	ml.promptsTemplates[modelName] = tmpl
+
+	return nil
+}
+
+func (ml *ModelLoader) LoadStableLMModel(modelName string) (*gpt2.StableLM, error) {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()

 	// Check if we already have a loaded model
-	modelFile := filepath.Join(ml.modelPath, modelName)
+	if !ml.ExistsInModelPath(modelName) {
+		return nil, fmt.Errorf("model does not exist")
+	}

-	if m, ok := ml.models[modelFile]; ok {
+	if m, ok := ml.gptstablelmmodels[modelName]; ok {
+		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
 		return m, nil
 	}

-	// Check if the model path exists
-	if _, err := os.Stat(modelFile); os.IsNotExist(err) {
-		// try to find a s.bin
-		modelBin := fmt.Sprintf("%s.bin", modelFile)
-		if _, err := os.Stat(modelBin); os.IsNotExist(err) {
-			return nil, err
-		} else {
-			modelName = fmt.Sprintf("%s.bin", modelName)
-			modelFile = modelBin
-		}
+	// Load the model and keep it in memory for later use
+	modelFile := filepath.Join(ml.ModelPath, modelName)
+	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
+
+	model, err := gpt2.NewStableLM(modelFile)
+	if err != nil {
+		return nil, err
+	}
+
+	// If there is a prompt template, load it
+	if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
+		return nil, err
+	}
+
+	ml.gptstablelmmodels[modelName] = model
+	return model, err
+}
+
+func (ml *ModelLoader) LoadGPT2Model(modelName string) (*gpt2.GPT2, error) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+
+	// Check if we already have a loaded model
+	if !ml.ExistsInModelPath(modelName) {
+		return nil, fmt.Errorf("model does not exist")
+	}
+
+	if m, ok := ml.gpt2models[modelName]; ok {
+		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
+		return m, nil
 	}

 	// Load the model and keep it in memory for later use
+	modelFile := filepath.Join(ml.ModelPath, modelName)
+	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
+
+	model, err := gpt2.New(modelFile)
+	if err != nil {
+		return nil, err
+	}
+
+	// If there is a prompt template, load it
+	if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
+		return nil, err
+	}
+
+	ml.gpt2models[modelName] = model
+	return model, err
+}
+
+func (ml *ModelLoader) LoadGPTJModel(modelName string) (*gptj.GPTJ, error) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+
+	// Check if we already have a loaded model
+	if !ml.ExistsInModelPath(modelName) {
+		return nil, fmt.Errorf("model does not exist")
+	}
+
+	if m, ok := ml.gptmodels[modelName]; ok {
+		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
+		return m, nil
+	}
+
+	// Load the model and keep it in memory for later use
+	modelFile := filepath.Join(ml.ModelPath, modelName)
+	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
+
+	model, err := gptj.New(modelFile)
+	if err != nil {
+		return nil, err
+	}
+
+	// If there is a prompt template, load it
+	if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
+		return nil, err
+	}
+
+	ml.gptmodels[modelName] = model
+	return model, err
+}
+
+func (ml *ModelLoader) LoadRWKV(modelName, tokenFile string, threads uint32) (*rwkv.RwkvState, error) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+
+	log.Debug().Msgf("Loading model name: %s", modelName)
+
+	// Check if we already have a loaded model
+	if !ml.ExistsInModelPath(modelName) {
+		return nil, fmt.Errorf("model does not exist")
+	}
+
+	if m, ok := ml.rwkv[modelName]; ok {
+		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
+		return m, nil
+	}
+
+	// Load the model and keep it in memory for later use
+	modelFile := filepath.Join(ml.ModelPath, modelName)
+	tokenPath := filepath.Join(ml.ModelPath, tokenFile)
+	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
+
+	model := rwkv.LoadFiles(modelFile, tokenPath, threads)
+	if model == nil {
+		return nil, fmt.Errorf("could not load model")
+	}
+
+	ml.rwkv[modelName] = model
+	return model, nil
+}
+
+func (ml *ModelLoader) LoadLLaMAModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+
+	log.Debug().Msgf("Loading model name: %s", modelName)
+
+	// Check if we already have a loaded model
+	if !ml.ExistsInModelPath(modelName) {
+		return nil, fmt.Errorf("model does not exist")
+	}
+
+	if m, ok := ml.models[modelName]; ok {
+		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
+		return m, nil
+	}
+
+	// Load the model and keep it in memory for later use
+	modelFile := filepath.Join(ml.ModelPath, modelName)
+	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
+
 	model, err := llama.New(modelFile, opts...)
 	if err != nil {
 		return nil, err
 	}

 	// If there is a prompt template, load it
-
-	modelTemplateFile := fmt.Sprintf("%s.tmpl", modelFile)
-	// Check if the model path exists
-	if _, err := os.Stat(modelTemplateFile); err == nil {
-		dat, err := os.ReadFile(modelTemplateFile)
-		if err != nil {
-			return nil, err
-		}
-
-		// Parse the template
-		tmpl, err := template.New("prompt").Parse(string(dat))
-		if err != nil {
-			return nil, err
-		}
-		ml.promptsTemplates[modelName] = tmpl
+	if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
+		return nil, err
 	}

-	ml.models[modelFile] = model
+	ml.models[modelName] = model
 	return model, err
 }
--- a/prompt-templates/ggml-gpt4all-j.tmpl
+++ b/prompt-templates/ggml-gpt4all-j.tmpl
@@ -0,0 +1,4 @@
+The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
+### Prompt:
+{{.Input}}
+### Response:
--- a/prompt-templates/wizardlm.tmpl
+++ b/prompt-templates/wizardlm.tmpl
@@ -0,0 +1,3 @@
+{{.Input}}
+
+### Response:
--- a/renovate.json
+++ b/renovate.json
@@ -0,0 +1,17 @@
+{
+  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
+  "extends": [
+    "config:base"
+  ],
+  "regexManagers": [
+    {
+      "fileMatch": [
+        "^Makefile$"
+      ],
+      "matchStrings": [
+        "#\\s*renovate:\\s*datasource=(?<datasource>.*?) depName=(?<depName>.*?)( datasourceTemplate=(?<datasourceTemplate>.*?))?( packageNameTemplate=(?<packageNameTemplate>.*?))?( depNameTemplate=(?<depNameTemplate>.*?))?( valueTemplate=(?<currentValueTemplate>.*?))?( versioning=(?<versioning>.*?))?\\s+.+_VERSION=(?<currentValue>.*?)\\s"
+      ],
+      "versioningTemplate": "{{#if versioning}}{{versioning}}{{/if}}"
+    }
+  ]
+}
--- a/tests/fixtures/completion.tmpl
+++ b/tests/fixtures/completion.tmpl
@@ -0,0 +1 @@
+{{.Input}}
--- a/tests/fixtures/config.yaml
+++ b/tests/fixtures/config.yaml
@@ -0,0 +1,28 @@
+- name: list1
+  parameters:
+    model: testmodel
+  context_size: 512
+  threads: 10
+  stopwords:
+  - "HUMAN:"
+  - "### Response:"
+  roles:
+    user: "HUMAN:"
+    system: "GPT:"
+  template:
+    completion: completion
+    chat: ggml-gpt4all-j
+- name: list2
+  parameters:
+    model: testmodel
+  context_size: 512
+  threads: 10
+  stopwords:
+  - "HUMAN:"
+  - "### Response:"
+  roles:
+    user: "HUMAN:"
+    system: "GPT:"
+  template:
+    completion: completion
+    chat: ggml-gpt4all-j
--- a/tests/fixtures/ggml-gpt4all-j.tmpl
+++ b/tests/fixtures/ggml-gpt4all-j.tmpl
@@ -0,0 +1,4 @@
+The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
+### Prompt:
+{{.Input}}
+### Response:
--- a/tests/fixtures/gpt4.yaml
+++ b/tests/fixtures/gpt4.yaml
@@ -0,0 +1,14 @@
+name: gpt4all
+parameters:
+  model: testmodel
+context_size: 512
+threads: 10
+stopwords:
+- "HUMAN:"
+- "### Response:"
+roles:
+  user: "HUMAN:"
+  system: "GPT:"
+template:
+  completion: completion
+  chat: ggml-gpt4all-j
--- a/tests/fixtures/gpt4_2.yaml
+++ b/tests/fixtures/gpt4_2.yaml
@@ -0,0 +1,14 @@
+name: gpt4all-2
+parameters:
+  model: testmodel
+context_size: 1024
+threads: 5
+stopwords:
+- "HUMAN:"
+- "### Response:"
+roles:
+  user: "HUMAN:"
+  system: "GPT:"
+template:
+  completion: completion
+  chat: ggml-gpt4all-j
Author	SHA1	Message	Date
Ettore Di Giacinto	4eae570ef5	Update docs (#163 )	2023-05-03 15:51:54 +02:00
Ettore Di Giacinto	67992a7d99	feat: support slices or strings in the prompt completion endpoint (#162 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-03 13:13:31 +02:00
renovate[bot]	0a4899f366	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 8ceb616 (#150 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-03 11:48:06 +02:00
renovate[bot]	1eb02f6c91	fix(deps): update module github.com/onsi/ginkgo/v2 to v2.9.3 (#161 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-03 11:47:54 +02:00
mudler	575874e4fb	readme: minor update	2023-05-03 11:46:29 +02:00
Ettore Di Giacinto	751b7eca62	feat: add rwkv support (#158 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-03 11:45:22 +02:00
Ettore Di Giacinto	1ae7150810	feat: allow to specify default backend for model (#156 ) Signed-off-by: mudler <mudler@c3os.io>	2023-05-03 00:31:28 +02:00
Ettore Di Giacinto	70caf9bf8c	feat: support stopwords both string and arrays (#154 )	2023-05-02 23:30:00 +02:00
Dave	0b226ac027	Stop parameter of OpenAIRequest changed to String Array (#153 )	2023-05-02 22:02:45 +02:00
Ettore Di Giacinto	220d6fd59b	feat: add stream events (#152 )	2023-05-02 20:03:35 +02:00
antongisli	0a00a4b58e	adding mac build and example (#151 ) Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-05-02 19:24:45 +02:00
Ettore Di Giacinto	156e15a4fa	Bump llama.cpp, downgrade gpt4all-j (#149 )	2023-05-02 16:07:18 +02:00
renovate[bot]	271d3f6673	fix(deps): update module github.com/valyala/fasthttp to v1.47.0 (#143 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 23:36:58 +02:00
Ettore Di Giacinto	fec4ab93c5	docs: Add langchain to the example index (#147 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 23:21:07 +02:00
renovate[bot]	38a7a7a54d	fix(deps): update github.com/go-skynet/go-gpt4all-j.cpp digest to 77bf8c1 (#141 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-05-01 23:18:41 +02:00
Ettore Di Giacinto	0db0704e2c	docs: Add slack-bot example (#145 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 23:18:24 +02:00
Dave	88f472e5d2	Add LangchainJS Examples (#146 )	2023-05-01 23:18:14 +02:00
Ettore Di Giacinto	92452d46da	feat: add new gpt4all-j binding (#142 )	2023-05-01 20:00:15 +02:00
Ettore Di Giacinto	ac70252d70	drop: remove helm charts, now in separate repo (#134 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-05-01 18:07:41 +02:00
renovate[bot]	f6451d2518	fix(deps): update module github.com/urfave/cli/v2 to v2.25.3 (#140 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 18:07:29 +02:00
Ettore Di Giacinto	2473f9d19b	docs: add discord-bot preview (#137 )	2023-05-01 11:03:34 +02:00
renovate[bot]	bc583385a9	fix(deps): update module github.com/urfave/cli/v2 to v2.25.2 (#136 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 07:53:48 +02:00
renovate[bot]	8286bfbab7	fix(deps): update module github.com/sashabaranov/go-openai to v1.9.1 (#135 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-05-01 07:52:20 +02:00
Ettore Di Giacinto	d129fabe3b	docs: enhancements (#133 )	2023-04-30 23:27:02 +02:00
renovate[bot]	2539867247	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 377fd24 (#129 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-30 11:09:48 +02:00
renovate[bot]	69fedb92d9	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 361b9f8 (#127 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-30 08:47:27 +02:00
Ettore Di Giacinto	54b5eadcc4	docs: add discord-bot example (#126 )	2023-04-30 00:31:28 +02:00
Ettore Di Giacinto	16773e2a35	feat: make images to build sources on start (#124 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-29 20:38:37 +02:00
renovate[bot]	78503c62b7	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 9bf702f (#125 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-29 16:53:39 +02:00
Ettore Di Giacinto	a330c9cee5	update: bump llama.cpp to 7f15c5c (#122 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-29 15:20:50 +02:00
Ettore Di Giacinto	ff0867996e	tests: increase timeout (#121 )	2023-04-29 14:56:00 +02:00
Ettore Di Giacinto	1bf8f996d1	docs: clarify GPT4ALL-J licensing (#120 )	2023-04-29 14:50:22 +02:00
Ettore Di Giacinto	52f4d993c1	feat: add /edit endpoint (#119 )	2023-04-29 09:22:09 +02:00
renovate[bot]	d0ceebc5d7	fix(deps): update module github.com/valyala/fasthttp to v1.46.0 (#118 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-28 22:44:29 +02:00
renovate[bot]	9122af3ae1	fix(deps): update github.com/go-skynet/go-llama.cpp digest to 3d084e4 (#108 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-28 19:24:49 +02:00
Ettore Di Giacinto	b8533428bc	bump: update llama.cpp (#117 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-28 19:24:28 +02:00
Ettore Di Giacinto	677905334c	docs: reorder section (#116 )	2023-04-28 13:55:23 +02:00
Mauro Morales	d1d55d29a0	Add Kairos LocalAI example to the links (#115 )	2023-04-28 13:52:17 +02:00
Ettore Di Giacinto	e07dba7ad6	docs: Add contributors (#113 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-28 10:54:39 +02:00
Matthieu Talbot	062f832510	Add EXPOSE to Dockerfile (#107 )	2023-04-27 16:45:24 +00:00
Ettore Di Giacinto	d0330bb64b	docs: update example README.md (#104 )	2023-04-27 17:46:14 +02:00
antongisli	91a23ec6ec	Anton readme (#99 ) Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-04-27 17:17:03 +02:00
Ron Evans	0b000dd043	examples: correct typo in README (#103 ) Signed-off-by: deadprogram <ron@hybridgroup.com>	2023-04-27 17:14:38 +02:00
Ettore Di Giacinto	c73ba91a66	docs: update README	2023-04-27 15:39:48 +02:00
Ettore Di Giacinto	dfc00f8bc1	docs: update README.md (#98 )	2023-04-27 15:06:55 +02:00
Ettore Di Giacinto	a18ff9c9b3	docs: move api docs (#96 )	2023-04-27 10:42:50 +02:00
Ettore Di Giacinto	d0199279ad	docs: update, add config docs (#94 )	2023-04-27 10:39:01 +02:00
Ettore Di Giacinto	9ede1e12d8	few typos and clarity changes (#91 ) (#92 ) Co-authored-by: antongisli <anton@huge.geek.nz>	2023-04-27 07:47:39 +02:00
Ettore Di Giacinto	c806eae0de	feat: config files and SSE (#83 ) Signed-off-by: mudler <mudler@mocaccino.org> Signed-off-by: Tyler Gillson <tyler.gillson@gmail.com> Co-authored-by: Tyler Gillson <tyler.gillson@gmail.com>	2023-04-26 21:18:18 -07:00
renovate[bot]	4e2061636e	chore(deps): update actions/checkout action to v3 (#82 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-25 07:46:29 +02:00
renovate[bot]	e3ef171968	fix(deps): update module github.com/gofiber/fiber/v2 to v2.44.0 (#81 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-25 07:46:14 +02:00
Ettore Di Giacinto	12d83a4184	feat: Return OpenAI errors and update docs (#80 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-24 23:42:03 +02:00
renovate[bot]	045412e8dd	fix(deps): update module github.com/urfave/cli/v2 to v2.25.1 (#78 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-24 18:16:23 +02:00
renovate[bot]	9896a9a58b	fix(deps): update github.com/go-skynet/go-llama.cpp digest to e45cebe (#77 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-24 18:16:10 +02:00
Ettore Di Giacinto	b9011bda59	feat: automatic updates with renovate, docs updates (#76 )	2023-04-24 18:10:58 +02:00
Ettore Di Giacinto	2b2f5fa36a	feat: update llama.cpp (#72 )	2023-04-24 14:15:49 +02:00
renovate[bot]	43c557dc5c	fix(deps): update github.com/go-skynet/go-gpt4all-j.cpp digest to 1f7bff5 (#74 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-24 14:14:21 +02:00
renovate[bot]	7abb2c9bd7	fix(deps): update github.com/go-skynet/go-gpt2.cpp digest to 245a5bf (#73 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-24 14:13:04 +02:00
renovate[bot]	7a9ea4480a	Configure Renovate (#71 ) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>	2023-04-24 14:11:39 +02:00
Vladimir Malyutin	31bcc558de	Update README.md (#62 )	2023-04-22 14:42:30 +02:00
Ettore Di Giacinto	676e15f785	fix: make MacOS builds work (#61 )	2023-04-22 11:05:23 +02:00
Marc R Kellerman	3e71c90949	feature: add devcontainer for live debugging (#60 )	2023-04-22 01:20:03 +02:00
Ettore Di Giacinto	550ae9c968	docs: add Discord channel link (#59 )	2023-04-22 00:46:17 +02:00
Ettore Di Giacinto	1c872ec326	feat: add CI/tests (#58 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-22 00:44:52 +02:00
Marc R Kellerman	05f35b182c	fix(makefile): fix go-gpt2 folder and add verification before git clone (#51 ) Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>	2023-04-22 00:29:32 +02:00
Ettore Di Giacinto	79791438fe	Use the first available model if not specified (#55 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-21 22:54:43 +02:00
Tyler Gillson	bf20cc34f6	feat: Add helm chart (#56 )	2023-04-21 13:22:03 -07:00
Ettore Di Giacinto	5cba71de70	Add stopwords, debug mode, and other API enhancements (#54 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-21 19:46:59 +02:00
Ettore Di Giacinto	4b7e83056d	Update .env	2023-04-21 01:47:35 +02:00
Ettore Di Giacinto	ed954d66c3	Do not take all CPU by default (#50 )	2023-04-21 00:55:19 +02:00
Ettore Di Giacinto	f816dfae65	Add support for stablelm (#48 ) Signed-off-by: mudler <mudler@mocaccino.org>	2023-04-21 00:06:55 +02:00
Ettore Di Giacinto	142bcd66ca	Cleanup makefile, fix dep versions (#46 ) Signed-off-by: mudler <mudler@c3os.io>	2023-04-20 19:49:06 +02:00
Ettore Di Giacinto	1c4fbaae20	Add support for cerebras (#45 ) Signed-off-by: mudler <mudler@c3os.io>	2023-04-20 19:33:36 +02:00
Ettore Di Giacinto	d517a54e28	Major API enhancements (#44 )	2023-04-20 18:33:02 +02:00
Tyler Gillson	c905512bb0	Update example K8s manifests (#40 )	2023-04-20 18:31:11 +02:00
Ettore Di Giacinto	1254951fab	Add logo (#37 ) Signed-off-by: mudler <mudler@c3os.io>	2023-04-19 19:03:12 +02:00
Ettore Di Giacinto	80f50e6ccd	Rename project to LocalAI (#35 ) Signed-off-by: mudler <mudler@c3os.io>	2023-04-19 18:43:10 +02:00
Ettore Di Giacinto	7fec26f5d3	Enhancements (#34 ) Signed-off-by: mudler <mudler@c3os.io>	2023-04-19 17:10:29 +02:00
Ettore Di Giacinto	a9a875ee2b	⬆️ Bump llama.cpp (#33 ) Signed-off-by: mudler <mudler@c3os.io>	2023-04-17 21:34:02 +02:00
Ettore Di Giacinto	db5ac715f3	Use a reasonable default context size (#31 )	2023-04-17 18:45:42 +02:00