Compare commits


65 Commits
v0.2 ... v1.2.0

Author SHA1 Message Date
Ettore Di Giacinto
ed954d66c3 Do not take all CPU by default (#50) 2023-04-21 00:55:19 +02:00
Ettore Di Giacinto
f816dfae65 Add support for stablelm (#48)
Signed-off-by: mudler <mudler@mocaccino.org>
2023-04-21 00:06:55 +02:00
Ettore Di Giacinto
142bcd66ca Cleanup makefile, fix dep versions (#46)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-20 19:49:06 +02:00
Ettore Di Giacinto
1c4fbaae20 Add support for cerebras (#45)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-20 19:33:36 +02:00
Ettore Di Giacinto
d517a54e28 Major API enhancements (#44) 2023-04-20 18:33:02 +02:00
Tyler Gillson
c905512bb0 Update example K8s manifests (#40) 2023-04-20 18:31:11 +02:00
Ettore Di Giacinto
1254951fab Add logo (#37)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-19 19:03:12 +02:00
Ettore Di Giacinto
80f50e6ccd Rename project to LocalAI (#35)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-19 18:43:10 +02:00
Ettore Di Giacinto
7fec26f5d3 Enhancements (#34)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-19 17:10:29 +02:00
Ettore Di Giacinto
a9a875ee2b ⬆️ Bump llama.cpp (#33)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-17 21:34:02 +02:00
Ettore Di Giacinto
db5ac715f3 Use a reasonable default context size (#31) 2023-04-17 18:45:42 +02:00
Ettore Di Giacinto
0b330d90ad feat: drop embedded webui (#27)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-16 10:46:20 +02:00
Ettore Di Giacinto
63601fabd1 feat: drop default model and llama-specific API (#26)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-16 10:40:50 +02:00
Ettore Di Giacinto
1370b4482f 📖 Add prompt-templates examples (#25)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-16 10:24:15 +02:00
Ettore Di Giacinto
b062f3142b feat: enhance API, expose more parameters (#24)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-16 10:16:48 +02:00
Marc R Kellerman
c37175271f feature: makefile & updates (#23)
Co-authored-by: mudler <mudler@c3os.io>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2023-04-15 16:39:07 -07:00
Ettore Di Giacinto
e8eab66c30 Merge pull request #22 from go-skynet/update-llama.cpp
⬆️ Update go-llama.cpp to `llama.cpp-2f7c8e0`
2023-04-16 00:06:52 +02:00
mudler
a73a497143 Update llama.cpp 2023-04-15 23:57:00 +02:00
Ettore Di Giacinto
6aea515e1d Merge pull request #20 from go-skynet/mudler-patch-1
📖 Update README.md
2023-04-15 00:38:30 +02:00
Ettore Di Giacinto
dfc2b7e02a 📖 Update README.md 2023-04-15 00:38:18 +02:00
Ettore Di Giacinto
040290971c Merge pull request #19 from go-skynet/tags
Use tags for go-llama.cpp
2023-04-15 00:14:47 +02:00
mudler
553bad585e Use tags for go-llama.cpp 2023-04-15 00:07:39 +02:00
Ettore Di Giacinto
f76b612506 Merge pull request #17 from go-skynet/mudler-patch-1
Fix comment typo
2023-04-13 15:21:13 +02:00
Ettore Di Giacinto
c4e94c88d7 Fix comment typo
Thanks to @deadprogram for noticing it!
2023-04-13 15:20:51 +02:00
mudler
a9cd6b3ca3 ci: Fix tag detection for 'latest' 2023-04-13 01:37:09 +02:00
mudler
e786576b95 Update README 2023-04-13 01:28:15 +02:00
Ettore Di Giacinto
d426571789 Merge pull request #16 from go-skynet/fix_arm
Drop armv7 builds
2023-04-13 01:21:58 +02:00
mudler
a896a2b5ad Drop armv7 builds 2023-04-13 01:21:40 +02:00
Ettore Di Giacinto
8273cd5c04 Merge pull request #15 from go-skynet/docker-compose
Add docker-compose file
2023-04-13 01:17:44 +02:00
mudler
16f1281d38 Minor workflow fixes 2023-04-13 01:16:13 +02:00
mudler
8042e9a2d6 Add docker-compose
Fixes #14

Signed-off-by: mudler <mudler@c3os.io>
2023-04-13 01:13:14 +02:00
mudler
624092cb99 Update README 2023-04-12 00:07:30 +02:00
mudler
a422a883ac Minor rephrasing 2023-04-12 00:04:15 +02:00
mudler
7858a97254 Update README 2023-04-12 00:02:47 +02:00
mudler
5556aa46dd Small refinements and refactors 2023-04-12 00:02:39 +02:00
mudler
eb4257f946 Add .gitignore 2023-04-11 23:44:00 +02:00
mudler
ae30bd346d Reorganize repository layout 2023-04-11 23:43:43 +02:00
mudler
93d8977ba2 Return model list 2023-04-10 12:02:40 +02:00
mudler
f43aeeb4a1 Add both API endpoints (completion, chat) 2023-04-09 12:30:55 +02:00
mudler
c17dcc5e9d Allow to inject prompt as part of the call 2023-04-09 09:36:19 +02:00
mudler
4a932483e1 Small fixup to template loading 2023-04-08 11:59:40 +02:00
mudler
b710147b95 Add mutex on same models (parallel isn't supported yet) 2023-04-08 11:45:36 +02:00
mudler
ba70363330 Use template input 2023-04-08 11:24:25 +02:00
mudler
9fb581739b Allow to template model prompts inputs 2023-04-08 10:46:51 +02:00
mudler
48aca246e3 Drop unused interactive mode 2023-04-07 11:31:14 +02:00
mudler
12eee097b7 Make it compatible with openAI api, support multiple models
Signed-off-by: mudler <mudler@c3os.io>
2023-04-07 11:30:59 +02:00
mudler
b33d015b8c Use go-llama.cpp 2023-04-07 10:08:15 +02:00
Ettore Di Giacinto
b7c0a108f5 Update README.md 2023-04-05 22:28:03 +02:00
Ettore Di Giacinto
f694a89c28 Update README.md 2023-04-05 22:14:00 +02:00
Ettore Di Giacinto
be682e6c2f Update README.md
Add short-term roadmap and mention webui
2023-04-05 22:04:35 +02:00
mudler
bf85a31f9e Don't set a default model path 2023-04-05 22:00:15 +02:00
Ettore Di Giacinto
d69048e0b0 Update README.md 2023-04-05 00:41:02 +02:00
mudler
827f189163 Update README 2023-03-30 18:46:11 +02:00
mudler
a23deb5ec7 Drop duplicate target 2023-03-29 19:44:41 +02:00
mudler
999676b106 Add gpt4all instructions 2023-03-29 18:58:54 +02:00
mudler
c61b023bc8 Drop fat images, will document how to consume models 2023-03-29 18:55:24 +02:00
mudler
650a22aef1 Add compatibility to gpt4all models 2023-03-29 18:53:24 +02:00
mudler
17b1724f7c Update llama-go 2023-03-27 01:18:14 +02:00
mudler
e860e62036 Add mutex, build only lite images 2023-03-27 01:01:38 +02:00
Ettore Di Giacinto
1f45ff8cd6 Update README.md 2023-03-26 23:37:26 +02:00
mudler
abee34f60a Cleanup leftover 2023-03-25 01:10:50 +01:00
mudler
dbc70dc13c Add a simple web-page as index of the API for helping with inference testing 2023-03-25 01:09:51 +01:00
mudler
55142065eb Update README with building instructions 2023-03-24 01:11:13 +01:00
mudler
d83d2293b5 Update version in kubernetes deployment 2023-03-23 23:22:43 +01:00
mudler
467ce5a7aa Update models download instructions, update images 2023-03-23 22:06:41 +01:00
27 changed files with 1234 additions and 861 deletions

1
.dockerignore Normal file

@@ -0,0 +1 @@
models

4
.env Normal file

@@ -0,0 +1,4 @@
THREADS=14
CONTEXT_SIZE=512
MODELS_PATH=/models
# DEBUG=true


@@ -2,6 +2,7 @@
name: 'build container images'
on:
pull_request:
push:
branches:
- master
@@ -12,68 +13,42 @@ jobs:
docker:
runs-on: ubuntu-latest
steps:
- name: Release space from worker
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
df -h
- name: Checkout
uses: actions/checkout@v3
- name: Prepare
id: prep
run: |
DOCKER_IMAGE=quay.io/go-skynet/llama-cli
VERSION=latest
DOCKER_IMAGE=quay.io/go-skynet/local-ai
VERSION=master
SHORTREF=${GITHUB_SHA::8}
# If this is git tag, use the tag name as a docker tag
if [[ $GITHUB_REF == refs/tags/* ]]; then
VERSION=${GITHUB_REF#refs/tags/}
fi
TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}"
# If the VERSION looks like a version number, assume that
# this is the most recent version of the image and also
# tag it 'latest'.
if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
TAGS="$TAGS,${DOCKER_IMAGE}:latest"
fi
# Set output parameters.
echo ::set-output name=tags::${TAGS}
echo ::set-output name=docker_image::${DOCKER_IMAGE}
echo ::set-output name=image::${DOCKER_IMAGE}:${VERSION}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v2
@@ -81,9 +56,23 @@ jobs:
registry: quay.io
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- uses: earthly/actions/setup-earthly@v1
- name: Build
run: |
earthly config "global.conversion_parallelism" "1"
earthly config "global.buildkit_max_parallelism" "1"
earthly --push +image-all --IMAGE=${{ steps.prep.outputs.image }}
if: github.event_name != 'pull_request'
uses: docker/build-push-action@v4
with:
builder: ${{ steps.buildx.outputs.name }}
context: .
file: ./Dockerfile
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.prep.outputs.tags }}
- name: Build PRs
if: github.event_name == 'pull_request'
uses: docker/build-push-action@v4
with:
builder: ${{ steps.buildx.outputs.name }}
context: .
file: ./Dockerfile
platforms: linux/amd64
push: false
tags: ${{ steps.prep.outputs.tags }}

11
.gitignore vendored Normal file

@@ -0,0 +1,11 @@
# go-llama build artifacts
go-llama
go-gpt4all-j
# LocalAI build binary
LocalAI
local-ai
# Ignore models
models/*.bin
models/ggml-*


@@ -1,5 +1,5 @@
# Make sure to check the documentation at http://goreleaser.com
project_name: llama-cli
project_name: local-ai
builds:
- ldflags:
- -w -s

16
.vscode/launch.json vendored Normal file

@@ -0,0 +1,16 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Launch Go",
"type": "go",
"request": "launch",
"mode": "debug",
"program": "${workspaceFolder}/main.go",
"args": [
"api"
]
}
]
}

12
Dockerfile Normal file

@@ -0,0 +1,12 @@
ARG GO_VERSION=1.20
ARG DEBIAN_VERSION=11
FROM golang:$GO_VERSION as builder
WORKDIR /build
RUN apt-get update && apt-get install -y cmake
COPY . .
ARG BUILD_TYPE=
RUN make build${BUILD_TYPE}
FROM debian:$DEBIAN_VERSION
COPY --from=builder /build/local-ai /usr/bin/local-ai
ENTRYPOINT [ "/usr/bin/local-ai" ]


@@ -1,47 +1,5 @@
VERSION 0.7
go-deps:
ARG GO_VERSION=1.20
FROM golang:$GO_VERSION
WORKDIR /build
COPY go.mod ./
COPY go.sum ./
RUN go mod download
RUN apt-get update
SAVE ARTIFACT go.mod AS LOCAL go.mod
SAVE ARTIFACT go.sum AS LOCAL go.sum
model-image:
ARG MODEL_IMAGE=quay.io/go-skynet/models:ggml2-alpaca-7b-v0.2
FROM $MODEL_IMAGE
SAVE ARTIFACT /models/model.bin
build:
FROM +go-deps
WORKDIR /build
RUN git clone https://github.com/go-skynet/llama
RUN cd llama && make libllama.a
COPY . .
RUN C_INCLUDE_PATH=/build/llama LIBRARY_PATH=/build/llama go build -o llama-cli ./
SAVE ARTIFACT llama-cli AS LOCAL llama-cli
image:
FROM +go-deps
ARG IMAGE=alpaca-cli
COPY +model-image/model.bin /model.bin
COPY +build/llama-cli /llama-cli
ENV MODEL_PATH=/model.bin
ENTRYPOINT [ "/llama-cli" ]
SAVE IMAGE --push $IMAGE
lite-image:
FROM +go-deps
ARG IMAGE=alpaca-cli-nomodel
COPY +build/llama-cli /llama-cli
ENV MODEL_PATH=/model.bin
ENTRYPOINT [ "/llama-cli" ]
SAVE IMAGE --push $IMAGE-lite
image-all:
BUILD --platform=linux/amd64 --platform=linux/arm64 +image
BUILD --platform=linux/amd64 --platform=linux/arm64 +lite-image
FROM DOCKERFILE -f Dockerfile .
SAVE ARTIFACT /usr/bin/local-ai AS LOCAL local-ai

104
Makefile Normal file

@@ -0,0 +1,104 @@
GOCMD=go
GOTEST=$(GOCMD) test
GOVET=$(GOCMD) vet
BINARY_NAME=local-ai
GOLLAMA_VERSION?=llama.cpp-5ecff35
GOGPT4ALLJ_VERSION?=1f548782d80d48b9a0fac33aae6f129358787bc0
GOGPT2_VERSION?=1c24f5b86ac428cd5e81dae1f1427b1463bd2b06
GREEN := $(shell tput -Txterm setaf 2)
YELLOW := $(shell tput -Txterm setaf 3)
WHITE := $(shell tput -Txterm setaf 7)
CYAN := $(shell tput -Txterm setaf 6)
RESET := $(shell tput -Txterm sgr0)
.PHONY: all test build vendor
all: help
## Build:
build: prepare ## Build the project
C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2.cpp LIBRARY_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2.cpp $(GOCMD) build -o $(BINARY_NAME) ./
buildgeneric: prepare-generic ## Build the project
C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2.cpp LIBRARY_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2.cpp $(GOCMD) build -o $(BINARY_NAME) ./
## GPT4ALL-J
go-gpt4all-j:
git clone --recurse-submodules https://github.com/go-skynet/go-gpt4all-j.cpp go-gpt4all-j && cd go-gpt4all-j && git checkout -b build $(GOGPT4ALLJ_VERSION)
# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
@find ./go-gpt4all-j -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/void replace/void json_gptj_replace/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/::replace/::json_gptj_replace/g' {} +
go-gpt4all-j/libgptj.a: go-gpt4all-j
$(MAKE) -C go-gpt4all-j libgptj.a
go-gpt4all-j/libgptj.a-generic: go-gpt4all-j
$(MAKE) -C go-gpt4all-j generic-libgptj.a
# CEREBRAS GPT
go-gpt2.cpp:
git clone --recurse-submodules https://github.com/go-skynet/go-gpt2.cpp go-gpt2.cpp && cd go-gpt2.cpp && git checkout -b build $(GOGPT2_VERSION)
# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
@find ./go-gpt2.cpp -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gpt2_/g' {} +
@find ./go-gpt2.cpp -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gpt2_/g' {} +
go-gpt2.cpp/libgpt2.a: go-gpt2.cpp
$(MAKE) -C go-gpt2.cpp libgpt2.a
go-gpt2.cpp/libgpt2.a-generic: go-gpt2.cpp
$(MAKE) -C go-gpt2.cpp generic-libgpt2.a
go-llama:
git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
$(MAKE) -C go-llama libbinding.a
go-llama-generic:
git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
$(MAKE) -C go-llama generic-libbinding.a
replace:
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt2.cpp=$(shell pwd)/go-gpt2.cpp
prepare: go-llama go-gpt4all-j/libgptj.a go-gpt2.cpp/libgpt2.a replace
prepare-generic: go-llama-generic go-gpt4all-j/libgptj.a-generic go-gpt2.cpp/libgpt2.a-generic replace
clean: ## Remove build related file
rm -fr ./go-llama
rm -rf ./go-gpt4all-j
rm -rf ./go-gpt2.cpp
rm -rf $(BINARY_NAME)
## Run:
run: prepare
$(GOCMD) run ./ api
## Test:
test: ## Run the tests of the project
$(GOTEST) -v -race ./... $(OUTPUT_OPTIONS)
## Help:
help: ## Show this help.
@echo ''
@echo 'Usage:'
@echo ' ${YELLOW}make${RESET} ${GREEN}<target>${RESET}'
@echo ''
@echo 'Targets:'
@awk 'BEGIN {FS = ":.*?## "} { \
if (/^[a-zA-Z_-]+:.*?##.*$$/) {printf " ${YELLOW}%-20s${GREEN}%s${RESET}\n", $$1, $$2} \
else if (/^## .*$$/) {printf " ${CYAN}%s${RESET}\n", substr($$1,4)} \
}' $(MAKEFILE_LIST)

274
README.md

@@ -1,60 +1,97 @@
## :camel: llama-cli
<h1 align="center">
<br>
<img height="300" src="https://user-images.githubusercontent.com/2420543/233147843-88697415-6dbf-4368-a862-ab217f9f7342.jpeg"> <br>
LocalAI
<br>
</h1>
> :warning: This project has been renamed from `llama-cli` to `LocalAI` to reflect the fact that we are focusing on a fast drop-in OpenAI API rather than on the CLI interface. There are already many projects that can be used as a CLI interface, for instance [llama.cpp](https://github.com/ggerganov/llama.cpp) and [gpt4all](https://github.com/nomic-ai/gpt4all). If you were using `llama-cli` for CLI interactions and want to keep using it, use older versions or please open up an issue - contributions are welcome!
llama-cli is a straightforward golang CLI interface for [llama.cpp](https://github.com/ggerganov/llama.cpp), providing a simple API and a command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
LocalAI is a straightforward, drop-in replacement API compatible with OpenAI for local CPU inferencing, based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all) and [ggml](https://github.com/ggerganov/ggml), including support for GPT4ALL-J, which is Apache 2.0 licensed and can be used for commercial purposes.
## Container images
- OpenAI compatible API
- Supports multiple-models
- Once loaded the first time, it keeps models loaded in memory for faster inference
- Support for prompt templates
- Doesn't shell out, but uses C bindings for faster inference and better performance. Uses [go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) and [go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp).
The `llama-cli` [container images](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest) come preloaded with the [alpaca.cpp 7B](https://github.com/antimatter15/alpaca.cpp) model, enabling you to start making predictions immediately! To begin, run:
## Model compatibility
```
docker run -ti --rm quay.io/go-skynet/llama-cli:v0.2 --instruction "What's an alpaca?" --topk 10000
It is compatible with the models supported by [llama.cpp](https://github.com/ggerganov/llama.cpp), and also supports [GPT4ALL-J](https://github.com/nomic-ai/gpt4all) and [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml).
Tested with:
- Vicuna
- Alpaca
- [GPT4ALL](https://github.com/nomic-ai/gpt4all)
- [GPT4ALL-J](https://gpt4all.io/models/ggml-gpt4all-j.bin)
- Koala
- [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml)
It should also be compatible with StableLM and GPTNeoX ggml models (untested)
Note: You might need to convert older models to the new format; see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for how to convert `gpt4all` models, for instance.
## Usage
> `LocalAI` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).
The easiest way to run LocalAI is by using `docker-compose`:
```bash
git clone https://github.com/go-skynet/LocalAI
cd LocalAI
# copy your models to models/
cp your-model.bin models/
# (optional) Edit the .env file to set things like context size and threads
# vim .env
# start with docker-compose
docker compose up -d --build
# Now API is accessible at localhost:8080
curl http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "your-model.bin",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
```
You will receive a response like the following:
## Prompt templates
The API doesn't inject a default prompt for talking to the model. You have to use a prompt similar to what's described in the stanford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.
<details>
You can use a default template for every model present in your model path by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl`, which will be used as the default prompt. For example, this template can be used with alpaca:
```
An alpaca is a member of the South American Camelid family, which includes the llama, guanaco and vicuña. It is a domesticated species that originates from the Andes mountain range in South America. Alpacas are used in the textile industry for their fleece, which is much softer than wool. Alpacas are also used for meat, milk, and fiber.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response:
```
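For example, pairing a model with its default template could look like this (a quick sketch, reusing the `foo.bin` name and the alpaca template above):
```
cp foo.bin models/
cat > models/foo.bin.tmpl <<'EOF'
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response:
EOF
```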
## Basic usage
See the [prompt-templates](https://github.com/go-skynet/LocalAI/tree/master/prompt-templates) directory in this repository for templates for most popular models.
To use llama-cli, specify a pre-trained GPT-based model, an input text, and an instruction for text generation. llama-cli takes the following arguments when running from the CLI:
</details>
```
llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--template <template_path>] [--tokens <num_tokens>] [--threads <num_threads>] [--temperature <temperature>] [--topp <top_p>] [--topk <top_k>]
```
## API
| Parameter | Environment Variable | Default Value | Description |
| ------------ | -------------------- | ------------- | -------------------------------------- |
| template | TEMPLATE | | A file containing a template for output formatting (optional). |
| instruction | INSTRUCTION | | Input prompt text or instruction. "-" for STDIN. |
| input | INPUT | - | Path to text or "-" for STDIN. |
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
| tokens | TOKENS | 128 | The maximum number of tokens to generate. |
| threads | THREADS | NumCPU() | The number of threads to use for text generation. |
| temperature | TEMPERATURE | 0.95 | Sampling temperature for model output. ( values between `0.1` and `1.0` ) |
| top_p | TOP_P | 0.85 | The cumulative probability for top-p sampling. |
| top_k | TOP_K | 20 | The number of top-k tokens to consider for text generation. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
| alpaca | ALPACA | true | Set to true for alpaca models. |
Here's an example of using `llama-cli`:
```
llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
```
This will generate text based on the given model and instruction.
## Advanced usage
`llama-cli` also provides an API for running text generation as a service.
`LocalAI` provides an API for running text generation as a service that follows the OpenAI reference and can be used as a drop-in replacement. Once loaded the first time, models are kept in memory.
<details>
Example of starting the API with `docker`:
```bash
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.2 api
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:latest --models-path /path/to/models --context-size 700 --threads 4
```
And you'll see:
@@ -72,101 +109,118 @@ And you'll see:
You can control the API server options with command line arguments:
```
llama-cli api --model <model_path> [--address <address>] [--threads <num_threads>]
local-ai --models-path <models_path> [--address <address>] [--threads <num_threads>]
```
The API takes the following:
The API takes the following parameters:
| Parameter | Environment Variable | Default Value | Description |
| ------------ | -------------------- | ------------- | -------------------------------------- |
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
| models-path | MODELS_PATH | | The path where you have models (ending with `.bin`). |
| threads | THREADS | Number of Physical cores | The number of threads to use for text generation. |
| address | ADDRESS | :8080 | The address and port to listen on. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
| alpaca | ALPACA | true | Set to true for alpaca models. |
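The same options can also be set through the corresponding environment variables from the table above; a rough equivalent of the `docker run` example (a sketch, assuming your models live in `./models`):
```
docker run -p 8080:8080 -ti --rm \
  -e MODELS_PATH=/models -e CONTEXT_SIZE=700 -e THREADS=4 \
  -v $PWD/models:/models \
  quay.io/go-skynet/local-ai:latest
```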
Once the server is running, you can start making requests to it using HTTP, using the OpenAI API.
Once the server is running, you can make requests to it using HTTP. For example, to generate text based on an instruction, you can send a POST request to the `/predict` endpoint with the instruction as the request body:
</details>
### Supported OpenAI API endpoints
You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create).
The supported endpoints and parameters are listed below.
#### Chat completions
For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
```
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
"text": "What is an alpaca?",
"topP": 0.8,
"topK": 50,
"temperature": 0.7,
"tokens": 100
}'
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'
```
Note: The API doesn't inject a template for talking to the instance, while the CLI does. You have to use a prompt similar to what's described in the stanford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release, for instance:
Available additional parameters: `top_p`, `top_k`, `max_tokens`
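The additional parameters go in the same request body; for instance (a sketch reusing the model from the example above):
```
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
     "model": "ggml-koala-7b-model-q4_0-r2.bin",
     "messages": [{"role": "user", "content": "Say this is a test!"}],
     "temperature": 0.7,
     "top_p": 0.8,
     "top_k": 40,
     "max_tokens": 64
   }'
```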
#### Completions
For example, to generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
```
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
```
Available additional parameters: `top_p`, `top_k`, `max_tokens`
#### List models
You can list all the models available with:
```
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
curl http://localhost:8080/v1/models
```
## Using other models
You can use the lite images (for example `quay.io/go-skynet/llama-cli:v0.2-lite`) that don't ship any model, and specify a model binary to be used for inference with `--model`.
gpt4all (https://github.com/nomic-ai/gpt4all) works as well; however, the original model needs to be converted (the same applies to old alpaca models, too):
13B and 30B models are known to work:
### 13B
```
# Download the model image, extract the model
docker run --name model --entrypoint /models quay.io/go-skynet/models:ggml2-alpaca-13b-v0.2
docker cp model:/models/model.bin ./
# Use the model with llama-cli
docker run -v $PWD:/models -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.2-lite api --model /models/model.bin
```bash
wget -O tokenizer.model https://huggingface.co/decapoda-research/llama-30b-hf/resolve/main/tokenizer.model
mkdir models
cp gpt4all.. models/
git clone https://gist.github.com/eiz/828bddec6162a023114ce19146cb2b82
pip install sentencepiece
python 828bddec6162a023114ce19146cb2b82/gistfile1.txt models tokenizer.model
# There will be a new model with the ".tmp" extension, you have to use that one!
```
### 30B
### Windows compatibility
```
# Download the model image, extract the model
docker run --name model --entrypoint /models quay.io/go-skynet/models:ggml2-alpaca-30b-v0.2
docker cp model:/models/model.bin ./
# Use the model with llama-cli
docker run -v $PWD:/models -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.2-lite api --model /models/model.bin
```
### Golang client API
The `llama-cli` codebase also has a small client in Go that can be used alongside the API:
```golang
package main
import (
"fmt"
client "github.com/go-skynet/llama-cli/client"
)
func main() {
cli := client.NewClient("http://ip:30007")
out, err := cli.Predict("What's an alpaca?")
if err != nil {
panic(err)
}
fmt.Println(out)
}
```
It should work; however, you need to make sure you give the container enough resources. See https://github.com/go-skynet/LocalAI/issues/2
### Kubernetes
You can run the API directly in Kubernetes:
You can run the API in Kubernetes; see an example deployment in [kubernetes](https://github.com/go-skynet/LocalAI/tree/master/kubernetes).
```bash
kubectl apply -f https://raw.githubusercontent.com/go-skynet/llama-cli/master/kubernetes/deployment.yaml
```
### Build locally
Pre-built images should fit most modern hardware; however, you can, and in some cases need to, build the images manually.
In order to build the `LocalAI` container image locally you can use `docker`:
```
# build the image
docker build -t local-ai .
docker run local-ai
```
Or build the binary with `make`:
```
make build
```
## Short-term roadmap
- [x] Mimic OpenAI API (https://github.com/go-skynet/LocalAI/issues/10)
- [ ] Binary releases (https://github.com/go-skynet/LocalAI/issues/6)
- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
- [x] Multi-model support
- [ ] Have a webUI!
## License
MIT
## Acknowledgements
- [llama.cpp](https://github.com/ggerganov/llama.cpp)
- https://github.com/tatsu-lab/stanford_alpaca
- https://github.com/cornelk/llama-go for the initial ideas
- https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!)

78
api.go

@@ -1,78 +0,0 @@
package main
import (
"strconv"
llama "github.com/go-skynet/llama/go"
"github.com/gofiber/fiber/v2"
)
func api(l *llama.LLama, listenAddr string, threads int) error {
app := fiber.New()
/*
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
"text": "What is an alpaca?",
"topP": 0.8,
"topK": 50,
"temperature": 0.7,
"tokens": 100
}'
*/
// Endpoint to generate the prediction
app.Post("/predict", func(c *fiber.Ctx) error {
// Get input data from the request body
input := new(struct {
Text string `json:"text"`
})
if err := c.BodyParser(input); err != nil {
return err
}
// Set the parameters for the language model prediction
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
if err != nil {
return err
}
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
if err != nil {
return err
}
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
if err != nil {
return err
}
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
if err != nil {
return err
}
// Generate the prediction using the language model
prediction, err := l.Predict(
input.Text,
llama.SetTemperature(temperature),
llama.SetTopP(topP),
llama.SetTopK(topK),
llama.SetTokens(tokens),
llama.SetThreads(threads),
)
if err != nil {
return err
}
// Return the prediction in the response body
return c.JSON(struct {
Prediction string `json:"prediction"`
}{
Prediction: prediction,
})
})
// Start the server
app.Listen(":8080")
return nil
}

386
api/api.go Normal file

@@ -0,0 +1,386 @@
package api
import (
"encoding/json"
"errors"
"fmt"
"strings"
"sync"
model "github.com/go-skynet/LocalAI/pkg/model"
gpt2 "github.com/go-skynet/go-gpt2.cpp"
gptj "github.com/go-skynet/go-gpt4all-j.cpp"
llama "github.com/go-skynet/go-llama.cpp"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
"github.com/gofiber/fiber/v2/middleware/recover"
"github.com/rs/zerolog/log"
)
type OpenAIResponse struct {
Created int `json:"created,omitempty"`
Object string `json:"chat.completion,omitempty"`
ID string `json:"id,omitempty"`
Model string `json:"model,omitempty"`
Choices []Choice `json:"choices,omitempty"`
}
type Choice struct {
Index int `json:"index,omitempty"`
FinishReason string `json:"finish_reason,omitempty"`
Message *Message `json:"message,omitempty"`
Text string `json:"text,omitempty"`
}
type Message struct {
Role string `json:"role,omitempty"`
Content string `json:"content,omitempty"`
}
type OpenAIModel struct {
ID string `json:"id"`
Object string `json:"object"`
}
type OpenAIRequest struct {
Model string `json:"model"`
// Prompt is read only by completion API calls
Prompt string `json:"prompt"`
// Messages is read only by chat/completion API calls
Messages []Message `json:"messages"`
Echo bool `json:"echo"`
// Common options between all the API calls
TopP float64 `json:"top_p"`
TopK int `json:"top_k"`
Temperature float64 `json:"temperature"`
Maxtokens int `json:"max_tokens"`
N int `json:"n"`
// Custom parameters - not present in the OpenAI API
Batch int `json:"batch"`
F16 bool `json:"f16kv"`
IgnoreEOS bool `json:"ignore_eos"`
Seed int `json:"seed"`
}
// https://platform.openai.com/docs/api-reference/completions
func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16 bool, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
var err error
var model *llama.LLama
var gptModel *gptj.GPTJ
var gpt2Model *gpt2.GPT2
var stableLMModel *gpt2.StableLM
input := new(OpenAIRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
modelFile := input.Model
received, _ := json.Marshal(input)
log.Debug().Msgf("Request received: %s", string(received))
// Set model from bearer token, if available
bearer := strings.TrimLeft(c.Get("authorization"), "Bearer ")
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
if modelFile == "" && !bearerExists {
return fmt.Errorf("no model specified")
}
if bearerExists { // model specified in bearer token takes precedence
log.Debug().Msgf("Using model from bearer token: %s", bearer)
modelFile = bearer
}
// Try to load the model with both
var llamaerr, gpt2err, gptjerr, stableerr error
llamaOpts := []llama.ModelOption{}
if ctx != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(ctx))
}
if f16 {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
// TODO: this is ugly, better identifying the model somehow! however, it is a good stab for a first implementation..
model, llamaerr = loader.LoadLLaMAModel(modelFile, llamaOpts...)
if llamaerr != nil {
gptModel, gptjerr = loader.LoadGPTJModel(modelFile)
if gptjerr != nil {
gpt2Model, gpt2err = loader.LoadGPT2Model(modelFile)
if gpt2err != nil {
stableLMModel, stableerr = loader.LoadStableLMModel(modelFile)
if stableerr != nil {
return fmt.Errorf("llama: %s gpt: %s gpt2: %s stableLM: %s", llamaerr.Error(), gptjerr.Error(), gpt2err.Error(), stableerr.Error()) // llama failed first, so we want to catch both errors
}
}
}
}
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
mutexMap.Lock()
l, ok := mutexes[modelFile]
if !ok {
m := &sync.Mutex{}
mutexes[modelFile] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
// Set the parameters for the language model prediction
topP := input.TopP
if topP == 0 {
topP = 0.7
}
topK := input.TopK
if topK == 0 {
topK = 80
}
temperature := input.Temperature
if temperature == 0 {
temperature = 0.9
}
tokens := input.Maxtokens
if tokens == 0 {
tokens = 512
}
predInput := input.Prompt
if chat {
mess := []string{}
// TODO: encode roles
for _, i := range input.Messages {
mess = append(mess, i.Content)
}
predInput = strings.Join(mess, "\n")
}
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
templatedInput, err := loader.TemplatePrefix(modelFile, struct {
Input string
}{Input: predInput})
if err == nil {
predInput = templatedInput
log.Debug().Msgf("Template found, input modified to: %s", predInput)
}
result := []Choice{}
n := input.N
if input.N == 0 {
n = 1
}
var predFunc func() (string, error)
switch {
case stableLMModel != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
predictOptions := []gpt2.PredictOption{
gpt2.SetTemperature(temperature),
gpt2.SetTopP(topP),
gpt2.SetTopK(topK),
gpt2.SetTokens(tokens),
gpt2.SetThreads(threads),
}
if input.Batch != 0 {
predictOptions = append(predictOptions, gpt2.SetBatch(input.Batch))
}
if input.Seed != 0 {
predictOptions = append(predictOptions, gpt2.SetSeed(input.Seed))
}
return stableLMModel.Predict(
predInput,
predictOptions...,
)
}
case gpt2Model != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
predictOptions := []gpt2.PredictOption{
gpt2.SetTemperature(temperature),
gpt2.SetTopP(topP),
gpt2.SetTopK(topK),
gpt2.SetTokens(tokens),
gpt2.SetThreads(threads),
}
if input.Batch != 0 {
predictOptions = append(predictOptions, gpt2.SetBatch(input.Batch))
}
if input.Seed != 0 {
predictOptions = append(predictOptions, gpt2.SetSeed(input.Seed))
}
return gpt2Model.Predict(
predInput,
predictOptions...,
)
}
case gptModel != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
predictOptions := []gptj.PredictOption{
gptj.SetTemperature(temperature),
gptj.SetTopP(topP),
gptj.SetTopK(topK),
gptj.SetTokens(tokens),
gptj.SetThreads(threads),
}
if input.Batch != 0 {
predictOptions = append(predictOptions, gptj.SetBatch(input.Batch))
}
if input.Seed != 0 {
predictOptions = append(predictOptions, gptj.SetSeed(input.Seed))
}
return gptModel.Predict(
predInput,
predictOptions...,
)
}
case model != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
predictOptions := []llama.PredictOption{
llama.SetTemperature(temperature),
llama.SetTopP(topP),
llama.SetTopK(topK),
llama.SetTokens(tokens),
llama.SetThreads(threads),
}
if input.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(input.Batch))
}
if input.F16 {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if input.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if input.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(input.Seed))
}
return model.Predict(
predInput,
predictOptions...,
)
}
}
for i := 0; i < n; i++ {
prediction, err := predFunc()
if err != nil {
return err
}
if input.Echo {
prediction = predInput + prediction
}
if chat {
result = append(result, Choice{Message: &Message{Role: "assistant", Content: prediction}})
} else {
result = append(result, Choice{Text: prediction})
}
}
jsonResult, _ := json.Marshal(result)
log.Debug().Msgf("Response: %s", jsonResult)
// Return the prediction in the response body
return c.JSON(OpenAIResponse{
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
})
}
}
func listModels(loader *model.ModelLoader) func(ctx *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
models, err := loader.ListModels()
if err != nil {
return err
}
dataModels := []OpenAIModel{}
for _, m := range models {
dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
}
return c.JSON(struct {
Object string `json:"object"`
Data []OpenAIModel `json:"data"`
}{
Object: "list",
Data: dataModels,
})
}
}
func Start(loader *model.ModelLoader, listenAddr string, threads, ctxSize int, f16 bool) error {
// Return errors as JSON responses
app := fiber.New(fiber.Config{
// Override default error handler
ErrorHandler: func(ctx *fiber.Ctx, err error) error {
// Status code defaults to 500
code := fiber.StatusInternalServerError
// Retrieve the custom status code if it's a *fiber.Error
var e *fiber.Error
if errors.As(err, &e) {
code = e.Code
}
// Send custom error page
return ctx.Status(code).JSON(struct {
Error string `json:"error"`
}{Error: err.Error()})
},
})
// Default middleware config
app.Use(recover.New())
app.Use(cors.New())
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
mu := map[string]*sync.Mutex{}
var mumutex = &sync.Mutex{}
// openAI compatible API endpoint
app.Post("/v1/chat/completions", openAIEndpoint(true, loader, threads, ctxSize, f16, mumutex, mu))
app.Post("/chat/completions", openAIEndpoint(true, loader, threads, ctxSize, f16, mumutex, mu))
app.Post("/v1/completions", openAIEndpoint(false, loader, threads, ctxSize, f16, mumutex, mu))
app.Post("/completions", openAIEndpoint(false, loader, threads, ctxSize, f16, mumutex, mu))
app.Get("/v1/models", listModels(loader))
app.Get("/models", listModels(loader))
// Start the server
app.Listen(listenAddr)
return nil
}
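Note that, as implemented above, the model can also be selected via the `Authorization` bearer token instead of the request body; a rough sketch of such a request (the model file name is taken from the README examples and must exist in the models path):
```
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer ggml-koala-7b-model-q4_0-r2.bin" \
  -d '{
    "prompt": "A long time ago in a galaxy far, far away",
    "temperature": 0.7
  }'
```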


@@ -1,75 +0,0 @@
package client
import (
"bytes"
"encoding/json"
"fmt"
"net/http"
)
type Prediction struct {
Prediction string `json:"prediction"`
}
type Client struct {
baseURL string
client *http.Client
endpoint string
}
func NewClient(baseURL string) *Client {
return &Client{
baseURL: baseURL,
client: &http.Client{},
endpoint: "/predict",
}
}
type InputData struct {
Text string `json:"text"`
TopP float64 `json:"topP,omitempty"`
TopK int `json:"topK,omitempty"`
Temperature float64 `json:"temperature,omitempty"`
Tokens int `json:"tokens,omitempty"`
}
func (c *Client) Predict(text string, opts ...InputOption) (string, error) {
input := NewInputData(opts...)
input.Text = text
// encode input data to JSON format
inputBytes, err := json.Marshal(input)
if err != nil {
return "", err
}
// create HTTP request
url := c.baseURL + c.endpoint
req, err := http.NewRequest("POST", url, bytes.NewBuffer(inputBytes))
if err != nil {
return "", err
}
// set request headers
req.Header.Set("Content-Type", "application/json")
// send request and get response
resp, err := c.client.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("request failed with status %d", resp.StatusCode)
}
// decode response body to Prediction struct
var prediction Prediction
err = json.NewDecoder(resp.Body).Decode(&prediction)
if err != nil {
return "", err
}
return prediction.Prediction, nil
}


@@ -1,51 +0,0 @@
package client
import "net/http"
type ClientOption func(c *Client)
func WithHTTPClient(httpClient *http.Client) ClientOption {
return func(c *Client) {
c.client = httpClient
}
}
func WithEndpoint(endpoint string) ClientOption {
return func(c *Client) {
c.endpoint = endpoint
}
}
type InputOption func(d *InputData)
func NewInputData(opts ...InputOption) *InputData {
data := &InputData{}
for _, opt := range opts {
opt(data)
}
return data
}
func WithTopP(topP float64) InputOption {
return func(d *InputData) {
d.TopP = topP
}
}
func WithTopK(topK int) InputOption {
return func(d *InputData) {
d.TopK = topK
}
}
func WithTemperature(temperature float64) InputOption {
return func(d *InputData) {
d.Temperature = temperature
}
}
func WithTokens(tokens int) InputOption {
return func(d *InputData) {
d.Tokens = tokens
}
}

19
docker-compose.yaml Normal file

@@ -0,0 +1,19 @@
version: '3.6'
services:
api:
image: quay.io/go-skynet/local-ai:latest
build:
context: .
dockerfile: Dockerfile
# args:
# BUILD_TYPE: generic # Uncomment to build CPU generic code that works on most HW
ports:
- 8080:8080
environment:
- MODELS_PATH=$MODELS_PATH
- CONTEXT_SIZE=$CONTEXT_SIZE
- THREADS=$THREADS
- DEBUG=$DEBUG
volumes:
- ./models:/models:cached

32
go.mod

@@ -1,34 +1,33 @@
module github.com/go-skynet/llama-cli
module github.com/go-skynet/LocalAI
go 1.19
require (
github.com/charmbracelet/bubbles v0.15.0
github.com/charmbracelet/bubbletea v0.23.2
github.com/charmbracelet/lipgloss v0.7.1
github.com/go-skynet/llama v0.0.0-20230321172246-7be5326e18cc
github.com/go-skynet/go-gpt2.cpp v0.0.0-20230420213900-1c24f5b86ac4
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640
github.com/gofiber/fiber/v2 v2.42.0
github.com/jaypipes/ghw v0.10.0
github.com/rs/zerolog v1.29.1
github.com/urfave/cli/v2 v2.25.0
)
require (
github.com/StackExchange/wmi v1.2.1 // indirect
github.com/andybalholm/brotli v1.0.4 // indirect
github.com/atotto/clipboard v0.1.4 // indirect
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
github.com/containerd/console v1.0.3 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/ghodss/yaml v1.0.0 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/jaypipes/pcidb v1.0.0 // indirect
github.com/klauspost/compress v1.15.9 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.17 // indirect
github.com/mattn/go-localereader v0.0.1 // indirect
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b // indirect
github.com/muesli/cancelreader v0.2.2 // indirect
github.com/muesli/reflow v0.3.0 // indirect
github.com/muesli/termenv v0.15.1 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/philhofer/fwd v1.1.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 // indirect
@@ -38,8 +37,7 @@ require (
github.com/valyala/fasthttp v1.44.0 // indirect
github.com/valyala/tcplisten v1.0.0 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
golang.org/x/sync v0.1.0 // indirect
golang.org/x/sys v0.6.0 // indirect
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
golang.org/x/text v0.3.7 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
howett.net/plist v1.0.0 // indirect
)

98
go.sum

@@ -1,68 +1,65 @@
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8=
github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4=
github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI=
github.com/aymanbagabas/go-osc52 v1.0.3/go.mod h1:zT8H+Rk4VSabYN90pWyugflM3ZhpTZNC7cASDfUCdT4=
github.com/aymanbagabas/go-osc52 v1.2.1/go.mod h1:zT8H+Rk4VSabYN90pWyugflM3ZhpTZNC7cASDfUCdT4=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/charmbracelet/bubbles v0.15.0 h1:c5vZ3woHV5W2b8YZI1q7v4ZNQaPetfHuoHzx+56Z6TI=
github.com/charmbracelet/bubbles v0.15.0/go.mod h1:Y7gSFbBzlMpUDR/XM9MhZI374Q+1p1kluf1uLl8iK74=
github.com/charmbracelet/bubbletea v0.23.1/go.mod h1:JAfGK/3/pPKHTnAS8JIE2u9f61BjWTQY57RbT25aMXU=
github.com/charmbracelet/bubbletea v0.23.2 h1:vuUJ9HJ7b/COy4I30e8xDVQ+VRDUEFykIjryPfgsdps=
github.com/charmbracelet/bubbletea v0.23.2/go.mod h1:FaP3WUivcTM0xOKNmhciz60M6I+weYLF76mr1JyI7sM=
github.com/charmbracelet/harmonica v0.2.0/go.mod h1:KSri/1RMQOZLbw7AHqgcBycp8pgJnQMYYT8QZRqZ1Ao=
github.com/charmbracelet/lipgloss v0.6.0/go.mod h1:tHh2wr34xcHjC2HCXIlGSG1jaDF0S0atAUvBMP6Ppuk=
github.com/charmbracelet/lipgloss v0.7.1 h1:17WMwi7N1b1rVWOjMT+rCh7sQkvDU75B2hbZpc5Kc1E=
github.com/charmbracelet/lipgloss v0.7.1/go.mod h1:yG0k3giv8Qj8edTCbbg6AlQ5e8KNWpFujkNawKNhE2c=
github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/go-skynet/llama v0.0.0-20230321172246-7be5326e18cc h1:NcmO8mA7iRZIX0Qy2SjcsSaV14+g87MiTey1neUJaFQ=
github.com/go-skynet/llama v0.0.0-20230321172246-7be5326e18cc/go.mod h1:ZtYsAIud4cvP9VTTI9uhdgR1uCwaO/gGKnZZ95h9i7w=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-skynet/go-gpt2.cpp v0.0.0-20230420213900-1c24f5b86ac4 h1:GkGuqnhDFKlCsT6Bo8sdY00A7rFXCzfU1nBOSS4ZnYM=
github.com/go-skynet/go-gpt2.cpp v0.0.0-20230420213900-1c24f5b86ac4/go.mod h1:1Wj/xbkMfwQSOrhNYK178IzqQHstZbRfhx4s8p1M5VM=
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94 h1:rtrrMvlIq+g0/ltXjDdLeNtz0uc4wJ4Qs15GFU4ba4c=
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94/go.mod h1:5VZ9XbcINI0XcHhkcX8GPK8TplFGAzu1Hrg4tNiMCtI=
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640 h1:8SSVbQ3yvq7JnfLCLF4USV0PkQnnduUkaNCv/hHDa3E=
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gofiber/fiber/v2 v2.42.0 h1:Fnp7ybWvS+sjNQsFvkhf4G8OhXswvB6Vee8hM/LyS+8=
github.com/gofiber/fiber/v2 v2.42.0/go.mod h1:3+SGNjqMh5VQH5Vz2Wdi43zTIV16ktlFd3x3R6O1Zlc=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/jaypipes/ghw v0.10.0 h1:UHu9UX08Py315iPojADFPOkmjTsNzHj4g4adsNKKteY=
github.com/jaypipes/ghw v0.10.0/go.mod h1:jeJGbkRB2lL3/gxYzNYzEDETV1ZJ56OKr+CSeSEym+g=
github.com/jaypipes/pcidb v1.0.0 h1:vtZIfkiCUE42oYbJS0TAq9XSfSmcsgo9IdxSm9qzYU8=
github.com/jaypipes/pcidb v1.0.0/go.mod h1:TnYUvqhPBzCKnH34KrIX22kAeEbDCSRJ9cqLRCuNDfk=
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng=
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b h1:1XF24mVaiu7u+CFywTdcDo2ie1pzzhwjt6RHqzpMU34=
github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b/go.mod h1:fQuZ0gauxyBcmsdE3ZT4NasjaRdxmbCS0jRHsrWu3Ho=
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
github.com/muesli/reflow v0.2.1-0.20210115123740-9e1d0d53df68/go.mod h1:Xk+z4oIWdQqJzsxyjgl3P22oYZnHdZ8FFTHAQQt5BMQ=
github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s=
github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8=
github.com/muesli/termenv v0.11.1-0.20220204035834-5ac8409525e0/go.mod h1:Bd5NYQ7pd+SrtBSrSNoBBmXlcY8+Xj4BMJgh8qcZrvs=
github.com/muesli/termenv v0.13.0/go.mod h1:sP1+uffeLaEYpyOTb8pLCUctGcGLnoFjSn4YJK5e2bc=
github.com/muesli/termenv v0.14.0/go.mod h1:kG/pF1E7fh949Xhe156crRUrHNyK221IuGO7Ez60Uc8=
github.com/muesli/termenv v0.15.1 h1:UzuTb/+hhlBugQz28rpzey4ZuKcZ03MeKsoG7IJZIxs=
github.com/muesli/termenv v0.15.1/go.mod h1:HeAQPTzpfs016yGtA4g00CsdYnVLJvxsS4ANqrZs2sQ=
github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ=
github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/sahilm/fuzzy v0.1.0/go.mod h1:VFvziUEIMCrT6A6tw2RFIXPXXmzXbOsSHF0DOI8ZK9Y=
github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 h1:rmMl4fXJhKMNWl+K+r/fq4FbbKI+Ia2m9hYBLm2h4G4=
github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94/go.mod h1:90zrgN3D/WJsDd1iXHT96alCoN2KJo6/4x1DZC3wZs8=
github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d h1:Q+gqLBOPkFGHyCJxXMRqtUgUbTjI8/Ze8vu8GGyNFwo=
@@ -90,34 +87,41 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM=
howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g=

View File

@@ -1,142 +0,0 @@
package main
// A simple program demonstrating the text area component from the Bubbles
// component library.
import (
"fmt"
"strings"
"github.com/charmbracelet/bubbles/textarea"
"github.com/charmbracelet/bubbles/viewport"
tea "github.com/charmbracelet/bubbletea"
"github.com/charmbracelet/lipgloss"
llama "github.com/go-skynet/llama/go"
)
func startInteractive(l *llama.LLama, opts ...llama.PredictOption) error {
p := tea.NewProgram(initialModel(l, opts...))
_, err := p.Run()
return err
}
type (
errMsg error
)
type model struct {
viewport viewport.Model
messages *[]string
textarea textarea.Model
senderStyle lipgloss.Style
err error
l *llama.LLama
opts []llama.PredictOption
predictC chan string
}
func initialModel(l *llama.LLama, opts ...llama.PredictOption) model {
ta := textarea.New()
ta.Placeholder = "Send a message..."
ta.Focus()
ta.Prompt = "┃ "
ta.CharLimit = 280
ta.SetWidth(200)
ta.SetHeight(3)
// Remove cursor line styling
ta.FocusedStyle.CursorLine = lipgloss.NewStyle()
ta.ShowLineNumbers = false
vp := viewport.New(200, 5)
vp.SetContent(`Welcome to llama-cli. Type a message and press Enter to send. Alpaca doesn't keep context of the whole chat (yet).`)
ta.KeyMap.InsertNewline.SetEnabled(false)
predictChannel := make(chan string)
messages := []string{}
m := model{
textarea: ta,
messages: &messages,
viewport: vp,
senderStyle: lipgloss.NewStyle().Foreground(lipgloss.Color("5")),
err: nil,
l: l,
opts: opts,
predictC: predictChannel,
}
go func() {
for p := range predictChannel {
str, _ := templateString(emptyInput, struct {
Instruction string
Input string
}{Instruction: p})
res, _ := l.Predict(
str,
opts...,
)
mm := *m.messages
*m.messages = mm[:len(mm)-1]
*m.messages = append(*m.messages, m.senderStyle.Render("llama: ")+res)
m.viewport.SetContent(strings.Join(*m.messages, "\n"))
ta.Reset()
m.viewport.GotoBottom()
}
}()
return m
}
func (m model) Init() tea.Cmd {
return textarea.Blink
}
func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
var (
tiCmd tea.Cmd
vpCmd tea.Cmd
)
m.textarea, tiCmd = m.textarea.Update(msg)
m.viewport, vpCmd = m.viewport.Update(msg)
switch msg := msg.(type) {
case tea.WindowSizeMsg:
// m.viewport.Width = msg.Width
// m.viewport.Height = msg.Height
case tea.KeyMsg:
switch msg.Type {
case tea.KeyCtrlC, tea.KeyEsc:
fmt.Println(m.textarea.Value())
return m, tea.Quit
case tea.KeyEnter:
*m.messages = append(*m.messages, m.senderStyle.Render("You: ")+m.textarea.Value(), m.senderStyle.Render("Loading response..."))
m.predictC <- m.textarea.Value()
m.viewport.SetContent(strings.Join(*m.messages, "\n"))
m.textarea.Reset()
m.viewport.GotoBottom()
}
// We handle errors just like any other message
case errMsg:
m.err = msg
return m, nil
}
return m, tea.Batch(tiCmd, vpCmd)
}
func (m model) View() string {
return fmt.Sprintf(
"%s\n\n%s",
m.viewport.View(),
m.textarea.View(),
) + "\n\n"
}

View File

@@ -0,0 +1,28 @@
# Create a PVC containing a model binary, sourced from an arbitrary HTTP server
# (requires https://github.com/kubevirt/containerized-data-importer)
apiVersion: cdi.kubevirt.io/v1beta1
kind: DataVolume
metadata:
  name: models
  namespace: local-ai
spec:
  contentType: archive
  source:
    http:
      url: http://<model_server>/koala-7B-4bit-128g.GGML.tar
      secretRef: model-secret
  pvc:
    accessModes:
      - ReadWriteOnce
    resources:
      requests:
        storage: 5Gi
---
apiVersion: v1
kind: Secret
metadata:
  name: model-secret
  namespace: local-ai
data:
  accessKeyId: <model_server_username_base64_encoded>
  secretKey: <model_server_password_base64_encoded>
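The Secret's data values must be base64-encoded. A minimal sketch of producing them in Go, assuming hypothetical model-server credentials (substitute the real ones before applying the manifest):

package main

import (
	"encoding/base64"
	"fmt"
)

func main() {
	// Hypothetical model-server credentials; replace with the real values.
	fmt.Println(base64.StdEncoding.EncodeToString([]byte("model-server-user")))
	fmt.Println(base64.StdEncoding.EncodeToString([]byte("model-server-password")))
}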

View File

@@ -1,40 +1,55 @@
apiVersion: v1
kind: Namespace
metadata:
name: llama
name: local-ai
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama
namespace: llama
name: local-ai
namespace: local-ai
labels:
app: llama
app: local-ai
spec:
selector:
matchLabels:
app: llama
app: local-ai
replicas: 1
template:
metadata:
labels:
app: llama
name: llama
app: local-ai
name: local-ai
spec:
containers:
- name: llama
args:
- api
image: quay.io/go-skynet/llama-cli:v0.1
- name: local-ai
image: quay.io/go-skynet/local-ai:latest
env:
- name: THREADS
value: "14"
- name: CONTEXT_SIZE
value: "512"
- name: MODELS_PATH
value: /models
volumeMounts:
- mountPath: /models
name: models
volumes:
- name: models
persistentVolumeClaim:
claimName: models
---
apiVersion: v1
kind: Service
metadata:
name: llama
namespace: llama
name: local-ai
namespace: local-ai
# If using AWS, you'll need to override the default 60s load balancer idle timeout
# annotations:
# service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
spec:
selector:
app: llama
app: local-ai
type: LoadBalancer
ports:
- protocol: TCP
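Once the Namespace, Deployment and Service above are applied, the API should be reachable through the load balancer like any OpenAI-compatible endpoint. The sketch below is illustrative only: the in-cluster DNS name, port, route and request fields are assumptions, and the model file name is hypothetical.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Assumed in-cluster address and OpenAI-style completions route; adjust to
	// whatever address/port the Service actually exposes.
	url := "http://local-ai.local-ai.svc.cluster.local:8080/v1/completions"
	// Hypothetical model file name (must exist in the models volume).
	payload := []byte(`{"model": "koala-7B-4bit-128g.bin", "prompt": "What is an alpaca?", "temperature": 0.7}`)

	resp, err := http.Post(url, "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body))
}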

311
main.go
View File

@@ -1,267 +1,96 @@
package main
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"runtime"
"text/template"
llama "github.com/go-skynet/llama/go"
api "github.com/go-skynet/LocalAI/api"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/jaypipes/ghw"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
)
// Define the template string
var emptyInput string = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Instruction}}
### Response:`
var nonEmptyInput string = `Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{{.Instruction}}
### Input:
{{.Input}}
### Response:
`
func llamaFromOptions(ctx *cli.Context) (*llama.LLama, error) {
opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
if ctx.Bool("alpaca") {
opts = append(opts, llama.EnableAlpaca)
}
return llama.New(ctx.String("model"), opts...)
}
func templateString(t string, in interface{}) (string, error) {
// Parse the template
tmpl, err := template.New("prompt").Parse(t)
if err != nil {
return "", err
}
var buf bytes.Buffer
err = tmpl.Execute(&buf, in)
if err != nil {
return "", err
}
return buf.String(), nil
}
var modelFlags = []cli.Flag{
&cli.StringFlag{
Name: "model",
EnvVars: []string{"MODEL_PATH"},
},
&cli.IntFlag{
Name: "tokens",
EnvVars: []string{"TOKENS"},
Value: 128,
},
&cli.IntFlag{
Name: "context-size",
EnvVars: []string{"CONTEXT_SIZE"},
Value: 512,
},
&cli.IntFlag{
Name: "threads",
EnvVars: []string{"THREADS"},
Value: runtime.NumCPU(),
},
&cli.Float64Flag{
Name: "temperature",
EnvVars: []string{"TEMPERATURE"},
Value: 0.95,
},
&cli.Float64Flag{
Name: "topp",
EnvVars: []string{"TOP_P"},
Value: 0.85,
},
&cli.IntFlag{
Name: "topk",
EnvVars: []string{"TOP_K"},
Value: 20,
},
&cli.BoolFlag{
Name: "alpaca",
EnvVars: []string{"ALPACA"},
Value: true,
},
}
func main() {
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
path, err := os.Getwd()
if err != nil {
log.Error().Msgf("error: %s", err.Error())
os.Exit(1)
}
threads := 4
cpu, err := ghw.CPU()
if err == nil {
threads = int(cpu.TotalCores)
}
app := &cli.App{
Name: "llama-cli",
Version: "0.1",
Usage: "llama-cli --model ... --instruction 'What is an alpaca?'",
Flags: append(modelFlags,
&cli.StringFlag{
Name: "template",
EnvVars: []string{"TEMPLATE"},
Name: "LocalAI",
Usage: "OpenAI compatible API for running LLaMA/GPT models locally on CPU with consumer grade hardware.",
Flags: []cli.Flag{
&cli.BoolFlag{
Name: "f16",
EnvVars: []string{"F16"},
},
&cli.BoolFlag{
Name: "debug",
EnvVars: []string{"DEBUG"},
},
&cli.IntFlag{
Name: "threads",
DefaultText: "Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested.",
EnvVars: []string{"THREADS"},
Value: threads,
},
&cli.StringFlag{
Name: "instruction",
EnvVars: []string{"INSTRUCTION"},
Name: "models-path",
DefaultText: "Path containing models used for inferencing",
EnvVars: []string{"MODELS_PATH"},
Value: path,
},
&cli.StringFlag{
Name: "input",
EnvVars: []string{"INPUT"},
}),
Description: `Run llama.cpp inference`,
UsageText: `
llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
An Alpaca (Vicugna pacos) is a domesticated species of South American camelid, related to llamas and originally from Peru but now found throughout much of Andean region. They are bred for their fleeces which can be spun into wool or knitted items such as hats, sweaters, blankets etc
echo "An Alpaca (Vicugna pacos) is a domesticated species of South American camelid, related to llamas and originally from Peru but now found throughout much of Andean region. They are bred for their fleeces which can be spun into wool or knitted items such as hats, sweaters, blankets etc" | llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "Proofread, improving clarity and flow" --input "-"
An Alpaca (Vicugna pacos) is a domesticated species from South America that's related to llamas. Originating in Peru but now found throughout the Andean region, they are bred for their fleeces which can be spun into wool or knitted items such as hats and sweaters—blankets too!
`,
Copyright: "go-skynet authors",
Commands: []*cli.Command{
{
Flags: modelFlags,
Name: "interactive",
Action: func(ctx *cli.Context) error {
l, err := llamaFromOptions(ctx)
if err != nil {
fmt.Println("Loading the model failed:", err.Error())
os.Exit(1)
}
return startInteractive(l, llama.SetTemperature(ctx.Float64("temperature")),
llama.SetTopP(ctx.Float64("topp")),
llama.SetTopK(ctx.Int("topk")),
llama.SetTokens(ctx.Int("tokens")),
llama.SetThreads(ctx.Int("threads")))
},
Name: "address",
DefaultText: "Bind address for the API server.",
EnvVars: []string{"ADDRESS"},
Value: ":8080",
},
{
Name: "api",
Flags: []cli.Flag{
&cli.IntFlag{
Name: "threads",
EnvVars: []string{"THREADS"},
Value: runtime.NumCPU(),
},
&cli.StringFlag{
Name: "model",
EnvVars: []string{"MODEL_PATH"},
},
&cli.StringFlag{
Name: "address",
EnvVars: []string{"ADDRESS"},
Value: ":8080",
},
&cli.BoolFlag{
Name: "alpaca",
EnvVars: []string{"ALPACA"},
Value: true,
},
&cli.IntFlag{
Name: "context-size",
EnvVars: []string{"CONTEXT_SIZE"},
Value: 512,
},
},
Action: func(ctx *cli.Context) error {
l, err := llamaFromOptions(ctx)
if err != nil {
fmt.Println("Loading the model failed:", err.Error())
os.Exit(1)
}
return api(l, ctx.String("address"), ctx.Int("threads"))
},
&cli.IntFlag{
Name: "context-size",
DefaultText: "Default context size of the model",
EnvVars: []string{"CONTEXT_SIZE"},
Value: 512,
},
},
Description: `
LocalAI is a drop-in replacement for the OpenAI API which runs inference locally.
Some of the models compatible are:
- Vicuna
- Koala
- GPT4ALL
- GPT4ALL-J
- Cerebras
- Alpaca
- StableLM (ggml quantized)
It uses llama.cpp, ggml and gpt4all as backends, with Go C bindings.
`,
UsageText: `local-ai [options]`,
Copyright: "go-skynet authors",
Action: func(ctx *cli.Context) error {
instruction := ctx.String("instruction")
input := ctx.String("input")
templ := ctx.String("template")
promptTemplate := ""
if input != "" {
promptTemplate = nonEmptyInput
} else {
promptTemplate = emptyInput
zerolog.SetGlobalLevel(zerolog.InfoLevel)
if ctx.Bool("debug") {
zerolog.SetGlobalLevel(zerolog.DebugLevel)
}
if templ != "" {
dat, err := os.ReadFile(templ)
if err != nil {
fmt.Printf("Failed reading file: %s", err.Error())
os.Exit(1)
}
promptTemplate = string(dat)
}
if instruction == "-" {
dat, err := ioutil.ReadAll(os.Stdin)
if err != nil {
fmt.Printf("reading stdin failed: %s", err)
os.Exit(1)
}
instruction = string(dat)
}
if input == "-" {
dat, err := ioutil.ReadAll(os.Stdin)
if err != nil {
fmt.Printf("reading stdin failed: %s", err)
os.Exit(1)
}
input = string(dat)
}
str, err := templateString(promptTemplate, struct {
Instruction string
Input string
}{Instruction: instruction, Input: input})
if err != nil {
fmt.Println("Templating the input failed:", err.Error())
os.Exit(1)
}
l, err := llamaFromOptions(ctx)
if err != nil {
fmt.Println("Loading the model failed:", err.Error())
os.Exit(1)
}
res, err := l.Predict(
str,
llama.SetTemperature(ctx.Float64("temperature")),
llama.SetTopP(ctx.Float64("topp")),
llama.SetTopK(ctx.Int("topk")),
llama.SetTokens(ctx.Int("tokens")),
llama.SetThreads(ctx.Int("threads")),
)
if err != nil {
fmt.Printf("predicting failed: %s", err)
os.Exit(1)
}
fmt.Println(res)
return nil
return api.Start(model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"), ctx.Int("context-size"), ctx.Bool("f16"))
},
}
err := app.Run(os.Args)
err = app.Run(os.Args)
if err != nil {
fmt.Println(err)
log.Error().Msgf("error: %s", err.Error())
os.Exit(1)
}
}

0
models/.keep Normal file
View File

274
pkg/model/loader.go Normal file
View File

@@ -0,0 +1,274 @@
package model
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"sync"
"text/template"
"github.com/rs/zerolog/log"
gpt2 "github.com/go-skynet/go-gpt2.cpp"
gptj "github.com/go-skynet/go-gpt4all-j.cpp"
llama "github.com/go-skynet/go-llama.cpp"
)
type ModelLoader struct {
modelPath string
mu sync.Mutex
models map[string]*llama.LLama
gptmodels map[string]*gptj.GPTJ
gpt2models map[string]*gpt2.GPT2
gptstablelmmodels map[string]*gpt2.StableLM
promptsTemplates map[string]*template.Template
}
func NewModelLoader(modelPath string) *ModelLoader {
return &ModelLoader{
modelPath: modelPath,
gpt2models: make(map[string]*gpt2.GPT2),
gptmodels: make(map[string]*gptj.GPTJ),
gptstablelmmodels: make(map[string]*gpt2.StableLM),
models: make(map[string]*llama.LLama),
promptsTemplates: make(map[string]*template.Template),
}
}
func (ml *ModelLoader) ExistsInModelPath(s string) bool {
_, err := os.Stat(filepath.Join(ml.modelPath, s))
return err == nil
}
func (ml *ModelLoader) ListModels() ([]string, error) {
files, err := ioutil.ReadDir(ml.modelPath)
if err != nil {
return []string{}, err
}
models := []string{}
for _, file := range files {
// Skip templates, YAML and .keep files
if strings.HasSuffix(file.Name(), ".tmpl") || strings.HasSuffix(file.Name(), ".keep") || strings.HasSuffix(file.Name(), ".yaml") || strings.HasSuffix(file.Name(), ".yml") {
continue
}
models = append(models, file.Name())
}
return models, nil
}
func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
m, ok := ml.promptsTemplates[modelName]
if !ok {
return "", fmt.Errorf("no prompt template available")
}
var buf bytes.Buffer
if err := m.Execute(&buf, in); err != nil {
return "", err
}
return buf.String(), nil
}
func (ml *ModelLoader) loadTemplateIfExists(modelName, modelFile string) error {
// Check if the template was already loaded
if _, ok := ml.promptsTemplates[modelName]; ok {
return nil
}
// Check if a prompt template file exists for this model
// skip any error here - we run anyway if the template does not exist
modelTemplateFile := fmt.Sprintf("%s.tmpl", modelName)
if !ml.ExistsInModelPath(modelTemplateFile) {
return nil
}
dat, err := os.ReadFile(filepath.Join(ml.modelPath, modelTemplateFile))
if err != nil {
return err
}
// Parse the template
tmpl, err := template.New("prompt").Parse(string(dat))
if err != nil {
return err
}
ml.promptsTemplates[modelName] = tmpl
return nil
}
func (ml *ModelLoader) LoadStableLMModel(modelName string) (*gpt2.StableLM, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
if !ml.ExistsInModelPath(modelName) {
return nil, fmt.Errorf("model does not exist")
}
if m, ok := ml.gptstablelmmodels[modelName]; ok {
log.Debug().Msgf("Model already loaded in memory: %s", modelName)
return m, nil
}
// Load the model and keep it in memory for later use
modelFile := filepath.Join(ml.modelPath, modelName)
log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
model, err := gpt2.NewStableLM(modelFile)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
return nil, err
}
ml.gptstablelmmodels[modelName] = model
return model, err
}
func (ml *ModelLoader) LoadGPT2Model(modelName string) (*gpt2.GPT2, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
if !ml.ExistsInModelPath(modelName) {
return nil, fmt.Errorf("model does not exist")
}
if m, ok := ml.gpt2models[modelName]; ok {
log.Debug().Msgf("Model already loaded in memory: %s", modelName)
return m, nil
}
// TODO: This needs refactoring, it's really bad to have it in here
// Check if we have a GPTStable model loaded instead - if we do we return an error so the API tries with StableLM
if _, ok := ml.gptstablelmmodels[modelName]; ok {
log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
return nil, fmt.Errorf("this model is a GPTStableLM one")
}
// Load the model and keep it in memory for later use
modelFile := filepath.Join(ml.modelPath, modelName)
log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
model, err := gpt2.New(modelFile)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
return nil, err
}
ml.gpt2models[modelName] = model
return model, err
}
func (ml *ModelLoader) LoadGPTJModel(modelName string) (*gptj.GPTJ, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
if !ml.ExistsInModelPath(modelName) {
return nil, fmt.Errorf("model does not exist")
}
if m, ok := ml.gptmodels[modelName]; ok {
log.Debug().Msgf("Model already loaded in memory: %s", modelName)
return m, nil
}
// TODO: This needs refactoring, it's really bad to have it in here
// Check if we have a GPT2 model loaded instead - if we do we return an error so the API tries with GPT2
if _, ok := ml.gpt2models[modelName]; ok {
log.Debug().Msgf("Model is GPT2: %s", modelName)
return nil, fmt.Errorf("this model is a GPT2 one")
}
if _, ok := ml.gptstablelmmodels[modelName]; ok {
log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
return nil, fmt.Errorf("this model is a GPTStableLM one")
}
// Load the model and keep it in memory for later use
modelFile := filepath.Join(ml.modelPath, modelName)
log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
model, err := gptj.New(modelFile)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
return nil, err
}
ml.gptmodels[modelName] = model
return model, err
}
func (ml *ModelLoader) LoadLLaMAModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
log.Debug().Msgf("Loading model name: %s", modelName)
// Check if we already have a loaded model
if !ml.ExistsInModelPath(modelName) {
return nil, fmt.Errorf("model does not exist")
}
if m, ok := ml.models[modelName]; ok {
log.Debug().Msgf("Model already loaded in memory: %s", modelName)
return m, nil
}
// TODO: This needs refactoring, it's really bad to have it in here
// Check if we have a GPTJ model loaded instead - if we do we return an error so the API tries with GPTJ
if _, ok := ml.gptmodels[modelName]; ok {
log.Debug().Msgf("Model is GPTJ: %s", modelName)
return nil, fmt.Errorf("this model is a GPTJ one")
}
if _, ok := ml.gpt2models[modelName]; ok {
log.Debug().Msgf("Model is GPT2: %s", modelName)
return nil, fmt.Errorf("this model is a GPT2 one")
}
if _, ok := ml.gptstablelmmodels[modelName]; ok {
log.Debug().Msgf("Model is GPTStableLM: %s", modelName)
return nil, fmt.Errorf("this model is a GPTStableLM one")
}
// Load the model and keep it in memory for later use
modelFile := filepath.Join(ml.modelPath, modelName)
log.Debug().Msgf("Loading model in memory from file: %s", modelFile)
model, err := llama.New(modelFile, opts...)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplateIfExists(modelName, modelFile); err != nil {
return nil, err
}
ml.models[modelName] = model
return model, err
}
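For reference, a minimal sketch of how a caller might drive this loader outside the API server. The model file name, context size and prediction options are placeholders; the snippet simply falls back to the raw prompt when no matching .tmpl file is present.

package main

import (
	"fmt"
	"log"

	model "github.com/go-skynet/LocalAI/pkg/model"
	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	loader := model.NewModelLoader("/models")

	// Hypothetical model file; it must be present under /models.
	l, err := loader.LoadLLaMAModel("ggml-alpaca-7b-q4.bin", llama.SetContext(512))
	if err != nil {
		log.Fatal(err)
	}

	// If /models/ggml-alpaca-7b-q4.bin.tmpl exists, render the prompt through it.
	prompt := "What is an alpaca?"
	if templated, err := loader.TemplatePrefix("ggml-alpaca-7b-q4.bin", struct{ Input string }{Input: prompt}); err == nil {
		prompt = templated
	}

	res, err := l.Predict(prompt, llama.SetTokens(128), llama.SetThreads(4))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res)
}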

View File

@@ -0,0 +1,6 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response:

View File

@@ -0,0 +1,4 @@
The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
### Prompt:
{{.Input}}
### Response:

View File

@@ -0,0 +1 @@
BEGINNING OF CONVERSATION: USER: {{.Input}} GPT:

View File

@@ -0,0 +1,6 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response:
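These .tmpl files are plain Go text/template documents referencing a single .Input field, which is the value the loader's TemplatePrefix executes them with. A minimal sketch of the expansion, using an illustrative prompt:

package main

import (
	"os"
	"text/template"
)

func main() {
	// Same shape as the alpaca-style template above.
	const promptTemplate = `Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{{.Input}}

### Response:`

	tmpl := template.Must(template.New("prompt").Parse(promptTemplate))
	// The struct field name must match the placeholder in the template.
	_ = tmpl.Execute(os.Stdout, struct{ Input string }{Input: "What is an alpaca?"})
}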