Merge pull request #22 from go-skynet/update-llama.cpp

⬆️ Update go-llama.cpp to `llama.cpp-2f7c8e0`
Update llama.cpp
2026-02-03 11:13:31 -05:00 · 2023-04-16 00:06:52 +02:00 · 2023-04-15 23:57:00 +02:00 · 2023-04-15 00:38:30 +02:00 · 2023-04-15 00:38:18 +02:00 · 2023-04-15 00:14:47 +02:00
18 changed files with 629 additions and 460 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1 @@
+models/*.bin
--- a/.env
+++ b/.env
@@ -0,0 +1 @@
+THREADS=14
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -2,6 +2,7 @@
 name: 'build container images'

 on:
+  pull_request:
  push:
    branches:
      - master
@@ -12,68 +13,42 @@ jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
-      - name: Release space from worker
-        run: |
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          df -h
-          echo
-          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
-          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
-          sudo rm -rf /usr/local/lib/android
-          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
-          sudo rm -rf /usr/share/dotnet
-          sudo apt-get remove -y '^mono-.*' || true
-          sudo apt-get remove -y '^ghc-.*' || true
-          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
-          sudo apt-get remove -y 'php.*' || true
-          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
-          sudo apt-get remove -y '^google-.*' || true
-          sudo apt-get remove -y azure-cli || true
-          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
-          sudo apt-get remove -y '^gfortran-.*' || true
-          sudo apt-get autoremove -y
-          sudo apt-get clean
-          echo
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          sudo rm -rfv build || true
-          df -h
      - name: Checkout
        uses: actions/checkout@v3
+
      - name: Prepare
        id: prep
        run: |
          DOCKER_IMAGE=quay.io/go-skynet/llama-cli
-          VERSION=latest
+          VERSION=master
          SHORTREF=${GITHUB_SHA::8}
+
          # If this is git tag, use the tag name as a docker tag
          if [[ $GITHUB_REF == refs/tags/* ]]; then
            VERSION=${GITHUB_REF#refs/tags/}
          fi
          TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}"
+
          # If the VERSION looks like a version number, assume that
          # this is the most recent version of the image and also
          # tag it 'latest'.
-          if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
+          if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
            TAGS="$TAGS,${DOCKER_IMAGE}:latest"
          fi
+
          # Set output parameters.
          echo ::set-output name=tags::${TAGS}
          echo ::set-output name=docker_image::${DOCKER_IMAGE}
-          echo ::set-output name=image::${DOCKER_IMAGE}:${VERSION}
+
      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
        with:
          platforms: all
+
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@master
+
      - name: Login to DockerHub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v2
@@ -81,9 +56,23 @@ jobs:
          registry: quay.io
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_PASSWORD }}
-      - uses: earthly/actions/setup-earthly@v1
      - name: Build
-        run: |
-            earthly config "global.conversion_parallelism" "1"
-            earthly config "global.buildkit_max_parallelism" "1"
-            earthly --push +image-all --IMAGE=${{ steps.prep.outputs.image }}
+        if: github.event_name != 'pull_request'
+        uses: docker/build-push-action@v4
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          context: .
+          file: ./Dockerfile
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: ${{ steps.prep.outputs.tags }}
+      - name: Build PRs
+        if: github.event_name == 'pull_request'
+        uses: docker/build-push-action@v4
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          context: .
+          file: ./Dockerfile
+          platforms: linux/amd64
+          push: false
+          tags: ${{ steps.prep.outputs.tags }}
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+llama-cli
+models/*.bin
--- a/18
+++ b/18
@@ -0,0 +1,18 @@
+ARG GO_VERSION=1.20
+ARG DEBIAN_VERSION=11
+FROM golang:$GO_VERSION as builder
+WORKDIR /build
+ARG GO_LLAMA_CPP_TAG=llama.cpp-2f7c8e0
+RUN git clone -b $GO_LLAMA_CPP_TAG --recurse-submodules https://github.com/go-skynet/go-llama.cpp
+RUN cd go-llama.cpp && make libbinding.a
+COPY go.mod ./
+COPY go.sum ./
+RUN go mod download
+RUN apt-get update
+COPY . .
+RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
+RUN C_INCLUDE_PATH=/build/go-llama.cpp LIBRARY_PATH=/build/go-llama.cpp go build -o llama-cli ./
+
+FROM debian:$DEBIAN_VERSION
+COPY --from=builder /build/llama-cli /usr/bin/llama-cli
+ENTRYPOINT [ "/usr/bin/llama-cli" ]
--- a/31
+++ b/31
@@ -1,32 +1,5 @@
 VERSION 0.7

-go-deps:
-    ARG GO_VERSION=1.20
-    FROM golang:$GO_VERSION
-    WORKDIR /build
-    COPY go.mod ./
-    COPY go.sum ./
-    RUN go mod download
-    RUN apt-get update
-    SAVE ARTIFACT go.mod AS LOCAL go.mod
-    SAVE ARTIFACT go.sum AS LOCAL go.sum
-
 build:
-    FROM +go-deps
-    WORKDIR /build
-    RUN git clone https://github.com/go-skynet/llama
-    RUN cd llama && make libllama.a
-    COPY . .
-    RUN C_INCLUDE_PATH=/build/llama LIBRARY_PATH=/build/llama go build -o llama-cli ./
-    SAVE ARTIFACT llama-cli AS LOCAL llama-cli
-
-image:
-    FROM +go-deps
-    ARG IMAGE=alpaca-cli-nomodel
-    COPY +build/llama-cli /llama-cli
-    ENV MODEL_PATH=/model.bin
-    ENTRYPOINT [ "/llama-cli" ]
-    SAVE IMAGE --push $IMAGE
-
-image-all:
-    BUILD --platform=linux/amd64 --platform=linux/arm64 +image
+    FROM DOCKERFILE -f Dockerfile .
+    SAVE ARTIFACT /usr/bin/llama-cli AS LOCAL llama-cli
--- a/README.md
+++ b/README.md
@@ -1,14 +1,70 @@
 ## :camel: llama-cli


-llama-cli is a straightforward golang CLI interface for [llama.cpp](https://github.com/ggerganov/llama.cpp), providing a simple API and a command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
+llama-cli is a straightforward golang CLI and OpenAI-compatible API for [llama.cpp](https://github.com/ggerganov/llama.cpp). It supports multiple models and also provides a simple command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
+
+It is compatible with the models supported by `llama.cpp`. You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
+
+`llama-cli` doesn't shell out; it uses https://github.com/go-skynet/go-llama.cpp, which is a golang binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
+
+## Usage
+
+You can use `docker-compose`:
+
+```bash
+
+git clone https://github.com/go-skynet/llama-cli
+cd llama-cli
+
+# copy your models to models/
+cp your-model.bin models/
+
+# (optional) Edit the .env file to set the number of concurrent threads used for inference
+# echo "THREADS=14" > .env
+
+# start with docker-compose
+docker compose up -d --build
+
+# Now API is accessible at localhost:8080
+curl http://localhost:8080/v1/models
+# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
+curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
+     "model": "your-model.bin",            
+     "prompt": "A long time ago in a galaxy far, far away",
+     "temperature": 0.7
+   }'
+
+
+```
+
+Note: You can use a default template for every model in your model path by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl`, which will be used as the default prompt template. For example, this can be used with alpaca:
+
+```
+Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+### Instruction:
+{{.Input}}
+
+### Response:
+```

 ## Container images

+`llama-cli` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest)
+
 To begin, run:

 ```
-docker run -ti --rm quay.io/go-skynet/llama-cli:v0.3  --instruction "What's an alpaca?" --topk 10000
+docker run -ti --rm quay.io/go-skynet/llama-cli:latest  --instruction "What's an alpaca?" --topk 10000 --model ...
+```
+
+Where `--model` is the path of the model you want to use. 
+
+Note: you need to mount a volume to the docker container in order to load a model, for instance:
+
+```
+# assuming your model is in /path/to/your/models/foo.bin
+docker run -v /path/to/your/models:/models -ti --rm quay.io/go-skynet/llama-cli:latest  --instruction "What's an alpaca?" --topk 10000 --model /models/foo.bin
 ```

 You will receive a response like the following:
@@ -37,8 +93,6 @@ llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--
 | top_p        | TOP_P                | 0.85          | The cumulative probability for top-p sampling. |
 | top_k        | TOP_K                | 20            | The number of top-k tokens to consider for text generation.  |
 | context-size | CONTEXT_SIZE         | 512           | Default token context size. |
-| alpaca       | ALPACA               | true          | Set to true for alpaca models. |
-| gpt4all       | GPT4ALL               | false          | Set to true for gpt4all models. |

 Here's an example of using `llama-cli`:

@@ -48,14 +102,14 @@ llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"

 This will generate text based on the given model and instruction.

-## Advanced usage
+## API

-`llama-cli` also provides an API for running text generation as a service. The model will be pre-loaded and kept in memory.
+`llama-cli` also provides an API for running text generation as a service. Once a model has been loaded for the first time, it is kept in memory.

 Example of starting the API with `docker`:

 ```bash
-docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.3 api --context-size 700 --threads 4
+docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:latest api --models-path /path/to/models --context-size 700 --threads 4
 ```

 And you'll see:
@@ -70,36 +124,70 @@ And you'll see:
 └───────────────────────────────────────────────────┘ 
 ```

+Note: Model file names have to end with `.bin`.
+
 You can control the API server options with command line arguments:

 ```
-llama-cli api --model <model_path> [--address <address>] [--threads <num_threads>]
+llama-cli api --models-path <model_path> [--address <address>] [--threads <num_threads>]
 ```

 The API takes takes the following:

 | Parameter    | Environment Variable | Default Value | Description                            |
 | ------------ | -------------------- | ------------- | -------------------------------------- |
-| model        | MODEL_PATH           |               | The path to the pre-trained GPT-based model.      |
+| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
 | threads      | THREADS              | CPU cores     | The number of threads to use for text generation. |
 | address      | ADDRESS              | :8080         | The address and port to listen on. |
 | context-size | CONTEXT_SIZE         | 512           | Default token context size. |
-| alpaca       | ALPACA               | true          | Set to true for alpaca models. |
-| gpt4all       | GPT4ALL               | false          | Set to true for gpt4all models. |

+Once the server is running, you can start making HTTP requests to it using the OpenAI API.

-Once the server is running, you can make requests to it using HTTP. For example, to generate text based on an instruction, you can send a POST request to the `/predict` endpoint with the instruction as the request body:
+### Supported OpenAI API endpoints
+
+You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create). 
+
+The following endpoints and parameters are supported.
+
+#### Chat completions
+
+For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:

 ```
-curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
-    "text": "What is an alpaca?",
-    "topP": 0.8,
-    "topK": 50,
-    "temperature": 0.7,
-    "tokens": 100
-}'
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "messages": [{"role": "user", "content": "Say this is a test!"}],
+     "temperature": 0.7
+   }'
 ```

+Available additional parameters: `top_p`, `top_k`, `max_tokens`
+
+#### Completions
+
+For example, to generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
+```
+curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "prompt": "A long time ago in a galaxy far, far away",
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`
+
+#### List models
+
+You can list all the models available with:
+
+```
+curl http://localhost:8080/v1/models
+```
+
+## Web interface
+
+A simple web interface is also available (for instance, at http://localhost:8080/), which can be used as a playground.
+
 Note: The API doesn't inject a template for talking to the instance, while the CLI does. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release, for instance:

 ```
@@ -111,19 +199,10 @@ Below is an instruction that describes a task. Write a response that appropriate
 ### Response:
 ```

+
 ## Using other models

-You can specify a model binary to be used for inference with `--model`.
-
-13B and 30B alpaca models are known to work:
-
-```
-# Download the model image, extract the model
-# Use the model with llama-cli
-docker run -v $PWD:/models -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.3-lite api --model /models/model.bin
-```
-
-gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted:
+gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted (same applies for old alpaca models, too):

 ```bash
 wget -O tokenizer.model https://huggingface.co/decapoda-research/llama-30b-hf/resolve/main/tokenizer.model
@@ -132,6 +211,7 @@ cp gpt4all.. models/
 git clone https://gist.github.com/eiz/828bddec6162a023114ce19146cb2b82
 pip install sentencepiece
 python 828bddec6162a023114ce19146cb2b82/gistfile1.txt models tokenizer.model
+# There will be a new model with the ".tmp" extension, you have to use that one!
 ```

 ### Golang client API
@@ -149,7 +229,7 @@ import (

 func main() {

-	cli := client.NewClient("http://ip:30007")
+	cli := client.NewClient("http://ip:port")

 	out, err := cli.Predict("What's an alpaca?")
 	if err != nil {
@@ -160,6 +240,10 @@ func main() {
 }
 ```

+### Windows compatibility
+
+It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/llama-cli/issues/2
+
 ### Kubernetes

 You can run the API directly in Kubernetes:
@@ -176,9 +260,8 @@ In order to build the `llama-cli` container image locally you can use `docker`:

 ```
 # build the image as "alpaca-image"
-docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v "$(pwd)":/workspace -v earthly-tmp:/tmp/earthly:rw earthly/earthly:v0.7.2 +image --IMAGE=alpaca-image
-# run the image
-docker run alpaca-image --instruction "What's an alpaca?"
+docker build -t llama-cli .
+docker run llama-cli --instruction "What's an alpaca?"
 ```

 Or build the binary with:
@@ -189,3 +272,22 @@ docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v
 # run the binary
 ./llama-cli --instruction "What's an alpaca?"
 ```
+
+## Short-term roadmap
+
+- [x] Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
+- Binary releases (https://github.com/go-skynet/llama-cli/issues/6)
+- Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
+- [x] Multi-model support
+- Have a webUI!
+
+## License
+
+MIT
+
+## Acknowledgements
+
+- [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- https://github.com/tatsu-lab/stanford_alpaca
+- https://github.com/cornelk/llama-go for the initial ideas
+- https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!)
--- a/api.go
+++ b/api.go
@@ -1,91 +0,0 @@
-package main
-
-import (
-	"embed"
-	"net/http"
-	"strconv"
-	"sync"
-
-	llama "github.com/go-skynet/llama/go"
-	"github.com/gofiber/fiber/v2"
-	"github.com/gofiber/fiber/v2/middleware/filesystem"
-)
-
-//go:embed index.html
-var indexHTML embed.FS
-
-func api(l *llama.LLama, listenAddr string, threads int) error {
-	app := fiber.New()
-	app.Use("/", filesystem.New(filesystem.Config{
-		Root:         http.FS(indexHTML),
-		NotFoundFile: "index.html",
-	}))
-	/*
-		curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
-		    "text": "What is an alpaca?",
-		    "topP": 0.8,
-		    "topK": 50,
-		    "temperature": 0.7,
-		    "tokens": 100
-		}'
-	*/
-	var mutex = &sync.Mutex{}
-
-	// Endpoint to generate the prediction
-	app.Post("/predict", func(c *fiber.Ctx) error {
-		mutex.Lock()
-		defer mutex.Unlock()
-		// Get input data from the request body
-		input := new(struct {
-			Text string `json:"text"`
-		})
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		// Set the parameters for the language model prediction
-		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
-		if err != nil {
-			return err
-		}
-
-		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
-		if err != nil {
-			return err
-		}
-
-		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
-		if err != nil {
-			return err
-		}
-
-		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
-		if err != nil {
-			return err
-		}
-
-		// Generate the prediction using the language model
-		prediction, err := l.Predict(
-			input.Text,
-			llama.SetTemperature(temperature),
-			llama.SetTopP(topP),
-			llama.SetTopK(topK),
-			llama.SetTokens(tokens),
-			llama.SetThreads(threads),
-		)
-		if err != nil {
-			return err
-		}
-
-		// Return the prediction in the response body
-		return c.JSON(struct {
-			Prediction string `json:"prediction"`
-		}{
-			Prediction: prediction,
-		})
-	})
-
-	// Start the server
-	app.Listen(":8080")
-	return nil
-}
--- a/api/api.go
+++ b/api/api.go
@@ -0,0 +1,276 @@
+package api
+
+import (
+	"embed"
+	"fmt"
+	"net/http"
+	"strconv"
+	"strings"
+	"sync"
+
+	model "github.com/go-skynet/llama-cli/pkg/model"
+
+	llama "github.com/go-skynet/go-llama.cpp"
+	"github.com/gofiber/fiber/v2"
+	"github.com/gofiber/fiber/v2/middleware/cors"
+	"github.com/gofiber/fiber/v2/middleware/filesystem"
+	"github.com/gofiber/fiber/v2/middleware/recover"
+)
+
+type OpenAIResponse struct { // response body returned by both the chat and plain completion endpoints
+	Created int      `json:"created,omitempty"`
+	Object  string   `json:"object,omitempty"` // OpenAI names this field "object"; the previous tag serialized it under the key "chat.completion"
+	ID      string   `json:"id,omitempty"`
+	Model   string   `json:"model,omitempty"`
+	Choices []Choice `json:"choices,omitempty"`
+}
+
+type Choice struct {
+	Index        int     `json:"index,omitempty"`
+	FinishReason string  `json:"finish_reason,omitempty"`
+	Message      Message `json:"message,omitempty"`
+	Text         string  `json:"text,omitempty"`
+}
+
+type Message struct {
+	Role    string `json:"role,omitempty"`
+	Content string `json:"content,omitempty"`
+}
+
+type OpenAIModel struct {
+	ID     string `json:"id"`
+	Object string `json:"object"`
+}
+
+type OpenAIRequest struct {
+	Model string `json:"model"`
+
+	// Prompt is read only by completion API calls
+	Prompt string `json:"prompt"`
+	
+	// Messages is read only by chat/completion API calls
+	Messages []Message `json:"messages"`
+
+	// Common options between all the API calls
+	TopP        float64 `json:"top_p"`
+	TopK        int     `json:"top_k"`
+	Temperature float64 `json:"temperature"`
+	Maxtokens   int     `json:"max_tokens"`
+}
+
+//go:embed index.html
+var indexHTML embed.FS
+
+func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var err error
+		var model *llama.LLama
+
+		input := new(OpenAIRequest)
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+
+		if input.Model == "" {
+			if defaultModel == nil {
+				return fmt.Errorf("no default model loaded, and no model specified")
+			}
+			model = defaultModel
+		} else {
+			model, err = loader.LoadModel(input.Model)
+			if err != nil {
+				return err
+			}
+		}
+
+		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
+		if input.Model != "" {
+			mutexMap.Lock()
+			l, ok := mutexes[input.Model]
+			if !ok {
+				m := &sync.Mutex{}
+				mutexes[input.Model] = m
+				l = m
+			}
+			mutexMap.Unlock()
+			l.Lock()
+			defer l.Unlock()
+		} else {
+			defaultMutex.Lock()
+			defer defaultMutex.Unlock()
+		}
+
+		// Set the parameters for the language model prediction
+		topP := input.TopP
+		if topP == 0 {
+			topP = 0.7
+		}
+		topK := input.TopK
+		if topK == 0 {
+			topK = 80
+		}
+
+		temperature := input.Temperature
+		if temperature == 0 {
+			temperature = 0.9
+		}
+
+		tokens := input.Maxtokens
+		if tokens == 0 {
+			tokens = 512
+		}
+
+		predInput := input.Prompt
+		if chat {
+			mess := []string{}
+			for _, i := range input.Messages {
+				mess = append(mess, i.Content)
+			}
+
+			predInput = strings.Join(mess, "\n")
+		}
+
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templatedInput, err := loader.TemplatePrefix(input.Model, struct {
+			Input string
+		}{Input: predInput})
+		if err == nil {
+			predInput = templatedInput
+		}
+
+		// Generate the prediction using the language model
+		prediction, err := model.Predict(
+			predInput,
+			llama.SetTemperature(temperature),
+			llama.SetTopP(topP),
+			llama.SetTopK(topK),
+			llama.SetTokens(tokens),
+			llama.SetThreads(threads),
+		)
+		if err != nil {
+			return err
+		}
+
+		if chat {
+			// Return the chat prediction in the response body
+			return c.JSON(OpenAIResponse{
+				Model:   input.Model,
+				Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
+			})
+		}
+
+		// Return the prediction in the response body
+		return c.JSON(OpenAIResponse{
+			Model:   input.Model,
+			Choices: []Choice{{Text: prediction}},
+		})
+	}
+}
+
+// Start registers the OpenAI-compatible routes, the embedded web UI and the legacy /predict route, then serves on listenAddr and returns the listen error.
+func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr string, threads int) error {
+	app := fiber.New()
+
+	// Default middleware config
+	app.Use(recover.New())
+	app.Use(cors.New())
+
+	// mutex guards defaultModel; mu holds one mutex per named model and is guarded by mumutex. See: https://github.com/ggerganov/llama.cpp/discussions/784
+	var mutex = &sync.Mutex{}
+	mu := map[string]*sync.Mutex{}
+	var mumutex = &sync.Mutex{}
+
+	// openAI compatible API endpoint
+	app.Post("/v1/chat/completions", openAIEndpoint(true, defaultModel, loader, threads, mutex, mumutex, mu))
+	app.Post("/v1/completions", openAIEndpoint(false, defaultModel, loader, threads, mutex, mumutex, mu))
+	app.Get("/v1/models", func(c *fiber.Ctx) error {
+		models, err := loader.ListModels()
+		if err != nil {
+			return err
+		}
+
+		dataModels := []OpenAIModel{}
+		for _, m := range models {
+			dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
+		}
+		return c.JSON(struct {
+			Object string        `json:"object"`
+			Data   []OpenAIModel `json:"data"`
+		}{
+			Object: "list",
+			Data:   dataModels,
+		})
+	})
+
+	app.Use("/", filesystem.New(filesystem.Config{
+		Root:         http.FS(indexHTML),
+		NotFoundFile: "index.html",
+	}))
+
+	/*
+		curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
+		    "text": "What is an alpaca?",
+		    "topP": 0.8,
+		    "topK": 50,
+		    "temperature": 0.7,
+		    "tokens": 100
+		}'
+	*/
+	// Endpoint to generate the prediction
+	app.Post("/predict", func(c *fiber.Ctx) error {
+		mutex.Lock()
+		defer mutex.Unlock()
+		// Get input data from the request body
+		input := new(struct {
+			Text string `json:"text"`
+		})
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+
+		// Set the parameters for the language model prediction
+		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
+		if err != nil {
+			return err
+		}
+
+		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
+		if err != nil {
+			return err
+		}
+
+		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
+		if err != nil {
+			return err
+		}
+
+		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
+		if err != nil {
+			return err
+		}
+
+		// Generate the prediction using the language model
+		prediction, err := defaultModel.Predict(
+			input.Text,
+			llama.SetTemperature(temperature),
+			llama.SetTopP(topP),
+			llama.SetTopK(topK),
+			llama.SetTokens(tokens),
+			llama.SetThreads(threads),
+		)
+		if err != nil {
+			return err
+		}
+
+		// Return the prediction in the response body
+		return c.JSON(struct {
+			Prediction string `json:"prediction"`
+		}{
+			Prediction: prediction,
+		})
+	})
+
+	// Start the server and propagate listen failures to the caller instead of always returning nil
+	return app.Listen(listenAddr)
+}
--- a/api/index.html
+++ b/api/index.html
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -0,0 +1,15 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/llama-cli:latest
+    build: .
+    volumes:
+      - ./models:/models  # host ./models directory mounted at /models in the container
+    ports:
+      - '8080:8080'  # quoted so the HOST:CONTAINER pair is always read as a string
+    environment:
+      - MODELS_PATH=/models  # container side of the volume mount above
+      - CONTEXT_SIZE=700
+      - THREADS=$THREADS  # substituted by Compose from the shell environment or the .env file
+    command: api
--- a/go.mod
+++ b/go.mod
@@ -3,31 +3,19 @@ module github.com/go-skynet/llama-cli
 go 1.19

 require (
-	github.com/charmbracelet/bubbles v0.15.0
-	github.com/charmbracelet/bubbletea v0.23.2
-	github.com/charmbracelet/lipgloss v0.7.1
-	github.com/go-skynet/llama v0.0.0-20230329165201-84efc8db3647
+	github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4
 	github.com/gofiber/fiber/v2 v2.42.0
 	github.com/urfave/cli/v2 v2.25.0
 )

 require (
 	github.com/andybalholm/brotli v1.0.4 // indirect
-	github.com/atotto/clipboard v0.1.4 // indirect
-	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
-	github.com/containerd/console v1.0.3 // indirect
 	github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
 	github.com/google/uuid v1.3.0 // indirect
 	github.com/klauspost/compress v1.15.9 // indirect
-	github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
 	github.com/mattn/go-colorable v0.1.13 // indirect
 	github.com/mattn/go-isatty v0.0.17 // indirect
-	github.com/mattn/go-localereader v0.0.1 // indirect
 	github.com/mattn/go-runewidth v0.0.14 // indirect
-	github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b // indirect
-	github.com/muesli/cancelreader v0.2.2 // indirect
-	github.com/muesli/reflow v0.3.0 // indirect
-	github.com/muesli/termenv v0.15.1 // indirect
 	github.com/philhofer/fwd v1.1.1 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
@@ -38,8 +26,5 @@ require (
 	github.com/valyala/fasthttp v1.44.0 // indirect
 	github.com/valyala/tcplisten v1.0.0 // indirect
 	github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
-	golang.org/x/sync v0.1.0 // indirect
 	golang.org/x/sys v0.6.0 // indirect
-	golang.org/x/term v0.5.0 // indirect
-	golang.org/x/text v0.7.0 // indirect
 )
--- a/go.sum
+++ b/go.sum
@@ -1,72 +1,34 @@
 github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
 github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
-github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4=
-github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI=
-github.com/aymanbagabas/go-osc52 v1.0.3/go.mod h1:zT8H+Rk4VSabYN90pWyugflM3ZhpTZNC7cASDfUCdT4=
-github.com/aymanbagabas/go-osc52 v1.2.1/go.mod h1:zT8H+Rk4VSabYN90pWyugflM3ZhpTZNC7cASDfUCdT4=
-github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
-github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
-github.com/charmbracelet/bubbles v0.15.0 h1:c5vZ3woHV5W2b8YZI1q7v4ZNQaPetfHuoHzx+56Z6TI=
-github.com/charmbracelet/bubbles v0.15.0/go.mod h1:Y7gSFbBzlMpUDR/XM9MhZI374Q+1p1kluf1uLl8iK74=
-github.com/charmbracelet/bubbletea v0.23.1/go.mod h1:JAfGK/3/pPKHTnAS8JIE2u9f61BjWTQY57RbT25aMXU=
-github.com/charmbracelet/bubbletea v0.23.2 h1:vuUJ9HJ7b/COy4I30e8xDVQ+VRDUEFykIjryPfgsdps=
-github.com/charmbracelet/bubbletea v0.23.2/go.mod h1:FaP3WUivcTM0xOKNmhciz60M6I+weYLF76mr1JyI7sM=
-github.com/charmbracelet/harmonica v0.2.0/go.mod h1:KSri/1RMQOZLbw7AHqgcBycp8pgJnQMYYT8QZRqZ1Ao=
-github.com/charmbracelet/lipgloss v0.6.0/go.mod h1:tHh2wr34xcHjC2HCXIlGSG1jaDF0S0atAUvBMP6Ppuk=
-github.com/charmbracelet/lipgloss v0.7.1 h1:17WMwi7N1b1rVWOjMT+rCh7sQkvDU75B2hbZpc5Kc1E=
-github.com/charmbracelet/lipgloss v0.7.1/go.mod h1:yG0k3giv8Qj8edTCbbg6AlQ5e8KNWpFujkNawKNhE2c=
-github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
-github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
 github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
 github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
-github.com/go-skynet/llama v0.0.0-20230321172246-7be5326e18cc h1:NcmO8mA7iRZIX0Qy2SjcsSaV14+g87MiTey1neUJaFQ=
-github.com/go-skynet/llama v0.0.0-20230321172246-7be5326e18cc/go.mod h1:ZtYsAIud4cvP9VTTI9uhdgR1uCwaO/gGKnZZ95h9i7w=
-github.com/go-skynet/llama v0.0.0-20230325223742-a3563a2690ba h1:u6OhAqlWFHsTjfWKePdK2kP4/mTyXX5vsmKwrK5QX6o=
-github.com/go-skynet/llama v0.0.0-20230325223742-a3563a2690ba/go.mod h1:ZtYsAIud4cvP9VTTI9uhdgR1uCwaO/gGKnZZ95h9i7w=
-github.com/go-skynet/llama v0.0.0-20230329165201-84efc8db3647 h1:W6qHHD/Bv6wRXSzdv38gWMAXgw3fklHyEblfw88uEUU=
-github.com/go-skynet/llama v0.0.0-20230329165201-84efc8db3647/go.mod h1:ZtYsAIud4cvP9VTTI9uhdgR1uCwaO/gGKnZZ95h9i7w=
+github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4 h1:u/y9MlPHOeIj636IQmrf9ptMjjdgCVIcsfb7lMFh39M=
+github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
+github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
 github.com/gofiber/fiber/v2 v2.42.0 h1:Fnp7ybWvS+sjNQsFvkhf4G8OhXswvB6Vee8hM/LyS+8=
 github.com/gofiber/fiber/v2 v2.42.0/go.mod h1:3+SGNjqMh5VQH5Vz2Wdi43zTIV16ktlFd3x3R6O1Zlc=
+github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
+github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
 github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
 github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
 github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
-github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
-github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
-github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
-github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
 github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
 github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng=
 github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
-github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
-github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
-github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
-github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
-github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
 github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
 github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
-github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b h1:1XF24mVaiu7u+CFywTdcDo2ie1pzzhwjt6RHqzpMU34=
-github.com/muesli/ansi v0.0.0-20211018074035-2e021307bc4b/go.mod h1:fQuZ0gauxyBcmsdE3ZT4NasjaRdxmbCS0jRHsrWu3Ho=
-github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
-github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
-github.com/muesli/reflow v0.2.1-0.20210115123740-9e1d0d53df68/go.mod h1:Xk+z4oIWdQqJzsxyjgl3P22oYZnHdZ8FFTHAQQt5BMQ=
-github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s=
-github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8=
-github.com/muesli/termenv v0.11.1-0.20220204035834-5ac8409525e0/go.mod h1:Bd5NYQ7pd+SrtBSrSNoBBmXlcY8+Xj4BMJgh8qcZrvs=
-github.com/muesli/termenv v0.13.0/go.mod h1:sP1+uffeLaEYpyOTb8pLCUctGcGLnoFjSn4YJK5e2bc=
-github.com/muesli/termenv v0.14.0/go.mod h1:kG/pF1E7fh949Xhe156crRUrHNyK221IuGO7Ez60Uc8=
-github.com/muesli/termenv v0.15.1 h1:UzuTb/+hhlBugQz28rpzey4ZuKcZ03MeKsoG7IJZIxs=
-github.com/muesli/termenv v0.15.1/go.mod h1:HeAQPTzpfs016yGtA4g00CsdYnVLJvxsS4ANqrZs2sQ=
+github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
+github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
 github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ=
 github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
-github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
 github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
 github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
 github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/sahilm/fuzzy v0.1.0/go.mod h1:VFvziUEIMCrT6A6tw2RFIXPXXmzXbOsSHF0DOI8ZK9Y=
 github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 h1:rmMl4fXJhKMNWl+K+r/fq4FbbKI+Ia2m9hYBLm2h4G4=
 github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94/go.mod h1:90zrgN3D/WJsDd1iXHT96alCoN2KJo6/4x1DZC3wZs8=
 github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d h1:Q+gqLBOPkFGHyCJxXMRqtUgUbTjI8/Ze8vu8GGyNFwo=
@@ -94,36 +56,31 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
 golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
 golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
+golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
-golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
-golang.org/x/term v0.5.0 h1:n2a8QNdAb0sZNpU9R1ALUXBbY+w51fCQDN+7EdxNBsY=
-golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo=
-golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
+golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
--- a/interactive.go
+++ b/interactive.go
@@ -1,142 +0,0 @@
-package main
-
-// A simple program demonstrating the text area component from the Bubbles
-// component library.
-
-import (
-	"fmt"
-	"strings"
-
-	"github.com/charmbracelet/bubbles/textarea"
-	"github.com/charmbracelet/bubbles/viewport"
-	tea "github.com/charmbracelet/bubbletea"
-	"github.com/charmbracelet/lipgloss"
-	llama "github.com/go-skynet/llama/go"
-)
-
-func startInteractive(l *llama.LLama, opts ...llama.PredictOption) error {
-	p := tea.NewProgram(initialModel(l, opts...))
-
-	_, err := p.Run()
-	return err
-}
-
-type (
-	errMsg error
-)
-
-type model struct {
-	viewport    viewport.Model
-	messages    *[]string
-	textarea    textarea.Model
-	senderStyle lipgloss.Style
-	err         error
-	l           *llama.LLama
-	opts        []llama.PredictOption
-
-	predictC chan string
-}
-
-func initialModel(l *llama.LLama, opts ...llama.PredictOption) model {
-	ta := textarea.New()
-	ta.Placeholder = "Send a message..."
-	ta.Focus()
-
-	ta.Prompt = "┃ "
-	ta.CharLimit = 280
-
-	ta.SetWidth(200)
-	ta.SetHeight(3)
-
-	// Remove cursor line styling
-	ta.FocusedStyle.CursorLine = lipgloss.NewStyle()
-
-	ta.ShowLineNumbers = false
-
-	vp := viewport.New(200, 5)
-	vp.SetContent(`Welcome to llama-cli. Type a message and press Enter to send. Alpaca doesn't keep context of the whole chat (yet).`)
-
-	ta.KeyMap.InsertNewline.SetEnabled(false)
-
-	predictChannel := make(chan string)
-	messages := []string{}
-	m := model{
-		textarea:    ta,
-		messages:    &messages,
-		viewport:    vp,
-		senderStyle: lipgloss.NewStyle().Foreground(lipgloss.Color("5")),
-		err:         nil,
-		l:           l,
-		opts:        opts,
-		predictC:    predictChannel,
-	}
-	go func() {
-		for p := range predictChannel {
-			str, _ := templateString(emptyInput, struct {
-				Instruction string
-				Input       string
-			}{Instruction: p})
-			res, _ := l.Predict(
-				str,
-				opts...,
-			)
-
-			mm := *m.messages
-			*m.messages = mm[:len(mm)-1]
-			*m.messages = append(*m.messages, m.senderStyle.Render("llama: ")+res)
-			m.viewport.SetContent(strings.Join(*m.messages, "\n"))
-			ta.Reset()
-			m.viewport.GotoBottom()
-		}
-	}()
-
-	return m
-}
-
-func (m model) Init() tea.Cmd {
-	return textarea.Blink
-}
-
-func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
-	var (
-		tiCmd tea.Cmd
-		vpCmd tea.Cmd
-	)
-
-	m.textarea, tiCmd = m.textarea.Update(msg)
-	m.viewport, vpCmd = m.viewport.Update(msg)
-
-	switch msg := msg.(type) {
-	case tea.WindowSizeMsg:
-
-	//	m.viewport.Width = msg.Width
-	//	m.viewport.Height = msg.Height
-	case tea.KeyMsg:
-		switch msg.Type {
-		case tea.KeyCtrlC, tea.KeyEsc:
-			fmt.Println(m.textarea.Value())
-			return m, tea.Quit
-		case tea.KeyEnter:
-			*m.messages = append(*m.messages, m.senderStyle.Render("You: ")+m.textarea.Value(), m.senderStyle.Render("Loading response..."))
-			m.predictC <- m.textarea.Value()
-			m.viewport.SetContent(strings.Join(*m.messages, "\n"))
-			m.textarea.Reset()
-			m.viewport.GotoBottom()
-		}
-
-	// We handle errors just like any other message
-	case errMsg:
-		m.err = msg
-		return m, nil
-	}
-
-	return m, tea.Batch(tiCmd, vpCmd)
-}
-
-func (m model) View() string {
-	return fmt.Sprintf(
-		"%s\n\n%s",
-		m.viewport.View(),
-		m.textarea.View(),
-	) + "\n\n"
-}
--- a/kubernetes/deployment.yaml
+++ b/kubernetes/deployment.yaml
@@ -25,7 +25,7 @@ spec:
        - name: llama
          args:
          - api
-          image: quay.io/go-skynet/llama-cli:v0.3
+          image: quay.io/go-skynet/llama-cli:latest
 ---
 apiVersion: v1
 kind: Service
--- a/main.go
+++ b/main.go
@@ -8,7 +8,10 @@ import (
 	"runtime"
 	"text/template"

-	llama "github.com/go-skynet/llama/go"
+	llama "github.com/go-skynet/go-llama.cpp"
+	api "github.com/go-skynet/llama-cli/api"
+	model "github.com/go-skynet/llama-cli/pkg/model"
+
 	"github.com/urfave/cli/v2"
 )

@@ -33,12 +36,6 @@ var nonEmptyInput string = `Below is an instruction that describes a task, paire

 func llamaFromOptions(ctx *cli.Context) (*llama.LLama, error) {
 	opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
-	if ctx.Bool("alpaca") {
-		opts = append(opts, llama.EnableAlpaca)
-	}
-	if ctx.Bool("gpt4all") {
-		opts = append(opts, llama.EnableGPT4All)
-	}
 	return llama.New(ctx.String("model"), opts...)
 }

@@ -92,16 +89,6 @@ var modelFlags = []cli.Flag{
 		EnvVars: []string{"TOP_K"},
 		Value:   20,
 	},
-	&cli.BoolFlag{
-		Name:    "alpaca",
-		EnvVars: []string{"ALPACA"},
-		Value:   true,
-	},
-	&cli.BoolFlag{
-		Name:    "gpt4all",
-		EnvVars: []string{"GPT4ALL"},
-		Value:   false,
-	},
 }

 func main() {
@@ -134,24 +121,6 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
 `,
 		Copyright: "go-skynet authors",
 		Commands: []*cli.Command{
-			{
-				Flags: modelFlags,
-				Name:  "interactive",
-				Action: func(ctx *cli.Context) error {
-
-					l, err := llamaFromOptions(ctx)
-					if err != nil {
-						fmt.Println("Loading the model failed:", err.Error())
-						os.Exit(1)
-					}
-
-					return startInteractive(l, llama.SetTemperature(ctx.Float64("temperature")),
-						llama.SetTopP(ctx.Float64("topp")),
-						llama.SetTopK(ctx.Int("topk")),
-						llama.SetTokens(ctx.Int("tokens")),
-						llama.SetThreads(ctx.Int("threads")))
-				},
-			},
 			{

 				Name: "api",
@@ -162,24 +131,18 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
 						Value:   runtime.NumCPU(),
 					},
 					&cli.StringFlag{
-						Name:    "model",
-						EnvVars: []string{"MODEL_PATH"},
+						Name:    "models-path",
+						EnvVars: []string{"MODELS_PATH"},
+					},
+					// DEFAULT_MODEL matches the upper-case naming of the other flags'
+					// environment variables; the original "default-model" spelling is
+					// kept as a fallback so existing setups keep working.
+					&cli.StringFlag{
+						Name:    "default-model",
+						EnvVars: []string{"DEFAULT_MODEL", "default-model"},
 					},
 					&cli.StringFlag{
 						Name:    "address",
 						EnvVars: []string{"ADDRESS"},
 						Value:   ":8080",
 					},
-					&cli.BoolFlag{
-						Name:    "alpaca",
-						EnvVars: []string{"ALPACA"},
-						Value:   true,
-					},
-					&cli.BoolFlag{
-						Name:    "gpt4all",
-						EnvVars: []string{"GPT4ALL"},
-						Value:   false,
-					},
 					&cli.IntFlag{
 						Name:    "context-size",
 						EnvVars: []string{"CONTEXT_SIZE"},
@@ -187,13 +150,19 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
 					},
 				},
 				Action: func(ctx *cli.Context) error {
-					l, err := llamaFromOptions(ctx)
-					if err != nil {
-						fmt.Println("Loading the model failed:", err.Error())
-						os.Exit(1)
+
+					var defaultModel *llama.LLama
+					defModel := ctx.String("default-model")
+					if defModel != "" {
+						opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
+						var err error
+						defaultModel, err = llama.New(ctx.String("default-model"), opts...)
+						if err != nil {
+							return err
+						}
 					}

-					return api(l, ctx.String("address"), ctx.Int("threads"))
+					return api.Start(defaultModel, model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
 				},
 			},
 		},
--- a/models/.keep
+++ b/models/.keep
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -0,0 +1,114 @@
+package model
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"text/template"
+
+	llama "github.com/go-skynet/go-llama.cpp"
+)
+
+// ModelLoader loads llama models from a directory and keeps the loaded models
+// and their parsed prompt templates in memory.
+type ModelLoader struct {
+	// modelPath is the directory that model names are joined to when locating files.
+	modelPath        string
+	// mu is locked by TemplatePrefix and LoadModel while they use the maps below.
+	mu               sync.Mutex
+	// models holds the loaded models, keyed by model file path.
+	models           map[string]*llama.LLama
+	// promptsTemplates holds parsed prompt templates, keyed by model name.
+	promptsTemplates map[string]*template.Template
+}
+
+// NewModelLoader returns a ModelLoader rooted at modelPath, with empty model
+// and prompt-template maps ready for use.
+func NewModelLoader(modelPath string) *ModelLoader {
+	loader := &ModelLoader{
+		modelPath:        modelPath,
+		models:           map[string]*llama.LLama{},
+		promptsTemplates: map[string]*template.Template{},
+	}
+	return loader
+}
+
+// ListModels returns the names of the models found in the loader's model
+// directory: every entry whose name ends in ".bin", with that extension
+// removed.
+//
+// If the directory cannot be read it returns an empty, non-nil slice together
+// with the error.
+func (ml *ModelLoader) ListModels() ([]string, error) {
+	files, err := ioutil.ReadDir(ml.modelPath)
+	if err != nil {
+		return []string{}, err
+	}
+
+	models := []string{}
+	for _, file := range files {
+		name := file.Name()
+		if !strings.HasSuffix(name, ".bin") {
+			continue
+		}
+		// TrimSuffix, not TrimRight: TrimRight treats ".bin" as a set of
+		// characters and would also strip trailing '.', 'b', 'i' and 'n' from
+		// the base name (e.g. "robin.bin" would become "ro").
+		models = append(models, strings.TrimSuffix(name, ".bin"))
+	}
+
+	return models, nil
+}
+
+// TemplatePrefix renders the prompt template registered for modelName, using
+// in as the template data, and returns the rendered text.
+//
+// The template is looked up under modelName first and under modelName + ".bin"
+// second; when neither is registered an error is returned. Errors from
+// executing the template are returned unchanged. The loader mutex is held for
+// the duration of the call.
+func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string, error) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+
+	var (
+		tmpl  *template.Template
+		found bool
+	)
+	for _, key := range []string{modelName, modelName + ".bin"} {
+		if candidate, ok := ml.promptsTemplates[key]; ok {
+			tmpl, found = candidate, true
+			break
+		}
+	}
+	if !found {
+		return "", fmt.Errorf("no prompt template available")
+	}
+
+	var rendered bytes.Buffer
+	if err := tmpl.Execute(&rendered, in); err != nil {
+		return "", err
+	}
+	return rendered.String(), nil
+}
+
+func (ml *ModelLoader) LoadModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
+	ml.mu.Lock()
+	defer ml.mu.Unlock()
+
+	// Check if we already have a loaded model
+	modelFile := filepath.Join(ml.modelPath, modelName)
+
+	if m, ok := ml.models[modelFile]; ok {
+		return m, nil
+	}
+
+	// Check if the model path exists
+	if _, err := os.Stat(modelFile); os.IsNotExist(err) {
+		// try to find a s.bin
+		modelBin := fmt.Sprintf("%s.bin", modelFile)
+		if _, err := os.Stat(modelBin); os.IsNotExist(err) {
+			return nil, err
+		} else {
+			modelName = fmt.Sprintf("%s.bin", modelName)
+			modelFile = modelBin
+		}
+	}
+
+	// Load the model and keep it in memory for later use
+	model, err := llama.New(modelFile, opts...)
+	if err != nil {
+		return nil, err
+	}
+
+	// If there is a prompt template, load it
+
+	modelTemplateFile := fmt.Sprintf("%s.tmpl", modelFile)
+	// Check if the model path exists
+	if _, err := os.Stat(modelTemplateFile); err == nil {
+		dat, err := os.ReadFile(modelTemplateFile)
+		if err != nil {
+			return nil, err
+		}
+
+		// Parse the template
+		tmpl, err := template.New("prompt").Parse(string(dat))
+		if err != nil {
+			return nil, err
+		}
+		ml.promptsTemplates[modelName] = tmpl
+	}
+
+	ml.models[modelFile] = model
+	return model, err
+}
Author	SHA1	Message	Date
Ettore Di Giacinto	e8eab66c30	Merge pull request #22 from go-skynet/update-llama.cpp ⬆️ Update go-llama.cpp to `llama.cpp-2f7c8e0`	2023-04-16 00:06:52 +02:00
mudler	a73a497143	Update llama.cpp	2023-04-15 23:57:00 +02:00
Ettore Di Giacinto	6aea515e1d	Merge pull request #20 from go-skynet/mudler-patch-1 📖 Update README.md	2023-04-15 00:38:30 +02:00
Ettore Di Giacinto	dfc2b7e02a	📖 Update README.md	2023-04-15 00:38:18 +02:00
Ettore Di Giacinto	040290971c	Merge pull request #19 from go-skynet/tags Use tags for go-llama.cpp	2023-04-15 00:14:47 +02:00
mudler	553bad585e	Use tags for go-llama.cpp	2023-04-15 00:07:39 +02:00
Ettore Di Giacinto	f76b612506	Merge pull request #17 from go-skynet/mudler-patch-1 Fix comment typo	2023-04-13 15:21:13 +02:00
Ettore Di Giacinto	c4e94c88d7	Fix comment typo Thanks to @deadprogram for noticing it!	2023-04-13 15:20:51 +02:00
mudler	a9cd6b3ca3	ci: Fix tag detection for 'latest'	2023-04-13 01:37:09 +02:00
mudler	e786576b95	Update README	2023-04-13 01:28:15 +02:00
Ettore Di Giacinto	d426571789	Merge pull request #16 from go-skynet/fix_arm Drop armv7 builds	2023-04-13 01:21:58 +02:00
mudler	a896a2b5ad	Drop armv7 builds	2023-04-13 01:21:40 +02:00
Ettore Di Giacinto	8273cd5c04	Merge pull request #15 from go-skynet/docker-compose Add docker-compose file	2023-04-13 01:17:44 +02:00
mudler	16f1281d38	Minor workflow fixes	2023-04-13 01:16:13 +02:00
mudler	8042e9a2d6	Add docker-compose Fixes #14 Signed-off-by: mudler <mudler@c3os.io>	2023-04-13 01:13:14 +02:00
mudler	624092cb99	Update README	2023-04-12 00:07:30 +02:00
mudler	a422a883ac	Minor rephrasing	2023-04-12 00:04:15 +02:00
mudler	7858a97254	Update README	2023-04-12 00:02:47 +02:00
mudler	5556aa46dd	Small refinements and refactors	2023-04-12 00:02:39 +02:00
mudler	eb4257f946	Add .gitignore	2023-04-11 23:44:00 +02:00
mudler	ae30bd346d	Reorganize repository layout	2023-04-11 23:43:43 +02:00
mudler	93d8977ba2	Return model list	2023-04-10 12:02:40 +02:00
mudler	f43aeeb4a1	Add both API endpoints (completion, chat)	2023-04-09 12:30:55 +02:00
mudler	c17dcc5e9d	Allow to inject prompt as part of the call	2023-04-09 09:36:19 +02:00
mudler	4a932483e1	Small fixup to template loading	2023-04-08 11:59:40 +02:00
mudler	b710147b95	Add mutex on same models (parallel isn't supported yet)	2023-04-08 11:45:36 +02:00
mudler	ba70363330	Use template input	2023-04-08 11:24:25 +02:00
mudler	9fb581739b	Allow to template model prompts inputs	2023-04-08 10:46:51 +02:00
mudler	48aca246e3	Drop unused interactive mode	2023-04-07 11:31:14 +02:00
mudler	12eee097b7	Make it compatible with openAI api, support multiple models Signed-off-by: mudler <mudler@c3os.io>	2023-04-07 11:30:59 +02:00
mudler	b33d015b8c	Use go-llama.cpp	2023-04-07 10:08:15 +02:00
Ettore Di Giacinto	b7c0a108f5	Update README.md	2023-04-05 22:28:03 +02:00
Ettore Di Giacinto	f694a89c28	Update README.md	2023-04-05 22:14:00 +02:00
Ettore Di Giacinto	be682e6c2f	Update README.md Add short-term roadmap and mention webui	2023-04-05 22:04:35 +02:00
mudler	bf85a31f9e	Don't set a default model path	2023-04-05 22:00:15 +02:00
Ettore Di Giacinto	d69048e0b0	Update README.md	2023-04-05 00:41:02 +02:00
mudler	827f189163	Update README	2023-03-30 18:46:11 +02:00