ci: Fix tag detection for 'latest'

Update README
Merge pull request #16 from go-skynet/fix_arm
2026-02-03 11:13:31 -05:00 · 2023-04-13 01:37:09 +02:00 · 2023-04-13 01:28:15 +02:00 · 2023-04-13 01:21:58 +02:00 · 2023-04-13 01:21:40 +02:00 · 2023-04-13 01:17:44 +02:00
14 changed files with 236 additions and 231 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1 @@
+models/*.bin
--- a/.env
+++ b/.env
@@ -0,0 +1 @@
+THREADS=14
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -2,6 +2,7 @@
 name: 'build container images'

 on:
+  pull_request:
  push:
    branches:
      - master
@@ -12,68 +13,42 @@ jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
-      - name: Release space from worker
-        run: |
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          df -h
-          echo
-          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
-          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
-          sudo rm -rf /usr/local/lib/android
-          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
-          sudo rm -rf /usr/share/dotnet
-          sudo apt-get remove -y '^mono-.*' || true
-          sudo apt-get remove -y '^ghc-.*' || true
-          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
-          sudo apt-get remove -y 'php.*' || true
-          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
-          sudo apt-get remove -y '^google-.*' || true
-          sudo apt-get remove -y azure-cli || true
-          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
-          sudo apt-get remove -y '^gfortran-.*' || true
-          sudo apt-get autoremove -y
-          sudo apt-get clean
-          echo
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          sudo rm -rfv build || true
-          df -h
      - name: Checkout
        uses: actions/checkout@v3
+
      - name: Prepare
        id: prep
        run: |
          DOCKER_IMAGE=quay.io/go-skynet/llama-cli
-          VERSION=latest
+          VERSION=master
          SHORTREF=${GITHUB_SHA::8}
+
          # If this is git tag, use the tag name as a docker tag
          if [[ $GITHUB_REF == refs/tags/* ]]; then
            VERSION=${GITHUB_REF#refs/tags/}
          fi
          TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}"
+
          # If the VERSION looks like a version number, assume that
          # this is the most recent version of the image and also
          # tag it 'latest'.
-          if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
+          if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
            TAGS="$TAGS,${DOCKER_IMAGE}:latest"
          fi
+
          # Set output parameters.
          echo ::set-output name=tags::${TAGS}
          echo ::set-output name=docker_image::${DOCKER_IMAGE}
-          echo ::set-output name=image::${DOCKER_IMAGE}:${VERSION}
+
      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
        with:
          platforms: all
+
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@master
+
      - name: Login to DockerHub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v2
@@ -81,9 +56,23 @@ jobs:
          registry: quay.io
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_PASSWORD }}
-      - uses: earthly/actions/setup-earthly@v1
      - name: Build
-        run: |
-            earthly config "global.conversion_parallelism" "1"
-            earthly config "global.buildkit_max_parallelism" "1"
-            earthly --push +image-all --IMAGE=${{ steps.prep.outputs.image }}
+        if: github.event_name != 'pull_request'
+        uses: docker/build-push-action@v4
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          context: .
+          file: ./Dockerfile
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: ${{ steps.prep.outputs.tags }}
+      - name: Build PRs
+        if: github.event_name == 'pull_request'
+        uses: docker/build-push-action@v4
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          context: .
+          file: ./Dockerfile
+          platforms: linux/amd64
+          push: false
+          tags: ${{ steps.prep.outputs.tags }}
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+llama-cli
+models/*.bin
--- a/19
+++ b/19
@@ -0,0 +1,19 @@
+ARG GO_VERSION=1.20
+ARG DEBIAN_VERSION=11
+
+FROM golang:$GO_VERSION as builder
+
+WORKDIR /build
+RUN git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp
+RUN cd go-llama.cpp && make libbinding.a
+COPY go.mod ./
+COPY go.sum ./
+RUN go mod download
+RUN apt-get update
+COPY . .
+RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
+RUN C_INCLUDE_PATH=/build/go-llama.cpp LIBRARY_PATH=/build/go-llama.cpp go build -o llama-cli ./
+
+FROM debian:$DEBIAN_VERSION
+COPY --from=builder /build/llama-cli /usr/bin/llama-cli
+ENTRYPOINT [ "/usr/bin/llama-cli" ]
--- a/31
+++ b/31
@@ -1,32 +1,5 @@
 VERSION 0.7

-go-deps:
-    ARG GO_VERSION=1.20
-    FROM golang:$GO_VERSION
-    WORKDIR /build
-    COPY go.mod ./
-    COPY go.sum ./
-    RUN go mod download
-    RUN apt-get update
-    SAVE ARTIFACT go.mod AS LOCAL go.mod
-    SAVE ARTIFACT go.sum AS LOCAL go.sum
-
 build:
-    FROM +go-deps
-    WORKDIR /build
-    RUN git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp
-    RUN cd go-llama.cpp && make libbinding.a
-    COPY . .
-    RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
-    RUN C_INCLUDE_PATH=$GOPATH/src/github.com/go-skynet/go-llama.cpp LIBRARY_PATH=$GOPATH/src/github.com/go-skynet/go-llama.cpp go build -o llama-cli ./
-    SAVE ARTIFACT llama-cli AS LOCAL llama-cli
-
-image:
-    FROM +go-deps
-    ARG IMAGE=alpaca-cli-nomodel
-    COPY +build/llama-cli /llama-cli
-    ENTRYPOINT [ "/llama-cli" ]
-    SAVE IMAGE --push $IMAGE
-
-image-all:
-    BUILD --platform=linux/amd64 --platform=linux/arm64 +image
+    FROM DOCKERFILE -f Dockerfile .
+    SAVE ARTIFACT /usr/bin/llama-cli AS LOCAL llama-cli
--- a/README.md
+++ b/README.md
@@ -1,16 +1,70 @@
 ## :camel: llama-cli


-llama-cli is a straightforward golang CLI interface for [llama.cpp](https://github.com/ggerganov/llama.cpp), providing a simple API and a command line interface that allows text generation using a GPT-based model like llama directly from the terminal. It is also compatible with [gpt4all](https://github.com/nomic-ai/gpt4all) and [alpaca](https://github.com/tatsu-lab/stanford_alpaca).
+llama-cli is a straightforward golang CLI interface and OpenAI-compatible API for [llama.cpp](https://github.com/ggerganov/llama.cpp). It supports multiple models and also provides a simple command line interface that allows text generation using a GPT-based model like llama directly from the terminal.

-`llama-cli` uses https://github.com/go-skynet/llama, which is a fork of [llama.cpp](https://github.com/ggerganov/llama.cpp) providing golang binding.
+It is compatible with the models supported by `llama.cpp`. You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
+
+`llama-cli` doesn't shell-out, it uses https://github.com/go-skynet/go-llama.cpp, which is a golang binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
+
+## Usage
+
+You can use `docker-compose`:
+
+```bash
+
+git clone https://github.com/go-skynet/llama-cli
+cd llama-cli
+
+# copy your models to models/
+cp your-model.bin models/
+
+# (optional) Edit the .env file to set the number of concurrent threads used for inference
+# echo "THREADS=14" > .env
+
+# start with docker-compose
+docker compose up -d --build
+
+# Now API is accessible at localhost:8080
+curl http://localhost:8080/v1/models
+# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
+curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
+     "model": "your-model.bin",            
+     "prompt": "A long time ago in a galaxy far, far away",
+     "temperature": 0.7
+   }'
+
+
+```
+
+Note: You can use a default template for every model in your model path by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl`, which will be used as a default prompt. For example, this can be used with alpaca:
+
+```
+Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+### Instruction:
+{{.Input}}
+
+### Response:
+```

 ## Container images

+`llama-cli` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest)
+
 To begin, run:

 ```
-docker run -ti --rm quay.io/go-skynet/llama-cli:v0.4  --instruction "What's an alpaca?" --topk 10000 --model ...
+docker run -ti --rm quay.io/go-skynet/llama-cli:latest  --instruction "What's an alpaca?" --topk 10000 --model ...
+```
+
+Where `--model` is the path of the model you want to use. 
+
+Note: you need to mount a volume to the docker container in order to load a model, for instance:
+
+```
+# assuming your model is in /path/to/your/models/foo.bin
+docker run -v /path/to/your/models:/models -ti --rm quay.io/go-skynet/llama-cli:latest  --instruction "What's an alpaca?" --topk 10000 --model /models/foo.bin
 ```

 You will receive a response like the following:
@@ -39,8 +93,6 @@ llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--
 | top_p        | TOP_P                | 0.85          | The cumulative probability for top-p sampling. |
 | top_k        | TOP_K                | 20            | The number of top-k tokens to consider for text generation.  |
 | context-size | CONTEXT_SIZE         | 512           | Default token context size. |
-| alpaca       | ALPACA               | true          | Set to true for alpaca models. |
-| gpt4all       | GPT4ALL               | false          | Set to true for gpt4all models. |

 Here's an example of using `llama-cli`:

@@ -50,14 +102,14 @@ llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"

 This will generate text based on the given model and instruction.

-## Advanced usage
+## API

-`llama-cli` also provides an API for running text generation as a service. The model will be pre-loaded and kept in memory.
+`llama-cli` also provides an API for running text generation as a service. Once a model has been loaded for the first time, it is kept in memory.

 Example of starting the API with `docker`:

 ```bash
-docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.4 api --context-size 700 --threads 4
+docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:latest api --models-path /path/to/models --context-size 700 --threads 4
 ```

 And you'll see:
@@ -72,36 +124,68 @@ And you'll see:
 └───────────────────────────────────────────────────┘ 
 ```

+Note: Model file names have to end with `.bin`.
+
 You can control the API server options with command line arguments:

 ```
-llama-cli api --model <model_path> [--address <address>] [--threads <num_threads>]
+llama-cli api --models-path <model_path> [--address <address>] [--threads <num_threads>]
 ```

 The API takes takes the following:

 | Parameter    | Environment Variable | Default Value | Description                            |
 | ------------ | -------------------- | ------------- | -------------------------------------- |
-| model        | MODEL_PATH           |               | The path to the pre-trained GPT-based model.      |
+| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
 | threads      | THREADS              | CPU cores     | The number of threads to use for text generation. |
 | address      | ADDRESS              | :8080         | The address and port to listen on. |
 | context-size | CONTEXT_SIZE         | 512           | Default token context size. |
-| alpaca       | ALPACA               | true          | Set to true for alpaca models. |
-| gpt4all       | GPT4ALL               | false          | Set to true for gpt4all models. |

+Once the server is running, you can start making requests to it using HTTP, using the OpenAI API. 

-Once the server is running, you can start making requests to it using HTTP. For example, to generate text based on an instruction, you can send a POST request to the `/predict` endpoint with the instruction as the request body:
+### Supported OpenAI API endpoints
+
+You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create). 
+
+The following is the list of supported endpoints and parameters.
+
+#### Chat completions
+
+For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:

 ```
-curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
-    "text": "What is an alpaca?",
-    "topP": 0.8,
-    "topK": 50,
-    "temperature": 0.7,
-    "tokens": 100
-}'
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "messages": [{"role": "user", "content": "Say this is a test!"}],
+     "temperature": 0.7
+   }'
 ```

+Available additional parameters: `top_p`, `top_k`, `max_tokens`
+
+#### Completions
+
+For example, to generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
+```
+curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
+     "model": "ggml-koala-7b-model-q4_0-r2.bin",
+     "prompt": "A long time ago in a galaxy far, far away",
+     "temperature": 0.7
+   }'
+```
+
+Available additional parameters: `top_p`, `top_k`, `max_tokens`
+
+#### List models
+
+You can list all the models available with:
+
+```
+curl http://localhost:8080/v1/models
+```
+
+## Web interface
+
 There is also available a simple web interface (for instance, http://localhost:8080/) which can be used as a playground.

 Note: The API doesn't inject a template for talking to the instance, while the CLI does. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release, for instance:
@@ -115,18 +199,9 @@ Below is an instruction that describes a task. Write a response that appropriate
 ### Response:
 ```

+
 ## Using other models

-You can specify a model binary to be used for inference with `--model`.
-
-13B and 30B alpaca models are known to work:
-
-```
-# Download the model image, extract the model
-# Use the model with llama-cli
-docker run -v $PWD:/models -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.4 api --model /models/model.bin
-```
-
 gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted (same applies for old alpaca models, too):

 ```bash
@@ -154,7 +229,7 @@ import (

 func main() {

-	cli := client.NewClient("http://ip:30007")
+	cli := client.NewClient("http://ip:port")

 	out, err := cli.Predict("What's an alpaca?")
 	if err != nil {
@@ -185,9 +260,8 @@ In order to build the `llama-cli` container image locally you can use `docker`:

 ```
 # build the image as "alpaca-image"
-docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v "$(pwd)":/workspace -v earthly-tmp:/tmp/earthly:rw earthly/earthly:v0.7.2 +image --IMAGE=alpaca-image
-# run the image
-docker run alpaca-image --instruction "What's an alpaca?"
+docker build -t llama-cli .
+docker run llama-cli --instruction "What's an alpaca?"
 ```

 Or build the binary with:
@@ -201,11 +275,11 @@ docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v

 ## Short-term roadmap

- Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
+- [x] Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
 - Binary releases (https://github.com/go-skynet/llama-cli/issues/6)
 - Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
- Multi-model support
- Full Deployment and compatibility with https://github.com/mckaywrigley/chatbot-ui
+- [x] Multi-model support
+- Have a webUI!

 ## License

--- a/api/api.go
+++ b/api/api.go
@@ -1,4 +1,4 @@
-package main
+package api

 import (
 	"embed"
@@ -8,6 +8,8 @@ import (
 	"strings"
 	"sync"

+	model "github.com/go-skynet/llama-cli/pkg/model"
+
 	llama "github.com/go-skynet/go-llama.cpp"
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/cors"
@@ -40,20 +42,31 @@ type OpenAIModel struct {
 	Object string `json:"object"`
 }

+type OpenAIRequest struct {
+	Model string `json:"model"`
+
+	// Prompt is read only by completion API calls
+	Prompt string `json:"prompt"`
+	// Messages is read only by chat/completion API calls
+	Messages []Message `json:"messages"`
+
+	// Common options between all the API calls
+	TopP        float64 `json:"top_p"`
+	TopK        int     `json:"top_k"`
+	Temperature float64 `json:"temperature"`
+	Maxtokens   int     `json:"max_tokens"`
+}
+
 //go:embed index.html
 var indexHTML embed.FS

-func completionEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
+func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-
 		var err error
 		var model *llama.LLama

+		input := new(OpenAIRequest)
 		// Get input data from the request body
-		input := new(struct {
-			Model  string `json:"model"`
-			Prompt string `json:"prompt"`
-		})
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
@@ -88,128 +101,35 @@ func completionEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads
 		}

 		// Set the parameters for the language model prediction
-		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
-		if err != nil {
-			return err
+		topP := input.TopP
+		if topP == 0 {
+			topP = 0.7
+		}
+		topK := input.TopK
+		if topK == 0 {
+			topK = 80
 		}

-		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
-		if err != nil {
-			return err
+		temperature := input.Temperature
+		if temperature == 0 {
+			temperature = 0.9
 		}

-		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
-		if err != nil {
-			return err
-		}
-
-		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
-		if err != nil {
-			return err
+		tokens := input.Maxtokens
+		if tokens == 0 {
+			tokens = 512
 		}

 		predInput := input.Prompt
-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		templatedInput, err := loader.TemplatePrefix(input.Model, struct {
-			Input string
-		}{Input: input.Prompt})
-		if err == nil {
-			predInput = templatedInput
-		}
-
-		// Generate the prediction using the language model
-		prediction, err := model.Predict(
-			predInput,
-			llama.SetTemperature(temperature),
-			llama.SetTopP(topP),
-			llama.SetTopK(topK),
-			llama.SetTokens(tokens),
-			llama.SetThreads(threads),
-		)
-		if err != nil {
-			return err
-		}
-
-		// Return the prediction in the response body
-		return c.JSON(OpenAIResponse{
-			Model:   input.Model,
-			Choices: []Choice{{Text: prediction}},
-		})
-	}
-}
-
-func chatEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-
-		var err error
-		var model *llama.LLama
-
-		// Get input data from the request body
-		input := new(struct {
-			Messages []Message `json:"messages"`
-			Model    string    `json:"model"`
-		})
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		if input.Model == "" {
-			if defaultModel == nil {
-				return fmt.Errorf("no default model loaded, and no model specified")
+		if chat {
+			mess := []string{}
+			for _, i := range input.Messages {
+				mess = append(mess, i.Content)
 			}
-			model = defaultModel
-		} else {
-			model, err = loader.LoadModel(input.Model)
-			if err != nil {
-				return err
-			}
-		}

-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		if input.Model != "" {
-			mutexMap.Lock()
-			l, ok := mutexes[input.Model]
-			if !ok {
-				m := &sync.Mutex{}
-				mutexes[input.Model] = m
-				l = m
-			}
-			mutexMap.Unlock()
-			l.Lock()
-			defer l.Unlock()
-		} else {
-			defaultMutex.Lock()
-			defer defaultMutex.Unlock()
+			predInput = strings.Join(mess, "\n")
 		}

-		// Set the parameters for the language model prediction
-		topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
-		if err != nil {
-			return err
-		}
-
-		topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
-		if err != nil {
-			return err
-		}
-
-		temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
-		if err != nil {
-			return err
-		}
-
-		tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
-		if err != nil {
-			return err
-		}
-
-		mess := []string{}
-		for _, i := range input.Messages {
-			mess = append(mess, i.Content)
-		}
-
-		predInput := strings.Join(mess, "\n")
-
 		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
 		templatedInput, err := loader.TemplatePrefix(input.Model, struct {
 			Input string
@@ -231,15 +151,23 @@ func chatEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, d
 			return err
 		}

+		if chat {
+			// Return the chat prediction in the response body
+			return c.JSON(OpenAIResponse{
+				Model:   input.Model,
+				Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
+			})
+		}
+
 		// Return the prediction in the response body
 		return c.JSON(OpenAIResponse{
 			Model:   input.Model,
-			Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
+			Choices: []Choice{{Text: prediction}},
 		})
 	}
 }

-func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, threads int) error {
+func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr string, threads int) error {
 	app := fiber.New()

 	// Default middleware config
@@ -252,8 +180,8 @@ func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, thre
 	var mumutex = &sync.Mutex{}

 	// openAI compatible API endpoint
-	app.Post("/v1/chat/completions", chatEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
-	app.Post("/v1/completions", completionEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
+	app.Post("/v1/chat/completions", openAIEndpoint(true, defaultModel, loader, threads, mutex, mumutex, mu))
+	app.Post("/v1/completions", openAIEndpoint(false, defaultModel, loader, threads, mutex, mumutex, mu))
 	app.Get("/v1/models", func(c *fiber.Ctx) error {
 		models, err := loader.ListModels()
 		if err != nil {
--- a/api/index.html
+++ b/api/index.html
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -0,0 +1,15 @@
+version: '3.6'
+
+services:
+  api:
+    image: quay.io/go-skynet/llama-cli:latest
+    build: .
+    volumes:
+      - ./models:/models
+    ports:
+      - 8080:8080
+    environment:
+      - MODELS_PATH=/models
+      - CONTEXT_SIZE=700
+      - THREADS=$THREADS
+    command: api
--- a/kubernetes/deployment.yaml
+++ b/kubernetes/deployment.yaml
@@ -25,7 +25,7 @@ spec:
        - name: llama
          args:
          - api
-          image: quay.io/go-skynet/llama-cli:v0.3
+          image: quay.io/go-skynet/llama-cli:latest
 ---
 apiVersion: v1
 kind: Service
--- a/main.go
+++ b/main.go
@@ -9,6 +9,9 @@ import (
 	"text/template"

 	llama "github.com/go-skynet/go-llama.cpp"
+	api "github.com/go-skynet/llama-cli/api"
+	model "github.com/go-skynet/llama-cli/pkg/model"
+
 	"github.com/urfave/cli/v2"
 )

@@ -159,7 +162,7 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
 						}
 					}

-					return api(defaultModel, NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
+					return api.Start(defaultModel, model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
 				},
 			},
 		},
--- a/models/.keep
+++ b/models/.keep
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -1,4 +1,4 @@
-package main
+package model

 import (
 	"bytes"
Author	SHA1	Message	Date
mudler	a9cd6b3ca3	ci: Fix tag detection for 'latest'	2023-04-13 01:37:09 +02:00
mudler	e786576b95	Update README	2023-04-13 01:28:15 +02:00
Ettore Di Giacinto	d426571789	Merge pull request #16 from go-skynet/fix_arm Drop armv7 builds	2023-04-13 01:21:58 +02:00
mudler	a896a2b5ad	Drop armv7 builds	2023-04-13 01:21:40 +02:00
Ettore Di Giacinto	8273cd5c04	Merge pull request #15 from go-skynet/docker-compose Add docker-compose file	2023-04-13 01:17:44 +02:00
mudler	16f1281d38	Minor workflow fixes	2023-04-13 01:16:13 +02:00
mudler	8042e9a2d6	Add docker-compose Fixes #14 Signed-off-by: mudler <mudler@c3os.io>	2023-04-13 01:13:14 +02:00
mudler	624092cb99	Update README	2023-04-12 00:07:30 +02:00
mudler	a422a883ac	Minor rephrasing	2023-04-12 00:04:15 +02:00
mudler	7858a97254	Update README	2023-04-12 00:02:47 +02:00
mudler	5556aa46dd	Small refinements and refactors	2023-04-12 00:02:39 +02:00
mudler	eb4257f946	Add .gitignore	2023-04-11 23:44:00 +02:00
mudler	ae30bd346d	Reorganize repository layout	2023-04-11 23:43:43 +02:00