Compare commits


64 Commits
v0.1 ... v0.9

Author SHA1 Message Date
Ettore Di Giacinto
e8eab66c30 Merge pull request #22 from go-skynet/update-llama.cpp
⬆️ Update go-llama.cpp to `llama.cpp-2f7c8e0`
2023-04-16 00:06:52 +02:00
mudler
a73a497143 Update llama.cpp 2023-04-15 23:57:00 +02:00
Ettore Di Giacinto
6aea515e1d Merge pull request #20 from go-skynet/mudler-patch-1
📖 Update README.md
2023-04-15 00:38:30 +02:00
Ettore Di Giacinto
dfc2b7e02a 📖 Update README.md 2023-04-15 00:38:18 +02:00
Ettore Di Giacinto
040290971c Merge pull request #19 from go-skynet/tags
Use tags for go-llama.cpp
2023-04-15 00:14:47 +02:00
mudler
553bad585e Use tags for go-llama.cpp 2023-04-15 00:07:39 +02:00
Ettore Di Giacinto
f76b612506 Merge pull request #17 from go-skynet/mudler-patch-1
Fix comment typo
2023-04-13 15:21:13 +02:00
Ettore Di Giacinto
c4e94c88d7 Fix comment typo
Thanks to @deadprogram for noticing it!
2023-04-13 15:20:51 +02:00
mudler
a9cd6b3ca3 ci: Fix tag detection for 'latest' 2023-04-13 01:37:09 +02:00
mudler
e786576b95 Update README 2023-04-13 01:28:15 +02:00
Ettore Di Giacinto
d426571789 Merge pull request #16 from go-skynet/fix_arm
Drop armv7 builds
2023-04-13 01:21:58 +02:00
mudler
a896a2b5ad Drop armv7 builds 2023-04-13 01:21:40 +02:00
Ettore Di Giacinto
8273cd5c04 Merge pull request #15 from go-skynet/docker-compose
Add docker-compose file
2023-04-13 01:17:44 +02:00
mudler
16f1281d38 Minor workflow fixes 2023-04-13 01:16:13 +02:00
mudler
8042e9a2d6 Add docker-compose
Fixes #14

Signed-off-by: mudler <mudler@c3os.io>
2023-04-13 01:13:14 +02:00
mudler
624092cb99 Update README 2023-04-12 00:07:30 +02:00
mudler
a422a883ac Minor rephrasing 2023-04-12 00:04:15 +02:00
mudler
7858a97254 Update README 2023-04-12 00:02:47 +02:00
mudler
5556aa46dd Small refinements and refactors 2023-04-12 00:02:39 +02:00
mudler
eb4257f946 Add .gitignore 2023-04-11 23:44:00 +02:00
mudler
ae30bd346d Reorganize repository layout 2023-04-11 23:43:43 +02:00
mudler
93d8977ba2 Return model list 2023-04-10 12:02:40 +02:00
mudler
f43aeeb4a1 Add both API endpoints (completion, chat) 2023-04-09 12:30:55 +02:00
mudler
c17dcc5e9d Allow to inject prompt as part of the call 2023-04-09 09:36:19 +02:00
mudler
4a932483e1 Small fixup to template loading 2023-04-08 11:59:40 +02:00
mudler
b710147b95 Add mutex on same models (parallel isn't supported yet) 2023-04-08 11:45:36 +02:00
mudler
ba70363330 Use template input 2023-04-08 11:24:25 +02:00
mudler
9fb581739b Allow to template model prompts inputs 2023-04-08 10:46:51 +02:00
mudler
48aca246e3 Drop unused interactive mode 2023-04-07 11:31:14 +02:00
mudler
12eee097b7 Make it compatible with openAI api, support multiple models
Signed-off-by: mudler <mudler@c3os.io>
2023-04-07 11:30:59 +02:00
mudler
b33d015b8c Use go-llama.cpp 2023-04-07 10:08:15 +02:00
Ettore Di Giacinto
b7c0a108f5 Update README.md 2023-04-05 22:28:03 +02:00
Ettore Di Giacinto
f694a89c28 Update README.md 2023-04-05 22:14:00 +02:00
Ettore Di Giacinto
be682e6c2f Update README.md
Add short-term roadmap and mention webui
2023-04-05 22:04:35 +02:00
mudler
bf85a31f9e Don't set a default model path 2023-04-05 22:00:15 +02:00
Ettore Di Giacinto
d69048e0b0 Update README.md 2023-04-05 00:41:02 +02:00
mudler
827f189163 Update README 2023-03-30 18:46:11 +02:00
mudler
a23deb5ec7 Drop duplicate target 2023-03-29 19:44:41 +02:00
mudler
999676b106 Add gpt4all instructions 2023-03-29 18:58:54 +02:00
mudler
c61b023bc8 Drop fat images, will document how to consume models 2023-03-29 18:55:24 +02:00
mudler
650a22aef1 Add compatibility to gpt4all models 2023-03-29 18:53:24 +02:00
mudler
17b1724f7c Update llama-go 2023-03-27 01:18:14 +02:00
mudler
e860e62036 Add mutex, build only lite images 2023-03-27 01:01:38 +02:00
Ettore Di Giacinto
1f45ff8cd6 Update README.md 2023-03-26 23:37:26 +02:00
mudler
abee34f60a Cleanup leftover 2023-03-25 01:10:50 +01:00
mudler
dbc70dc13c Add a simple web-page as index of the API for helping with inference testing 2023-03-25 01:09:51 +01:00
mudler
55142065eb Update README with building instructions 2023-03-24 01:11:13 +01:00
mudler
d83d2293b5 Update version in kubernetes deployment 2023-03-23 23:22:43 +01:00
mudler
467ce5a7aa Update models download instructions, update images 2023-03-23 22:06:41 +01:00
mudler
4c9c5ce4ce Update README on instruction on how to prompt with the API 2023-03-23 19:25:28 +01:00
mudler
6394d85ca2 Lower conversion parallelism 2023-03-23 19:22:23 +01:00
mudler
2b6a5aef5f Lower earthly parallelism 2023-03-23 19:17:15 +01:00
mudler
d191ecb9fe Disable release pipeline 2023-03-23 19:14:39 +01:00
mudler
e14e1b0a77 Update README 2023-03-23 18:57:25 +01:00
mudler
bffaf2aa42 Build images without model 2023-03-23 18:50:43 +01:00
mudler
d98d1fe55e Use models from model repository 2023-03-23 18:44:24 +01:00
mudler
0785cb6b0b Update README with 13B and 30B model instructions 2023-03-22 00:18:48 +01:00
mudler
f88d5ad829 Update MODEL_URL 2023-03-21 22:03:20 +01:00
Ettore Di Giacinto
c7119a2882 Use tagged image in kubernetes deployment 2023-03-21 21:33:11 +01:00
mudler
8324402b49 Add interactive.go 2023-03-21 19:21:58 +01:00
mudler
9ba30c9c44 Update llama-go, allow to set context-size and enable alpaca model by default 2023-03-21 19:20:23 +01:00
mudler
973042bb4c Update README to use tagged container images 2023-03-21 18:45:59 +01:00
mudler
3ed2888646 Update README 2023-03-20 23:26:29 +01:00
mudler
593ff6308c Add simple client 2023-03-20 23:25:39 +01:00
20 changed files with 1018 additions and 231 deletions

1
.dockerignore Normal file

@@ -0,0 +1 @@
models/*.bin

1
.env Normal file

@@ -0,0 +1 @@
THREADS=14


@@ -2,6 +2,7 @@
name: 'build container images'
on:
pull_request:
push:
branches:
- master
@@ -12,68 +13,42 @@ jobs:
docker:
runs-on: ubuntu-latest
steps:
- name: Release space from worker
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
df -h
- name: Checkout
uses: actions/checkout@v3
- name: Prepare
id: prep
run: |
DOCKER_IMAGE=quay.io/go-skynet/llama-cli
VERSION=latest
VERSION=master
SHORTREF=${GITHUB_SHA::8}
# If this is git tag, use the tag name as a docker tag
if [[ $GITHUB_REF == refs/tags/* ]]; then
VERSION=${GITHUB_REF#refs/tags/}
fi
TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}"
# If the VERSION looks like a version number, assume that
# this is the most recent version of the image and also
# tag it 'latest'.
if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
TAGS="$TAGS,${DOCKER_IMAGE}:latest"
fi
# Set output parameters.
echo ::set-output name=tags::${TAGS}
echo ::set-output name=docker_image::${DOCKER_IMAGE}
echo ::set-output name=image::${DOCKER_IMAGE}:${VERSION}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v2
@@ -81,7 +56,23 @@ jobs:
registry: quay.io
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- uses: earthly/actions/setup-earthly@v1
- name: Build
run: |
earthly --push +image-all --IMAGE=${{ steps.prep.outputs.image }}
if: github.event_name != 'pull_request'
uses: docker/build-push-action@v4
with:
builder: ${{ steps.buildx.outputs.name }}
context: .
file: ./Dockerfile
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.prep.outputs.tags }}
- name: Build PRs
if: github.event_name == 'pull_request'
uses: docker/build-push-action@v4
with:
builder: ${{ steps.buildx.outputs.name }}
context: .
file: ./Dockerfile
platforms: linux/amd64
push: false
tags: ${{ steps.prep.outputs.tags }}
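
For reference, a quick Go check of the updated tag pattern used in the "Prepare" step above (only `v`-prefixed three-part version tags also receive the `latest` tag; the sample tags below are illustrative):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same pattern as the workflow above: v-prefixed semver tags are also tagged 'latest'.
	latest := regexp.MustCompile(`^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$`)
	for _, tag := range []string{"v0.9.0", "v0.9", "0.9.0", "master"} {
		fmt.Printf("%-8s latest=%v\n", tag, latest.MatchString(tag))
	}
}
```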

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
llama-cli
models/*.bin

18
Dockerfile Normal file

@@ -0,0 +1,18 @@
ARG GO_VERSION=1.20
ARG DEBIAN_VERSION=11
FROM golang:$GO_VERSION as builder
WORKDIR /build
ARG GO_LLAMA_CPP_TAG=llama.cpp-2f7c8e0
RUN git clone -b $GO_LLAMA_CPP_TAG --recurse-submodules https://github.com/go-skynet/go-llama.cpp
RUN cd go-llama.cpp && make libbinding.a
COPY go.mod ./
COPY go.sum ./
RUN go mod download
RUN apt-get update
COPY . .
RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
RUN C_INCLUDE_PATH=/build/go-llama.cpp LIBRARY_PATH=/build/go-llama.cpp go build -o llama-cli ./
FROM debian:$DEBIAN_VERSION
COPY --from=builder /build/llama-cli /usr/bin/llama-cli
ENTRYPOINT [ "/usr/bin/llama-cli" ]

Earthfile

@@ -1,40 +1,5 @@
VERSION 0.7
go-deps:
ARG GO_VERSION=1.20
FROM golang:$GO_VERSION
WORKDIR /build
COPY go.mod ./
COPY go.sum ./
RUN go mod download
RUN apt-get update
SAVE ARTIFACT go.mod AS LOCAL go.mod
SAVE ARTIFACT go.sum AS LOCAL go.sum
alpaca-model:
FROM alpine
# This is the alpaca.cpp model https://github.com/antimatter15/alpaca.cpp
ARG MODEL_URL=https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
RUN wget -O model.bin -c https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
SAVE ARTIFACT model.bin AS LOCAL model.bin
build:
FROM +go-deps
WORKDIR /build
RUN git clone https://github.com/go-skynet/llama
RUN cd llama && make libllama.a
COPY . .
RUN C_INCLUDE_PATH=/build/llama LIBRARY_PATH=/build/llama go build -o llama-cli ./
SAVE ARTIFACT llama-cli AS LOCAL llama-cli
image:
FROM +go-deps
ARG IMAGE=alpaca-cli
COPY +alpaca-model/model.bin /model.bin
COPY +build/llama-cli /llama-cli
ENV MODEL_PATH=/model.bin
ENTRYPOINT [ "/llama-cli" ]
SAVE IMAGE --push $IMAGE
image-all:
BUILD --platform=linux/amd64 --platform=linux/arm64 +image
FROM DOCKERFILE -f Dockerfile .
SAVE ARTIFACT /usr/bin/llama-cli AS LOCAL llama-cli

251
README.md

@@ -1,14 +1,70 @@
## :camel: llama-cli
llama-cli is a straightforward golang CLI interface for [llama.cpp](https://github.com/ggerganov/llama.cpp), providing a simple API and a command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
llama-cli is a straightforward golang CLI interface and OpenAI-compatible API for [llama.cpp](https://github.com/ggerganov/llama.cpp). It supports multiple models and also provides a simple command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
It is compatible with the models supported by `llama.cpp`. You might need to convert older models to the new format; see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all), for instance, to run `gpt4all`.
`llama-cli` doesn't shell-out, it uses https://github.com/go-skynet/go-llama.cpp, which is a golang binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
## Usage
You can use `docker-compose`:
```bash
git clone https://github.com/go-skynet/llama-cli
cd llama-cli
# copy your models to models/
cp your-model.bin models/
# (optional) Edit the .env file to set the number of concurrent threads used for inference
# echo "THREADS=14" > .env
# start with docker-compose
docker compose up -d --build
# Now API is accessible at localhost:8080
curl http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "your-model.bin",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
```
Note: You can use a default template for every model in your model path by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibling file, `foo.bin.tmpl`, which will be used as the default prompt; for instance, this can be used with alpaca:
```
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response:
```
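
For illustration, a minimal Go sketch of how such a `.tmpl` file is rendered (mirroring the `text/template` logic in `pkg/model/loader.go` further down in this diff; the file path and prompt below are placeholders):

```golang
package main

import (
	"bytes"
	"os"
	"text/template"
)

func main() {
	// Read the sibling template next to the model, e.g. models/foo.bin.tmpl (placeholder path).
	data, err := os.ReadFile("models/foo.bin.tmpl")
	if err != nil {
		panic(err)
	}
	tmpl, err := template.New("prompt").Parse(string(data))
	if err != nil {
		panic(err)
	}
	// The request prompt is injected where the template references {{.Input}}.
	var buf bytes.Buffer
	if err := tmpl.Execute(&buf, struct{ Input string }{Input: "What's an alpaca?"}); err != nil {
		panic(err)
	}
	os.Stdout.WriteString(buf.String())
}
```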
## Container images
The `llama-cli` [container images](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest) come preloaded with the [alpaca.cpp](https://github.com/antimatter15/alpaca.cpp) model, enabling you to start making predictions immediately! To begin, run:
`llama-cli` is shipped by default as a container image. You can check out all the available images with their corresponding tags [here](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest).
To begin, run:
```
docker run -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000
docker run -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model ...
```
Where `--model` is the path of the model you want to use.
Note: you need to mount a volume to the docker container in order to load a model, for instance:
```
# assuming your model is in /path/to/your/models/foo.bin
docker run -v /path/to/your/models:/models -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model /models/foo.bin
```
You will receive a response like the following:
@@ -19,7 +75,7 @@ An alpaca is a member of the South American Camelid family, which includes the l
## Basic usage
To use llama-cli, specify a pre-trained GPT-based model, an input text, and an instruction for text generation. llama-cli takes the following arguments:
To use llama-cli, specify a pre-trained GPT-based model, an input text, and an instruction for text generation. llama-cli takes the following arguments when running from the CLI:
```
llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--template <template_path>] [--tokens <num_tokens>] [--threads <num_threads>] [--temperature <temperature>] [--topp <top_p>] [--topk <top_k>]
@@ -33,10 +89,10 @@ llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
| tokens | TOKENS | 128 | The maximum number of tokens to generate. |
| threads | THREADS | NumCPU() | The number of threads to use for text generation. |
| temperature | TEMPERATURE | 0.95 | Sampling temperature for model output. |
| temperature | TEMPERATURE | 0.95 | Sampling temperature for model output (values between `0.1` and `1.0`). |
| top_p | TOP_P | 0.85 | The cumulative probability for top-p sampling. |
| top_k | TOP_K | 20 | The number of top-k tokens to consider for text generation. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
Here's an example of using `llama-cli`:
@@ -46,22 +102,15 @@ llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
This will generate text based on the given model and instruction.
## Advanced usage
## API
`llama-cli` also provides an API for running text generation as a service. You can start the API server using the following command:
`llama-cli` also provides an API for running text generation as a service. Once loaded for the first time, models are kept in memory.
Example of starting the API with `docker`:
```bash
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:latest api --models-path /path/to/models --context-size 700 --threads 4
```
llama-cli api --model <model_path> [--address <address>] [--threads <num_threads>]
```
The API takes the following arguments:
| Parameter | Environment Variable | Default Value | Description |
| ------------ | -------------------- | ------------- | -------------------------------------- |
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
| address | ADDRESS | :8080 | The address and port to listen on. |
And you'll see:
```
@@ -75,28 +124,170 @@ And you'll see:
└───────────────────────────────────────────────────┘
```
Once the server is running, you can make requests to it using HTTP. For example, to generate text based on an instruction, you can send a POST request to the `/predict` endpoint with the instruction as the request body:
Note: Model files have to end with `.bin`.
You can control the API server options with command line arguments:
```
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
"text": "What is an alpaca?",
"topP": 0.8,
"topK": 50,
"temperature": 0.7,
"tokens": 100
}'
llama-cli api --models-path <model_path> [--address <address>] [--threads <num_threads>]
```
Example of starting the API with `docker`:
The API takes the following:
| Parameter | Environment Variable | Default Value | Description |
| ------------ | -------------------- | ------------- | -------------------------------------- |
| models-path | MODELS_PATH | | The path where you have models (ending with `.bin`). |
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
| address | ADDRESS | :8080 | The address and port to listen on. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
Once the server is running, you can start making requests to it over HTTP, using the OpenAI API.
### Supported OpenAI API endpoints
You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create).
The following is the list of supported endpoints/parameters.
#### Chat completions
For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
```
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'
```
Available additional parameters: `top_p`, `top_k`, `max_tokens`
#### Completions
For example, to generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
```
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
```
Available additional parameters: `top_p`, `top_k`, `max_tokens`
#### List models
You can list all the models available with:
```
curl http://localhost:8080/v1/models
```
## Web interface
A simple web interface (for instance, at http://localhost:8080/) is also available and can be used as a playground.
Note: The API doesn't inject a template for talking to the instance, while the CLI does. You have to use a prompt similar to what's described in the stanford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release, for instance:
```
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
```
## Using other models
gpt4all (https://github.com/nomic-ai/gpt4all) works as well; however, the original model needs to be converted (the same applies to old alpaca models, too):
```bash
docker run -ti --rm quay.io/go-skynet/llama-cli:latest api
wget -O tokenizer.model https://huggingface.co/decapoda-research/llama-30b-hf/resolve/main/tokenizer.model
mkdir models
cp gpt4all.. models/
git clone https://gist.github.com/eiz/828bddec6162a023114ce19146cb2b82
pip install sentencepiece
python 828bddec6162a023114ce19146cb2b82/gistfile1.txt models tokenizer.model
# There will be a new model with the ".tmp" extension, you have to use that one!
```
### Golang client API
The `llama-cli` codebase also includes a small client in Go that can be used alongside the API:
```golang
package main
import (
"fmt"
client "github.com/go-skynet/llama-cli/client"
)
func main() {
cli := client.NewClient("http://ip:port")
out, err := cli.Predict("What's an alpaca?")
if err != nil {
panic(err)
}
fmt.Println(out)
}
```
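
Prediction parameters can be tuned with the functional options defined in `client/options.go` (shown further down in this diff); a minimal sketch, with the endpoint and values as placeholders:

```golang
package main

import (
	"fmt"

	client "github.com/go-skynet/llama-cli/client"
)

func main() {
	cli := client.NewClient("http://ip:port")

	// Pass sampling options along with the prompt; server defaults apply for anything omitted.
	out, err := cli.Predict("What's an alpaca?",
		client.WithTopK(50),
		client.WithTopP(0.8),
		client.WithTemperature(0.7),
		client.WithTokens(100),
	)
	if err != nil {
		panic(err)
	}
	fmt.Println(out)
}
```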
### Windows compatibility
It should work; however, you need to make sure you give enough resources to the container. See https://github.com/go-skynet/llama-cli/issues/2
### Kubernetes
You can run the API directly in Kubernetes:
```bash
kubectl apply -f https://raw.githubusercontent.com/go-skynet/llama-cli/master/kubernetes/deployment.yaml
```
### Build locally
Pre-built images should fit most modern hardware; however, you can also build the images manually if needed.
In order to build the `llama-cli` container image locally you can use `docker`:
```
# build the image as "llama-cli"
docker build -t llama-cli .
docker run llama-cli --instruction "What's an alpaca?"
```
Or build the binary with:
```
# build the binary with earthly
docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v "$(pwd)":/workspace -v earthly-tmp:/tmp/earthly:rw earthly/earthly:v0.7.2 +build
# run the binary
./llama-cli --instruction "What's an alpaca?"
```
## Short-term roadmap
- [x] Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
- Binary releases (https://github.com/go-skynet/llama-cli/issues/6)
- Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
- [x] Multi-model support
- Have a webUI!
## License
MIT
## Acknowledgements
- [llama.cpp](https://github.com/ggerganov/llama.cpp)
- https://github.com/tatsu-lab/stanford_alpaca
- https://github.com/cornelk/llama-go for the initial ideas
- https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!)

83
api.go

@@ -1,83 +0,0 @@
package main
import (
"strconv"
llama "github.com/go-skynet/llama/go"
"github.com/gofiber/fiber/v2"
)
func api(model, listenAddr string, threads int) error {
app := fiber.New()
l, err := llama.New(model)
if err != nil {
return err
}
/*
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
"text": "What is an alpaca?",
"topP": 0.8,
"topK": 50,
"temperature": 0.7,
"tokens": 100
}'
*/
// Endpoint to generate the prediction
app.Post("/predict", func(c *fiber.Ctx) error {
// Get input data from the request body
input := new(struct {
Text string `json:"text"`
})
if err := c.BodyParser(input); err != nil {
return err
}
// Set the parameters for the language model prediction
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
if err != nil {
return err
}
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
if err != nil {
return err
}
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
if err != nil {
return err
}
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
if err != nil {
return err
}
// Generate the prediction using the language model
prediction, err := l.Predict(
input.Text,
llama.SetTemperature(temperature),
llama.SetTopP(topP),
llama.SetTopK(topK),
llama.SetTokens(tokens),
llama.SetThreads(threads),
)
if err != nil {
return err
}
// Return the prediction in the response body
return c.JSON(struct {
Prediction string `json:"prediction"`
}{
Prediction: prediction,
})
})
// Start the server
app.Listen(":8080")
return nil
}

276
api/api.go Normal file

@@ -0,0 +1,276 @@
package api
import (
"embed"
"fmt"
"net/http"
"strconv"
"strings"
"sync"
model "github.com/go-skynet/llama-cli/pkg/model"
llama "github.com/go-skynet/go-llama.cpp"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
"github.com/gofiber/fiber/v2/middleware/filesystem"
"github.com/gofiber/fiber/v2/middleware/recover"
)
type OpenAIResponse struct {
Created int `json:"created,omitempty"`
Object string `json:"chat.completion,omitempty"`
ID string `json:"id,omitempty"`
Model string `json:"model,omitempty"`
Choices []Choice `json:"choices,omitempty"`
}
type Choice struct {
Index int `json:"index,omitempty"`
FinishReason string `json:"finish_reason,omitempty"`
Message Message `json:"message,omitempty"`
Text string `json:"text,omitempty"`
}
type Message struct {
Role string `json:"role,omitempty"`
Content string `json:"content,omitempty"`
}
type OpenAIModel struct {
ID string `json:"id"`
Object string `json:"object"`
}
type OpenAIRequest struct {
Model string `json:"model"`
// Prompt is read only by completion API calls
Prompt string `json:"prompt"`
// Messages is read only by chat/completion API calls
Messages []Message `json:"messages"`
// Common options between all the API calls
TopP float64 `json:"top_p"`
TopK int `json:"top_k"`
Temperature float64 `json:"temperature"`
Maxtokens int `json:"max_tokens"`
}
//go:embed index.html
var indexHTML embed.FS
func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
var err error
var model *llama.LLama
input := new(OpenAIRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
if input.Model == "" {
if defaultModel == nil {
return fmt.Errorf("no default model loaded, and no model specified")
}
model = defaultModel
} else {
model, err = loader.LoadModel(input.Model)
if err != nil {
return err
}
}
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
if input.Model != "" {
mutexMap.Lock()
l, ok := mutexes[input.Model]
if !ok {
m := &sync.Mutex{}
mutexes[input.Model] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
} else {
defaultMutex.Lock()
defer defaultMutex.Unlock()
}
// Set the parameters for the language model prediction
topP := input.TopP
if topP == 0 {
topP = 0.7
}
topK := input.TopK
if topK == 0 {
topK = 80
}
temperature := input.Temperature
if temperature == 0 {
temperature = 0.9
}
tokens := input.Maxtokens
if tokens == 0 {
tokens = 512
}
predInput := input.Prompt
if chat {
mess := []string{}
for _, i := range input.Messages {
mess = append(mess, i.Content)
}
predInput = strings.Join(mess, "\n")
}
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
templatedInput, err := loader.TemplatePrefix(input.Model, struct {
Input string
}{Input: predInput})
if err == nil {
predInput = templatedInput
}
// Generate the prediction using the language model
prediction, err := model.Predict(
predInput,
llama.SetTemperature(temperature),
llama.SetTopP(topP),
llama.SetTopK(topK),
llama.SetTokens(tokens),
llama.SetThreads(threads),
)
if err != nil {
return err
}
if chat {
// Return the chat prediction in the response body
return c.JSON(OpenAIResponse{
Model: input.Model,
Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
})
}
// Return the prediction in the response body
return c.JSON(OpenAIResponse{
Model: input.Model,
Choices: []Choice{{Text: prediction}},
})
}
}
func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr string, threads int) error {
app := fiber.New()
// Default middleware config
app.Use(recover.New())
app.Use(cors.New())
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
var mutex = &sync.Mutex{}
mu := map[string]*sync.Mutex{}
var mumutex = &sync.Mutex{}
// openAI compatible API endpoint
app.Post("/v1/chat/completions", openAIEndpoint(true, defaultModel, loader, threads, mutex, mumutex, mu))
app.Post("/v1/completions", openAIEndpoint(false, defaultModel, loader, threads, mutex, mumutex, mu))
app.Get("/v1/models", func(c *fiber.Ctx) error {
models, err := loader.ListModels()
if err != nil {
return err
}
dataModels := []OpenAIModel{}
for _, m := range models {
dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
}
return c.JSON(struct {
Object string `json:"object"`
Data []OpenAIModel `json:"data"`
}{
Object: "list",
Data: dataModels,
})
})
app.Use("/", filesystem.New(filesystem.Config{
Root: http.FS(indexHTML),
NotFoundFile: "index.html",
}))
/*
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
"text": "What is an alpaca?",
"topP": 0.8,
"topK": 50,
"temperature": 0.7,
"tokens": 100
}'
*/
// Endpoint to generate the prediction
app.Post("/predict", func(c *fiber.Ctx) error {
mutex.Lock()
defer mutex.Unlock()
// Get input data from the request body
input := new(struct {
Text string `json:"text"`
})
if err := c.BodyParser(input); err != nil {
return err
}
// Set the parameters for the language model prediction
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
if err != nil {
return err
}
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
if err != nil {
return err
}
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
if err != nil {
return err
}
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
if err != nil {
return err
}
// Generate the prediction using the language model
prediction, err := defaultModel.Predict(
input.Text,
llama.SetTemperature(temperature),
llama.SetTopP(topP),
llama.SetTopK(topK),
llama.SetTokens(tokens),
llama.SetThreads(threads),
)
if err != nil {
return err
}
// Return the prediction in the response body
return c.JSON(struct {
Prediction string `json:"prediction"`
}{
Prediction: prediction,
})
})
// Start the server
app.Listen(listenAddr)
return nil
}
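
For a sense of how the OpenAI-compatible endpoint above is consumed, a minimal Go sketch that mirrors the request/response shapes defined in this file (the endpoint address and model name are placeholders):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// Local mirrors of the Message/OpenAIRequest/OpenAIResponse shapes used by the API.
type message struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}

type chatRequest struct {
	Model       string    `json:"model"`
	Messages    []message `json:"messages"`
	Temperature float64   `json:"temperature"`
}

type chatResponse struct {
	Choices []struct {
		Message message `json:"message"`
	} `json:"choices"`
}

func main() {
	body, _ := json.Marshal(chatRequest{
		Model:       "your-model.bin", // placeholder model file
		Messages:    []message{{Role: "user", Content: "Say this is a test!"}},
		Temperature: 0.7,
	})

	resp, err := http.Post("http://localhost:8080/v1/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out chatResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	if len(out.Choices) > 0 {
		fmt.Println(out.Choices[0].Message.Content)
	}
}
```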

120
api/index.html Normal file

@@ -0,0 +1,120 @@
<!DOCTYPE html>
<html>
<head>
<title>llama-cli</title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css" crossorigin="anonymous" referrerpolicy="no-referrer" />
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
</head>
<style>
@keyframes rotating {
from {
transform: rotate(0deg);
}
to {
transform: rotate(360deg);
}
}
.waiting {
animation: rotating 1s linear infinite;
}
</style>
<body>
<div class="container mt-5" x-data="{ templates:[
{
name: 'Alpaca: Instruction without input',
text: `Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Instruction}}
### Response:`,
},
{
name: 'Alpaca: Instruction with input',
text: `Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{{.Instruction}}
### Input:
{{.Input}}
### Response:`,
}
], selectedTemplate: '', selectedTemplateText: '' }">
<h1>llama-cli API</h1>
<div class="form-group">
<label for="inputText">Input Text:</label>
<textarea class="form-control" id="inputText" rows="6" placeholder="Your text input here..." x-text="selectedTemplateText"></textarea>
</div>
<div class="form-group">
<label for="templateSelect">Select Template:</label>
<select class="form-control" id="templateSelect" x-model="selectedTemplateText">
<option value="">None</option>
<template x-for="(template, index) in templates" :key="index">
<option :value="template.text" x-text="template.name"></option>
</template>
</select>
</div>
<div class="form-group">
<label for="topP">Top P:</label>
<input type="range" step="0.01" min="0" max="1" class="form-control" id="topP" value="0.20" name="topP" onchange="this.nextElementSibling.value = this.value" required>
<output>0.20</output>
</div>
<div class="form-group">
<label for="topK">Top K:</label>
<input type="number" class="form-control" id="topK" value="10000" name="topK" required>
</div>
<div class="form-group">
<label for="temperature">Temperature:</label>
<input type="range" step="0.01" min="0" max="1" value="0.9" class="form-control" id="temperature" name="temperature" onchange="this.nextElementSibling.value = this.value" required>
<output>0.9</output>
</div>
<div class="form-group">
<label for="tokens">Tokens:</label>
<input type="number" class="form-control" id="tokens" name="tokens" value="128" required>
</div>
<button class="btn btn-primary" x-on:click="submitRequest()">Submit <i class="fas fa-paper-plane"></i></button>
<hr>
<div class="form-group">
<label for="outputText">Output Text:</label>
<textarea class="form-control" id="outputText" rows="5" readonly></textarea>
</div>
</div>
<script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
<script>
function submitRequest() {
var button = document.querySelector("i.fa-paper-plane");
button.classList.add("waiting");
var text = document.getElementById("inputText").value;
var url = "/predict";
var data = {
"text": text,
"topP": document.getElementById("topP").value,
"topK": document.getElementById("topK").value,
"temperature": document.getElementById("temperature").value,
"tokens": document.getElementById("tokens").value
};
fetch(url, {
method: "POST",
headers: {
"Content-Type": "application/json"
},
body: JSON.stringify(data)
})
.then(response => response.json())
.then(data => {
document.getElementById("outputText").value = data.prediction;
button.classList.remove("waiting");
})
.catch(error => { console.error(error); button.classList.remove("waiting"); });
}
</script>
</body>
</html>

75
client/client.go Normal file

@@ -0,0 +1,75 @@
package client
import (
"bytes"
"encoding/json"
"fmt"
"net/http"
)
type Prediction struct {
Prediction string `json:"prediction"`
}
type Client struct {
baseURL string
client *http.Client
endpoint string
}
func NewClient(baseURL string) *Client {
return &Client{
baseURL: baseURL,
client: &http.Client{},
endpoint: "/predict",
}
}
type InputData struct {
Text string `json:"text"`
TopP float64 `json:"topP,omitempty"`
TopK int `json:"topK,omitempty"`
Temperature float64 `json:"temperature,omitempty"`
Tokens int `json:"tokens,omitempty"`
}
func (c *Client) Predict(text string, opts ...InputOption) (string, error) {
input := NewInputData(opts...)
input.Text = text
// encode input data to JSON format
inputBytes, err := json.Marshal(input)
if err != nil {
return "", err
}
// create HTTP request
url := c.baseURL + c.endpoint
req, err := http.NewRequest("POST", url, bytes.NewBuffer(inputBytes))
if err != nil {
return "", err
}
// set request headers
req.Header.Set("Content-Type", "application/json")
// send request and get response
resp, err := c.client.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("request failed with status %d", resp.StatusCode)
}
// decode response body to Prediction struct
var prediction Prediction
err = json.NewDecoder(resp.Body).Decode(&prediction)
if err != nil {
return "", err
}
return prediction.Prediction, nil
}

51
client/options.go Normal file

@@ -0,0 +1,51 @@
package client
import "net/http"
type ClientOption func(c *Client)
func WithHTTPClient(httpClient *http.Client) ClientOption {
return func(c *Client) {
c.client = httpClient
}
}
func WithEndpoint(endpoint string) ClientOption {
return func(c *Client) {
c.endpoint = endpoint
}
}
type InputOption func(d *InputData)
func NewInputData(opts ...InputOption) *InputData {
data := &InputData{}
for _, opt := range opts {
opt(data)
}
return data
}
func WithTopP(topP float64) InputOption {
return func(d *InputData) {
d.TopP = topP
}
}
func WithTopK(topK int) InputOption {
return func(d *InputData) {
d.TopK = topK
}
}
func WithTemperature(temperature float64) InputOption {
return func(d *InputData) {
d.Temperature = temperature
}
}
func WithTokens(tokens int) InputOption {
return func(d *InputData) {
d.Tokens = tokens
}
}

15
docker-compose.yaml Normal file

@@ -0,0 +1,15 @@
version: '3.6'
services:
api:
image: quay.io/go-skynet/llama-cli:latest
build: .
volumes:
- ./models:/models
ports:
- 8080:8080
environment:
- MODELS_PATH=/models
- CONTEXT_SIZE=700
- THREADS=$THREADS
command: api

4
go.mod

@@ -3,7 +3,7 @@ module github.com/go-skynet/llama-cli
go 1.19
require (
github.com/go-skynet/llama v0.0.0-20230319223917-0076188dd548
github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4
github.com/gofiber/fiber/v2 v2.42.0
github.com/urfave/cli/v2 v2.25.0
)
@@ -26,5 +26,5 @@ require (
github.com/valyala/fasthttp v1.44.0 // indirect
github.com/valyala/tcplisten v1.0.0 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab // indirect
golang.org/x/sys v0.6.0 // indirect
)

17
go.sum

@@ -2,10 +2,14 @@ github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/go-skynet/llama v0.0.0-20230319223917-0076188dd548 h1:wXcEESf+zNXidrSZoDKBoIJQDDMzUwVysLIbCkWGYvM=
github.com/go-skynet/llama v0.0.0-20230319223917-0076188dd548/go.mod h1:ZtYsAIud4cvP9VTTI9uhdgR1uCwaO/gGKnZZ95h9i7w=
github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4 h1:u/y9MlPHOeIj636IQmrf9ptMjjdgCVIcsfb7lMFh39M=
github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
github.com/gofiber/fiber/v2 v2.42.0 h1:Fnp7ybWvS+sjNQsFvkhf4G8OhXswvB6Vee8hM/LyS+8=
github.com/gofiber/fiber/v2 v2.42.0/go.mod h1:3+SGNjqMh5VQH5Vz2Wdi43zTIV16ktlFd3x3R6O1Zlc=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
@@ -17,6 +21,8 @@ github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPn
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ=
github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
@@ -50,6 +56,7 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -59,17 +66,21 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab h1:2QkjZIsXupsJbJIdSjjUOgWK3aEtzyuh2mPt3l/CkeU=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=

kubernetes/deployment.yaml

@@ -39,4 +39,4 @@ spec:
ports:
- protocol: TCP
port: 8080
targetPort: 8080
targetPort: 8080

113
main.go

@@ -8,7 +8,10 @@ import (
"runtime"
"text/template"
llama "github.com/go-skynet/llama/go"
llama "github.com/go-skynet/go-llama.cpp"
api "github.com/go-skynet/llama-cli/api"
model "github.com/go-skynet/llama-cli/pkg/model"
"github.com/urfave/cli/v2"
)
@@ -31,6 +34,11 @@ var nonEmptyInput string = `Below is an instruction that describes a task, paire
### Response:
`
func llamaFromOptions(ctx *cli.Context) (*llama.LLama, error) {
opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
return llama.New(ctx.String("model"), opts...)
}
func templateString(t string, in interface{}) (string, error) {
// Parse the template
tmpl, err := template.New("prompt").Parse(t)
@@ -46,12 +54,49 @@ func templateString(t string, in interface{}) (string, error) {
return buf.String(), nil
}
var modelFlags = []cli.Flag{
&cli.StringFlag{
Name: "model",
EnvVars: []string{"MODEL_PATH"},
},
&cli.IntFlag{
Name: "tokens",
EnvVars: []string{"TOKENS"},
Value: 128,
},
&cli.IntFlag{
Name: "context-size",
EnvVars: []string{"CONTEXT_SIZE"},
Value: 512,
},
&cli.IntFlag{
Name: "threads",
EnvVars: []string{"THREADS"},
Value: runtime.NumCPU(),
},
&cli.Float64Flag{
Name: "temperature",
EnvVars: []string{"TEMPERATURE"},
Value: 0.95,
},
&cli.Float64Flag{
Name: "topp",
EnvVars: []string{"TOP_P"},
Value: 0.85,
},
&cli.IntFlag{
Name: "topk",
EnvVars: []string{"TOP_K"},
Value: 20,
},
}
func main() {
app := &cli.App{
Name: "llama-cli",
Version: "0.1",
Usage: "llama-cli --model ... --instruction 'What is an alpaca?'",
Flags: []cli.Flag{
Flags: append(modelFlags,
&cli.StringFlag{
Name: "template",
EnvVars: []string{"TEMPLATE"},
@@ -63,37 +108,7 @@ func main() {
&cli.StringFlag{
Name: "input",
EnvVars: []string{"INPUT"},
},
&cli.StringFlag{
Name: "model",
EnvVars: []string{"MODEL_PATH"},
},
&cli.IntFlag{
Name: "tokens",
EnvVars: []string{"TOKENS"},
Value: 128,
},
&cli.IntFlag{
Name: "threads",
EnvVars: []string{"THREADS"},
Value: runtime.NumCPU(),
},
&cli.Float64Flag{
Name: "temperature",
EnvVars: []string{"TEMPERATURE"},
Value: 0.95,
},
&cli.Float64Flag{
Name: "topp",
EnvVars: []string{"TOP_P"},
Value: 0.85,
},
&cli.IntFlag{
Name: "topk",
EnvVars: []string{"TOP_K"},
Value: 20,
},
},
}),
Description: `Run llama.cpp inference`,
UsageText: `
llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
@@ -107,6 +122,7 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
Copyright: "go-skynet authors",
Commands: []*cli.Command{
{
Name: "api",
Flags: []cli.Flag{
&cli.IntFlag{
@@ -115,17 +131,38 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
Value: runtime.NumCPU(),
},
&cli.StringFlag{
Name: "model",
EnvVars: []string{"MODEL_PATH"},
Name: "models-path",
EnvVars: []string{"MODELS_PATH"},
},
&cli.StringFlag{
Name: "default-model",
EnvVars: []string{"default-model"},
},
&cli.StringFlag{
Name: "address",
EnvVars: []string{"ADDRESS"},
Value: ":8080",
},
&cli.IntFlag{
Name: "context-size",
EnvVars: []string{"CONTEXT_SIZE"},
Value: 512,
},
},
Action: func(ctx *cli.Context) error {
return api(ctx.String("model"), ctx.String("address"), ctx.Int("threads"))
var defaultModel *llama.LLama
defModel := ctx.String("default-model")
if defModel != "" {
opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
var err error
defaultModel, err = llama.New(ctx.String("default-model"), opts...)
if err != nil {
return err
}
}
return api.Start(defaultModel, model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
},
},
},
@@ -179,11 +216,13 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
fmt.Println("Templating the input failed:", err.Error())
os.Exit(1)
}
l, err := llama.New(ctx.String("model"))
l, err := llamaFromOptions(ctx)
if err != nil {
fmt.Println("Loading the model failed:", err.Error())
os.Exit(1)
}
res, err := l.Predict(
str,
llama.SetTemperature(ctx.Float64("temperature")),

0
models/.keep Normal file

114
pkg/model/loader.go Normal file

@@ -0,0 +1,114 @@
package model
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"sync"
"text/template"
llama "github.com/go-skynet/go-llama.cpp"
)
type ModelLoader struct {
modelPath string
mu sync.Mutex
models map[string]*llama.LLama
promptsTemplates map[string]*template.Template
}
func NewModelLoader(modelPath string) *ModelLoader {
return &ModelLoader{modelPath: modelPath, models: make(map[string]*llama.LLama), promptsTemplates: make(map[string]*template.Template)}
}
func (ml *ModelLoader) ListModels() ([]string, error) {
files, err := ioutil.ReadDir(ml.modelPath)
if err != nil {
return []string{}, err
}
models := []string{}
for _, file := range files {
if strings.HasSuffix(file.Name(), ".bin") {
models = append(models, strings.TrimRight(file.Name(), ".bin"))
}
}
return models, nil
}
func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
m, ok := ml.promptsTemplates[modelName]
if !ok {
// try to find a s.bin
modelBin := fmt.Sprintf("%s.bin", modelName)
m, ok = ml.promptsTemplates[modelBin]
if !ok {
return "", fmt.Errorf("no prompt template available")
}
}
var buf bytes.Buffer
if err := m.Execute(&buf, in); err != nil {
return "", err
}
return buf.String(), nil
}
func (ml *ModelLoader) LoadModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
modelFile := filepath.Join(ml.modelPath, modelName)
if m, ok := ml.models[modelFile]; ok {
return m, nil
}
// Check if the model path exists
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
// try to find a s.bin
modelBin := fmt.Sprintf("%s.bin", modelFile)
if _, err := os.Stat(modelBin); os.IsNotExist(err) {
return nil, err
} else {
modelName = fmt.Sprintf("%s.bin", modelName)
modelFile = modelBin
}
}
// Load the model and keep it in memory for later use
model, err := llama.New(modelFile, opts...)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
modelTemplateFile := fmt.Sprintf("%s.tmpl", modelFile)
// Check if the model path exists
if _, err := os.Stat(modelTemplateFile); err == nil {
dat, err := os.ReadFile(modelTemplateFile)
if err != nil {
return nil, err
}
// Parse the template
tmpl, err := template.New("prompt").Parse(string(dat))
if err != nil {
return nil, err
}
ml.promptsTemplates[modelName] = tmpl
}
ml.models[modelFile] = model
return model, err
}
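
For illustration, a minimal sketch of the loader lifecycle behind the API server (module paths as declared in `go.mod` of this diff; the models directory and file name are placeholders):

```go
package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
	model "github.com/go-skynet/llama-cli/pkg/model"
)

func main() {
	// Point the loader at a directory of *.bin model files (placeholder path).
	loader := model.NewModelLoader("./models")

	names, err := loader.ListModels()
	if err != nil {
		panic(err)
	}
	fmt.Println("available models:", names)

	// LoadModel caches the instance, so repeated calls reuse the in-memory model.
	m, err := loader.LoadModel("foo.bin", llama.SetContext(512))
	if err != nil {
		panic(err)
	}

	// If models/foo.bin.tmpl exists, TemplatePrefix renders the prompt through it.
	prompt := "What's an alpaca?"
	if templated, err := loader.TemplatePrefix("foo.bin", struct{ Input string }{Input: prompt}); err == nil {
		prompt = templated
	}

	out, err := m.Predict(prompt, llama.SetTokens(128))
	if err != nil {
		panic(err)
	}
	fmt.Println(out)
}
```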