Compare commits

...

72 Commits

Author SHA1 Message Date
Ettore Di Giacinto
7fec26f5d3 Enhancements (#34)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-19 17:10:29 +02:00
Ettore Di Giacinto
a9a875ee2b ⬆️ Bump llama.cpp (#33)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-17 21:34:02 +02:00
Ettore Di Giacinto
db5ac715f3 Use a reasonable default context size (#31) 2023-04-17 18:45:42 +02:00
Ettore Di Giacinto
0b330d90ad feat: drop embedded webui (#27)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-16 10:46:20 +02:00
Ettore Di Giacinto
63601fabd1 feat: drop default model and llama-specific API (#26)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-16 10:40:50 +02:00
Ettore Di Giacinto
1370b4482f 📖 Add prompt-templates examples (#25)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-16 10:24:15 +02:00
Ettore Di Giacinto
b062f3142b feat: enhance API, expose more parameters (#24)
Signed-off-by: mudler <mudler@c3os.io>
2023-04-16 10:16:48 +02:00
Marc R Kellerman
c37175271f feature: makefile & updates (#23)
Co-authored-by: mudler <mudler@c3os.io>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2023-04-15 16:39:07 -07:00
Ettore Di Giacinto
e8eab66c30 Merge pull request #22 from go-skynet/update-llama.cpp
⬆️ Update go-llama.cpp to `llama.cpp-2f7c8e0`
2023-04-16 00:06:52 +02:00
mudler
a73a497143 Update llama.cpp 2023-04-15 23:57:00 +02:00
Ettore Di Giacinto
6aea515e1d Merge pull request #20 from go-skynet/mudler-patch-1
📖 Update README.md
2023-04-15 00:38:30 +02:00
Ettore Di Giacinto
dfc2b7e02a 📖 Update README.md 2023-04-15 00:38:18 +02:00
Ettore Di Giacinto
040290971c Merge pull request #19 from go-skynet/tags
Use tags for go-llama.cpp
2023-04-15 00:14:47 +02:00
mudler
553bad585e Use tags for go-llama.cpp 2023-04-15 00:07:39 +02:00
Ettore Di Giacinto
f76b612506 Merge pull request #17 from go-skynet/mudler-patch-1
Fix comment typo
2023-04-13 15:21:13 +02:00
Ettore Di Giacinto
c4e94c88d7 Fix comment typo
Thanks to @deadprogram for noticing it!
2023-04-13 15:20:51 +02:00
mudler
a9cd6b3ca3 ci: Fix tag detection for 'latest' 2023-04-13 01:37:09 +02:00
mudler
e786576b95 Update README 2023-04-13 01:28:15 +02:00
Ettore Di Giacinto
d426571789 Merge pull request #16 from go-skynet/fix_arm
Drop armv7 builds
2023-04-13 01:21:58 +02:00
mudler
a896a2b5ad Drop armv7 builds 2023-04-13 01:21:40 +02:00
Ettore Di Giacinto
8273cd5c04 Merge pull request #15 from go-skynet/docker-compose
Add docker-compose file
2023-04-13 01:17:44 +02:00
mudler
16f1281d38 Minor workflow fixes 2023-04-13 01:16:13 +02:00
mudler
8042e9a2d6 Add docker-compose
Fixes #14

Signed-off-by: mudler <mudler@c3os.io>
2023-04-13 01:13:14 +02:00
mudler
624092cb99 Update README 2023-04-12 00:07:30 +02:00
mudler
a422a883ac Minor rephrasing 2023-04-12 00:04:15 +02:00
mudler
7858a97254 Update README 2023-04-12 00:02:47 +02:00
mudler
5556aa46dd Small refinements and refactors 2023-04-12 00:02:39 +02:00
mudler
eb4257f946 Add .gitignore 2023-04-11 23:44:00 +02:00
mudler
ae30bd346d Reorganize repository layout 2023-04-11 23:43:43 +02:00
mudler
93d8977ba2 Return model list 2023-04-10 12:02:40 +02:00
mudler
f43aeeb4a1 Add both API endpoints (completion, chat) 2023-04-09 12:30:55 +02:00
mudler
c17dcc5e9d Allow to inject prompt as part of the call 2023-04-09 09:36:19 +02:00
mudler
4a932483e1 Small fixup to template loading 2023-04-08 11:59:40 +02:00
mudler
b710147b95 Add mutex on same models (parallel isn't supported yet) 2023-04-08 11:45:36 +02:00
mudler
ba70363330 Use template input 2023-04-08 11:24:25 +02:00
mudler
9fb581739b Allow to template model prompts inputs 2023-04-08 10:46:51 +02:00
mudler
48aca246e3 Drop unused interactive mode 2023-04-07 11:31:14 +02:00
mudler
12eee097b7 Make it compatible with openAI api, support multiple models
Signed-off-by: mudler <mudler@c3os.io>
2023-04-07 11:30:59 +02:00
mudler
b33d015b8c Use go-llama.cpp 2023-04-07 10:08:15 +02:00
Ettore Di Giacinto
b7c0a108f5 Update README.md 2023-04-05 22:28:03 +02:00
Ettore Di Giacinto
f694a89c28 Update README.md 2023-04-05 22:14:00 +02:00
Ettore Di Giacinto
be682e6c2f Update README.md
Add short-term roadmap and mention webui
2023-04-05 22:04:35 +02:00
mudler
bf85a31f9e Don't set a default model path 2023-04-05 22:00:15 +02:00
Ettore Di Giacinto
d69048e0b0 Update README.md 2023-04-05 00:41:02 +02:00
mudler
827f189163 Update README 2023-03-30 18:46:11 +02:00
mudler
a23deb5ec7 Drop duplicate target 2023-03-29 19:44:41 +02:00
mudler
999676b106 Add gpt4all instructions 2023-03-29 18:58:54 +02:00
mudler
c61b023bc8 Drop fat images, will document how to consume models 2023-03-29 18:55:24 +02:00
mudler
650a22aef1 Add compatibility to gpt4all models 2023-03-29 18:53:24 +02:00
mudler
17b1724f7c Update llama-go 2023-03-27 01:18:14 +02:00
mudler
e860e62036 Add mutex, build only lite images 2023-03-27 01:01:38 +02:00
Ettore Di Giacinto
1f45ff8cd6 Update README.md 2023-03-26 23:37:26 +02:00
mudler
abee34f60a Cleanup leftover 2023-03-25 01:10:50 +01:00
mudler
dbc70dc13c Add a simple web-page as index of the API for helping with inference testing 2023-03-25 01:09:51 +01:00
mudler
55142065eb Update README with building instructions 2023-03-24 01:11:13 +01:00
mudler
d83d2293b5 Update version in kubernetes deployment 2023-03-23 23:22:43 +01:00
mudler
467ce5a7aa Update models download instructions, update images 2023-03-23 22:06:41 +01:00
mudler
4c9c5ce4ce Update README on instruction on how to prompt with the API 2023-03-23 19:25:28 +01:00
mudler
6394d85ca2 Lower conversion parallelism 2023-03-23 19:22:23 +01:00
mudler
2b6a5aef5f Lower earthly parallelism 2023-03-23 19:17:15 +01:00
mudler
d191ecb9fe Disable release pipeline 2023-03-23 19:14:39 +01:00
mudler
e14e1b0a77 Update README 2023-03-23 18:57:25 +01:00
mudler
bffaf2aa42 Build images without model 2023-03-23 18:50:43 +01:00
mudler
d98d1fe55e Use models from model repository 2023-03-23 18:44:24 +01:00
mudler
0785cb6b0b Update README with 13B and 30B model instructions 2023-03-22 00:18:48 +01:00
mudler
f88d5ad829 Update MODEL_URL 2023-03-21 22:03:20 +01:00
Ettore Di Giacinto
c7119a2882 Use tagged image in kubernetes deployment 2023-03-21 21:33:11 +01:00
mudler
8324402b49 Add interactive.go 2023-03-21 19:21:58 +01:00
mudler
9ba30c9c44 Update llama-go, allow to set context-size and enable alpaca model by default 2023-03-21 19:20:23 +01:00
mudler
973042bb4c Update README to use tagged container images 2023-03-21 18:45:59 +01:00
mudler
3ed2888646 Update README 2023-03-20 23:26:29 +01:00
mudler
593ff6308c Add simple client 2023-03-20 23:25:39 +01:00
23 changed files with 929 additions and 232 deletions

1
.dockerignore Normal file
View File

@@ -0,0 +1 @@
models

3
.env Normal file
View File

@@ -0,0 +1,3 @@
THREADS=14
CONTEXT_SIZE=512
MODELS_PATH=/models

View File

@@ -2,6 +2,7 @@
name: 'build container images'
on:
pull_request:
push:
branches:
- master
@@ -12,68 +13,42 @@ jobs:
docker:
runs-on: ubuntu-latest
steps:
- name: Release space from worker
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
df -h
- name: Checkout
uses: actions/checkout@v3
- name: Prepare
id: prep
run: |
DOCKER_IMAGE=quay.io/go-skynet/llama-cli
VERSION=latest
VERSION=master
SHORTREF=${GITHUB_SHA::8}
# If this is git tag, use the tag name as a docker tag
if [[ $GITHUB_REF == refs/tags/* ]]; then
VERSION=${GITHUB_REF#refs/tags/}
fi
TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}"
# If the VERSION looks like a version number, assume that
# this is the most recent version of the image and also
# tag it 'latest'.
if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
TAGS="$TAGS,${DOCKER_IMAGE}:latest"
fi
# Set output parameters.
echo ::set-output name=tags::${TAGS}
echo ::set-output name=docker_image::${DOCKER_IMAGE}
echo ::set-output name=image::${DOCKER_IMAGE}:${VERSION}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v2
@@ -81,7 +56,23 @@ jobs:
registry: quay.io
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- uses: earthly/actions/setup-earthly@v1
- name: Build
run: |
earthly --push +image-all --IMAGE=${{ steps.prep.outputs.image }}
if: github.event_name != 'pull_request'
uses: docker/build-push-action@v4
with:
builder: ${{ steps.buildx.outputs.name }}
context: .
file: ./Dockerfile
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.prep.outputs.tags }}
- name: Build PRs
if: github.event_name == 'pull_request'
uses: docker/build-push-action@v4
with:
builder: ${{ steps.buildx.outputs.name }}
context: .
file: ./Dockerfile
platforms: linux/amd64
push: false
tags: ${{ steps.prep.outputs.tags }}

10
.gitignore vendored Normal file
View File

@@ -0,0 +1,10 @@
# go-llama build artifacts
go-llama
go-gpt4all-j
# llama-cli build binary
llama-cli
# Ignore models
models/*.bin
models/ggml-*

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Launch Go",
"type": "go",
"request": "launch",
"mode": "debug",
"program": "${workspaceFolder}/main.go",
"args": [
"api"
]
}
]
}

12
Dockerfile Normal file
View File

@@ -0,0 +1,12 @@
ARG GO_VERSION=1.20
ARG DEBIAN_VERSION=11
FROM golang:$GO_VERSION as builder
WORKDIR /build
RUN apt-get update && apt-get install -y cmake
COPY . .
ARG BUILD_TYPE=
RUN make build${BUILD_TYPE}
FROM debian:$DEBIAN_VERSION
COPY --from=builder /build/llama-cli /usr/bin/llama-cli
ENTRYPOINT [ "/usr/bin/llama-cli" ]

View File

@@ -1,40 +1,5 @@
VERSION 0.7
go-deps:
ARG GO_VERSION=1.20
FROM golang:$GO_VERSION
WORKDIR /build
COPY go.mod ./
COPY go.sum ./
RUN go mod download
RUN apt-get update
SAVE ARTIFACT go.mod AS LOCAL go.mod
SAVE ARTIFACT go.sum AS LOCAL go.sum
alpaca-model:
FROM alpine
# This is the alpaca.cpp model https://github.com/antimatter15/alpaca.cpp
ARG MODEL_URL=https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
RUN wget -O model.bin -c https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
SAVE ARTIFACT model.bin AS LOCAL model.bin
build:
FROM +go-deps
WORKDIR /build
RUN git clone https://github.com/go-skynet/llama
RUN cd llama && make libllama.a
COPY . .
RUN C_INCLUDE_PATH=/build/llama LIBRARY_PATH=/build/llama go build -o llama-cli ./
SAVE ARTIFACT llama-cli AS LOCAL llama-cli
image:
FROM +go-deps
ARG IMAGE=alpaca-cli
COPY +alpaca-model/model.bin /model.bin
COPY +build/llama-cli /llama-cli
ENV MODEL_PATH=/model.bin
ENTRYPOINT [ "/llama-cli" ]
SAVE IMAGE --push $IMAGE
image-all:
BUILD --platform=linux/amd64 --platform=linux/arm64 +image
FROM DOCKERFILE -f Dockerfile .
SAVE ARTIFACT /usr/bin/llama-cli AS LOCAL llama-cli

79
Makefile Normal file
View File

@@ -0,0 +1,79 @@
GOCMD=go
GOTEST=$(GOCMD) test
GOVET=$(GOCMD) vet
BINARY_NAME=llama-cli
GOLLAMA_VERSION?=llama.cpp-5ecff35
GREEN := $(shell tput -Txterm setaf 2)
YELLOW := $(shell tput -Txterm setaf 3)
WHITE := $(shell tput -Txterm setaf 7)
CYAN := $(shell tput -Txterm setaf 6)
RESET := $(shell tput -Txterm sgr0)
.PHONY: all test build vendor
all: help
## Build:
build: prepare ## Build the project
C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j LIBRARY_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j $(GOCMD) build -o $(BINARY_NAME) ./
buildgeneric: prepare-generic ## Build the project
C_INCLUDE_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j LIBRARY_PATH=$(shell pwd)/go-llama.cpp:$(shell pwd)/go-gpt4all-j $(GOCMD) build -o $(BINARY_NAME) ./
go-gpt4all-j:
git clone --recurse-submodules https://github.com/go-skynet/go-gpt4all-j.cpp go-gpt4all-j
# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
@find ./go-gpt4all-j -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
go-gpt4all-j/libgptj.a: go-gpt4all-j
$(MAKE) -C go-gpt4all-j libgptj.a
go-gpt4all-j/libgptj.a-generic: go-gpt4all-j
$(MAKE) -C go-gpt4all-j generic-libgptj.a
go-llama:
git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
$(MAKE) -C go-llama libbinding.a
go-llama-generic:
git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
$(MAKE) -C go-llama generic-libbinding.a
prepare: go-llama go-gpt4all-j/libgptj.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
prepare-generic: go-llama-generic go-gpt4all-j/libgptj.a-generic
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
clean: ## Remove build related file
rm -fr ./go-llama
rm -rf ./go-gpt4all-j
rm -rf $(BINARY_NAME)
## Run:
run: prepare
$(GOCMD) run ./ api
## Test:
test: ## Run the tests of the project
$(GOTEST) -v -race ./... $(OUTPUT_OPTIONS)
## Help:
help: ## Show this help.
@echo ''
@echo 'Usage:'
@echo ' ${YELLOW}make${RESET} ${GREEN}<target>${RESET}'
@echo ''
@echo 'Targets:'
@awk 'BEGIN {FS = ":.*?## "} { \
if (/^[a-zA-Z_-]+:.*?##.*$$/) {printf " ${YELLOW}%-20s${GREEN}%s${RESET}\n", $$1, $$2} \
else if (/^## .*$$/) {printf " ${CYAN}%s${RESET}\n", substr($$1,4)} \
}' $(MAKEFILE_LIST)

223
README.md
View File

@@ -1,14 +1,82 @@
## :camel: llama-cli
llama-cli is a straightforward golang CLI interface for [llama.cpp](https://github.com/ggerganov/llama.cpp), providing a simple API and a command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
llama-cli is a straightforward, drop-in replacement API compatible with OpenAI for local CPU inferencing, based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all) and [ggml](https://github.com/ggerganov/ggml), including support GPT4ALL-J which is Apache 2.0 Licensed and can be used for commercial purposes.
- OpenAI compatible API
- Supports multiple-models
- Once loaded the first time, it keep models loaded in memory for faster inference
- Provides a simple command line interface that allows text generation directly from the terminal
- Support for prompt templates
- Doesn't shell-out, but uses C bindings for a faster inference and better performance. Uses [go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) and [go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp).
## Model compatibility
It is compatible with the models supported by [llama.cpp](https://github.com/ggerganov/llama.cpp) and also [GPT4ALL-J](https://github.com/nomic-ai/gpt4all).
Note: You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
## Usage
The easiest way to run llama-cli is by using `docker-compose`:
```bash
git clone https://github.com/go-skynet/llama-cli
cd llama-cli
# copy your models to models/
cp your-model.bin models/
# (optional) Edit the .env file to set things like context size and threads
# vim .env
# start with docker-compose
docker compose up -d --build
# Now API is accessible at localhost:8080
curl http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "your-model.bin",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
```
Note: The API doesn't inject a default prompt for talking to the model, while the CLI does. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.
You can use a default template for every model present in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibiling file, `foo.bin.tmpl` which will be used as a default prompt, for instance this can be used with alpaca:
```
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response:
```
See the [prompt-templates](https://github.com/go-skynet/llama-cli/tree/master/prompt-templates) directory in this repository for templates for most popular models.
## Container images
The `llama-cli` [container images](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest) come preloaded with the [alpaca.cpp](https://github.com/antimatter15/alpaca.cpp) model, enabling you to start making predictions immediately! To begin, run:
`llama-cli` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest)
To begin, run:
```
docker run -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000
docker run -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model ...
```
Where `--model` is the path of the model you want to use.
Note: you need to mount a volume to the docker container in order to load a model, for instance:
```
# assuming your model is in /path/to/your/models/foo.bin
docker run -v /path/to/your/models:/models -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model /models/foo.bin
```
You will receive a response like the following:
@@ -19,7 +87,7 @@ An alpaca is a member of the South American Camelid family, which includes the l
## Basic usage
To use llama-cli, specify a pre-trained GPT-based model, an input text, and an instruction for text generation. llama-cli takes the following arguments:
To use llama-cli, specify a pre-trained GPT-based model, an input text, and an instruction for text generation. llama-cli takes the following arguments when running from the CLI:
```
llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--template <template_path>] [--tokens <num_tokens>] [--threads <num_threads>] [--temperature <temperature>] [--topp <top_p>] [--topk <top_k>]
@@ -30,13 +98,13 @@ llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--
| template | TEMPLATE | | A file containing a template for output formatting (optional). |
| instruction | INSTRUCTION | | Input prompt text or instruction. "-" for STDIN. |
| input | INPUT | - | Path to text or "-" for STDIN. |
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
| model | MODEL | | The path to the pre-trained GPT-based model. |
| tokens | TOKENS | 128 | The maximum number of tokens to generate. |
| threads | THREADS | NumCPU() | The number of threads to use for text generation. |
| temperature | TEMPERATURE | 0.95 | Sampling temperature for model output. |
| temperature | TEMPERATURE | 0.95 | Sampling temperature for model output. ( values between `0.1` and `1.0` ) |
| top_p | TOP_P | 0.85 | The cumulative probability for top-p sampling. |
| top_k | TOP_K | 20 | The number of top-k tokens to consider for text generation. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
Here's an example of using `llama-cli`:
@@ -46,22 +114,15 @@ llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
This will generate text based on the given model and instruction.
## Advanced usage
## API
`llama-cli` also provides an API for running text generation as a service. You can start the API server using the following command:
`llama-cli` also provides an API for running text generation as a service. The models once loaded the first time will be kept in memory.
Example of starting the API with `docker`:
```bash
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:latest api --models-path /path/to/models --context-size 700 --threads 4
```
llama-cli api --model <model_path> [--address <address>] [--threads <num_threads>]
```
The API takes takes the following arguments:
| Parameter | Environment Variable | Default Value | Description |
| ------------ | -------------------- | ------------- | -------------------------------------- |
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
| address | ADDRESS | :8080 | The address and port to listen on. |
And you'll see:
```
@@ -75,28 +136,128 @@ And you'll see:
└───────────────────────────────────────────────────┘
```
Once the server is running, you can make requests to it using HTTP. For example, to generate text based on an instruction, you can send a POST request to the `/predict` endpoint with the instruction as the request body:
Note: Models have to end up with `.bin`.
You can control the API server options with command line arguments:
```
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
"text": "What is an alpaca?",
"topP": 0.8,
"topK": 50,
"temperature": 0.7,
"tokens": 100
}'
llama-cli api --models-path <model_path> [--address <address>] [--threads <num_threads>]
```
Example of starting the API with `docker`:
The API takes takes the following:
| Parameter | Environment Variable | Default Value | Description |
| ------------ | -------------------- | ------------- | -------------------------------------- |
| models-path | MODELS_PATH | | The path where you have models (ending with `.bin`). |
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
| address | ADDRESS | :8080 | The address and port to listen on. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
Once the server is running, you can start making requests to it using HTTP, using the OpenAI API.
### Supported OpenAI API endpoints
You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create).
Following the list of endpoints/parameters supported.
#### Chat completions
For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
```
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'
```
Available additional parameters: `top_p`, `top_k`, `max_tokens`
#### Completions
For example, to generate a comletion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
```
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
```
Available additional parameters: `top_p`, `top_k`, `max_tokens`
#### List models
You can list all the models available with:
```
curl http://localhost:8080/v1/models
```
## Using other models
gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted (same applies for old alpaca models, too):
```bash
docker run -ti --rm quay.io/go-skynet/llama-cli:latest api
wget -O tokenizer.model https://huggingface.co/decapoda-research/llama-30b-hf/resolve/main/tokenizer.model
mkdir models
cp gpt4all.. models/
git clone https://gist.github.com/eiz/828bddec6162a023114ce19146cb2b82
pip install sentencepiece
python 828bddec6162a023114ce19146cb2b82/gistfile1.txt models tokenizer.model
# There will be a new model with the ".tmp" extension, you have to use that one!
```
### Windows compatibility
It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/llama-cli/issues/2
### Kubernetes
You can run the API directly in Kubernetes:
```bash
kubectl apply -f https://raw.githubusercontent.com/go-skynet/llama-cli/master/kubernetes/deployment.yaml
```
```
### Build locally
Pre-built images might fit well for most of the modern hardware, however you can and might need to build the images manually.
In order to build the `llama-cli` container image locally you can use `docker`:
```
# build the image as "alpaca-image"
docker build -t llama-cli .
docker run llama-cli --instruction "What's an alpaca?"
```
Or build the binary with:
```
# build the image as "alpaca-image"
docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v "$(pwd)":/workspace -v earthly-tmp:/tmp/earthly:rw earthly/earthly:v0.7.2 +build
# run the binary
./llama-cli --instruction "What's an alpaca?"
```
## Short-term roadmap
- [x] Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
- Binary releases (https://github.com/go-skynet/llama-cli/issues/6)
- Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
- [x] Multi-model support
- Have a webUI!
## License
MIT
## Acknowledgements
- [llama.cpp](https://github.com/ggerganov/llama.cpp)
- https://github.com/tatsu-lab/stanford_alpaca
- https://github.com/cornelk/llama-go for the initial ideas
- https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!)

83
api.go
View File

@@ -1,83 +0,0 @@
package main
import (
"strconv"
llama "github.com/go-skynet/llama/go"
"github.com/gofiber/fiber/v2"
)
func api(model, listenAddr string, threads int) error {
app := fiber.New()
l, err := llama.New(model)
if err != nil {
return err
}
/*
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
"text": "What is an alpaca?",
"topP": 0.8,
"topK": 50,
"temperature": 0.7,
"tokens": 100
}'
*/
// Endpoint to generate the prediction
app.Post("/predict", func(c *fiber.Ctx) error {
// Get input data from the request body
input := new(struct {
Text string `json:"text"`
})
if err := c.BodyParser(input); err != nil {
return err
}
// Set the parameters for the language model prediction
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
if err != nil {
return err
}
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
if err != nil {
return err
}
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
if err != nil {
return err
}
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
if err != nil {
return err
}
// Generate the prediction using the language model
prediction, err := l.Predict(
input.Text,
llama.SetTemperature(temperature),
llama.SetTopP(topP),
llama.SetTopK(topK),
llama.SetTokens(tokens),
llama.SetThreads(threads),
)
if err != nil {
return err
}
// Return the prediction in the response body
return c.JSON(struct {
Prediction string `json:"prediction"`
}{
Prediction: prediction,
})
})
// Start the server
app.Listen(":8080")
return nil
}

290
api/api.go Normal file
View File

@@ -0,0 +1,290 @@
package api
import (
"fmt"
"strings"
"sync"
model "github.com/go-skynet/llama-cli/pkg/model"
gptj "github.com/go-skynet/go-gpt4all-j.cpp"
llama "github.com/go-skynet/go-llama.cpp"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
"github.com/gofiber/fiber/v2/middleware/recover"
)
type OpenAIResponse struct {
Created int `json:"created,omitempty"`
Object string `json:"chat.completion,omitempty"`
ID string `json:"id,omitempty"`
Model string `json:"model,omitempty"`
Choices []Choice `json:"choices,omitempty"`
}
type Choice struct {
Index int `json:"index,omitempty"`
FinishReason string `json:"finish_reason,omitempty"`
Message *Message `json:"message,omitempty"`
Text string `json:"text,omitempty"`
}
type Message struct {
Role string `json:"role,omitempty"`
Content string `json:"content,omitempty"`
}
type OpenAIModel struct {
ID string `json:"id"`
Object string `json:"object"`
}
type OpenAIRequest struct {
Model string `json:"model"`
// Prompt is read only by completion API calls
Prompt string `json:"prompt"`
// Messages is read only by chat/completion API calls
Messages []Message `json:"messages"`
Echo bool `json:"echo"`
// Common options between all the API calls
TopP float64 `json:"top_p"`
TopK int `json:"top_k"`
Temperature float64 `json:"temperature"`
Maxtokens int `json:"max_tokens"`
N int `json:"n"`
// Custom parameters - not present in the OpenAI API
Batch int `json:"batch"`
F16 bool `json:"f16kv"`
IgnoreEOS bool `json:"ignore_eos"`
Seed int `json:"seed"`
}
// https://platform.openai.com/docs/api-reference/completions
func openAIEndpoint(chat bool, loader *model.ModelLoader, threads, ctx int, f16 bool, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
var err error
var model *llama.LLama
var gptModel *gptj.GPTJ
input := new(OpenAIRequest)
// Get input data from the request body
if err := c.BodyParser(input); err != nil {
return err
}
if input.Model == "" {
return fmt.Errorf("no model specified")
} else {
// Try to load the model with both
var llamaerr error
llamaOpts := []llama.ModelOption{}
if ctx != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(ctx))
}
if f16 {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
model, llamaerr = loader.LoadLLaMAModel(input.Model, llamaOpts...)
if llamaerr != nil {
gptModel, err = loader.LoadGPTJModel(input.Model)
if err != nil {
return fmt.Errorf("llama: %s gpt: %s", llamaerr.Error(), err.Error()) // llama failed first, so we want to catch both errors
}
}
}
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
if input.Model != "" {
mutexMap.Lock()
l, ok := mutexes[input.Model]
if !ok {
m := &sync.Mutex{}
mutexes[input.Model] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
} else {
defaultMutex.Lock()
defer defaultMutex.Unlock()
}
// Set the parameters for the language model prediction
topP := input.TopP
if topP == 0 {
topP = 0.7
}
topK := input.TopK
if topK == 0 {
topK = 80
}
temperature := input.Temperature
if temperature == 0 {
temperature = 0.9
}
tokens := input.Maxtokens
if tokens == 0 {
tokens = 512
}
predInput := input.Prompt
if chat {
mess := []string{}
for _, i := range input.Messages {
mess = append(mess, i.Content)
}
predInput = strings.Join(mess, "\n")
}
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
templatedInput, err := loader.TemplatePrefix(input.Model, struct {
Input string
}{Input: predInput})
if err == nil {
predInput = templatedInput
}
result := []Choice{}
n := input.N
if input.N == 0 {
n = 1
}
var predFunc func() (string, error)
switch {
case gptModel != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
predictOptions := []gptj.PredictOption{
gptj.SetTemperature(temperature),
gptj.SetTopP(topP),
gptj.SetTopK(topK),
gptj.SetTokens(tokens),
gptj.SetThreads(threads),
}
if input.Batch != 0 {
predictOptions = append(predictOptions, gptj.SetBatch(input.Batch))
}
if input.Seed != 0 {
predictOptions = append(predictOptions, gptj.SetSeed(input.Seed))
}
return gptModel.Predict(
predInput,
predictOptions...,
)
}
case model != nil:
predFunc = func() (string, error) {
// Generate the prediction using the language model
predictOptions := []llama.PredictOption{
llama.SetTemperature(temperature),
llama.SetTopP(topP),
llama.SetTopK(topK),
llama.SetTokens(tokens),
llama.SetThreads(threads),
}
if input.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(input.Batch))
}
if input.F16 {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if input.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if input.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(input.Seed))
}
return model.Predict(
predInput,
predictOptions...,
)
}
}
for i := 0; i < n; i++ {
var prediction string
prediction, err := predFunc()
if err != nil {
return err
}
if input.Echo {
prediction = predInput + prediction
}
if chat {
result = append(result, Choice{Message: &Message{Role: "assistant", Content: prediction}})
} else {
result = append(result, Choice{Text: prediction})
}
}
// Return the prediction in the response body
return c.JSON(OpenAIResponse{
Model: input.Model,
Choices: result,
})
}
}
func Start(loader *model.ModelLoader, listenAddr string, threads, ctxSize int, f16 bool) error {
app := fiber.New()
// Default middleware config
app.Use(recover.New())
app.Use(cors.New())
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
var mutex = &sync.Mutex{}
mu := map[string]*sync.Mutex{}
var mumutex = &sync.Mutex{}
// openAI compatible API endpoint
app.Post("/v1/chat/completions", openAIEndpoint(true, loader, threads, ctxSize, f16, mutex, mumutex, mu))
app.Post("/v1/completions", openAIEndpoint(false, loader, threads, ctxSize, f16, mutex, mumutex, mu))
app.Get("/v1/models", func(c *fiber.Ctx) error {
models, err := loader.ListModels()
if err != nil {
return err
}
dataModels := []OpenAIModel{}
for _, m := range models {
dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
}
return c.JSON(struct {
Object string `json:"object"`
Data []OpenAIModel `json:"data"`
}{
Object: "list",
Data: dataModels,
})
})
// Start the server
app.Listen(listenAddr)
return nil
}

30
docker-compose.yaml Normal file
View File

@@ -0,0 +1,30 @@
version: '3.6'
services:
# chatgpt:
# image: ghcr.io/mckaywrigley/chatbot-ui:main
# # platform: linux/amd64
# ports:
# - 3000:3000
# environment:
# - 'OPENAI_API_KEY=sk-000000000000000'
# - 'OPENAI_API_HOST=http://api:8080'
api:
image: quay.io/go-skynet/llama-cli:latest
build:
context: .
dockerfile: Dockerfile
# args:
# BUILD_TYPE: generic # Uncomment to build CPU generic code that works on most HW
ports:
- 8080:8080
environment:
- MODELS_PATH=$MODELS_PATH
- CONTEXT_SIZE=$CONTEXT_SIZE
- THREADS=$THREADS
volumes:
- ./models:/models:cached
command: api

5
go.mod
View File

@@ -3,7 +3,7 @@ module github.com/go-skynet/llama-cli
go 1.19
require (
github.com/go-skynet/llama v0.0.0-20230319223917-0076188dd548
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640
github.com/gofiber/fiber/v2 v2.42.0
github.com/urfave/cli/v2 v2.25.0
)
@@ -11,6 +11,7 @@ require (
require (
github.com/andybalholm/brotli v1.0.4 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/klauspost/compress v1.15.9 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
@@ -26,5 +27,5 @@ require (
github.com/valyala/fasthttp v1.44.0 // indirect
github.com/valyala/tcplisten v1.0.0 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab // indirect
golang.org/x/sys v0.6.0 // indirect
)

19
go.sum
View File

@@ -2,10 +2,16 @@ github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/go-skynet/llama v0.0.0-20230319223917-0076188dd548 h1:wXcEESf+zNXidrSZoDKBoIJQDDMzUwVysLIbCkWGYvM=
github.com/go-skynet/llama v0.0.0-20230319223917-0076188dd548/go.mod h1:ZtYsAIud4cvP9VTTI9uhdgR1uCwaO/gGKnZZ95h9i7w=
github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94 h1:rtrrMvlIq+g0/ltXjDdLeNtz0uc4wJ4Qs15GFU4ba4c=
github.com/go-skynet/go-gpt4all-j.cpp v0.0.0-20230419091210-303cf2a59a94/go.mod h1:5VZ9XbcINI0XcHhkcX8GPK8TplFGAzu1Hrg4tNiMCtI=
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640 h1:8SSVbQ3yvq7JnfLCLF4USV0PkQnnduUkaNCv/hHDa3E=
github.com/go-skynet/go-llama.cpp v0.0.0-20230415213228-bac222030640/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
github.com/gofiber/fiber/v2 v2.42.0 h1:Fnp7ybWvS+sjNQsFvkhf4G8OhXswvB6Vee8hM/LyS+8=
github.com/gofiber/fiber/v2 v2.42.0/go.mod h1:3+SGNjqMh5VQH5Vz2Wdi43zTIV16ktlFd3x3R6O1Zlc=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
@@ -17,6 +23,8 @@ github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPn
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ=
github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
@@ -50,6 +58,7 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -59,17 +68,21 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab h1:2QkjZIsXupsJbJIdSjjUOgWK3aEtzyuh2mPt3l/CkeU=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=

View File

@@ -39,4 +39,4 @@ spec:
ports:
- protocol: TCP
port: 8080
targetPort: 8080
targetPort: 8080

97
main.go
View File

@@ -8,7 +8,10 @@ import (
"runtime"
"text/template"
llama "github.com/go-skynet/llama/go"
llama "github.com/go-skynet/go-llama.cpp"
api "github.com/go-skynet/llama-cli/api"
model "github.com/go-skynet/llama-cli/pkg/model"
"github.com/urfave/cli/v2"
)
@@ -46,12 +49,49 @@ func templateString(t string, in interface{}) (string, error) {
return buf.String(), nil
}
var modelFlags = []cli.Flag{
&cli.StringFlag{
Name: "model",
EnvVars: []string{"MODEL"},
},
&cli.IntFlag{
Name: "tokens",
EnvVars: []string{"TOKENS"},
Value: 128,
},
&cli.IntFlag{
Name: "context-size",
EnvVars: []string{"CONTEXT_SIZE"},
Value: 512,
},
&cli.IntFlag{
Name: "threads",
EnvVars: []string{"THREADS"},
Value: runtime.NumCPU(),
},
&cli.Float64Flag{
Name: "temperature",
EnvVars: []string{"TEMPERATURE"},
Value: 0.95,
},
&cli.Float64Flag{
Name: "topp",
EnvVars: []string{"TOP_P"},
Value: 0.85,
},
&cli.IntFlag{
Name: "topk",
EnvVars: []string{"TOP_K"},
Value: 20,
},
}
func main() {
app := &cli.App{
Name: "llama-cli",
Version: "0.1",
Usage: "llama-cli --model ... --instruction 'What is an alpaca?'",
Flags: []cli.Flag{
Flags: append(modelFlags,
&cli.StringFlag{
Name: "template",
EnvVars: []string{"TEMPLATE"},
@@ -63,37 +103,7 @@ func main() {
&cli.StringFlag{
Name: "input",
EnvVars: []string{"INPUT"},
},
&cli.StringFlag{
Name: "model",
EnvVars: []string{"MODEL_PATH"},
},
&cli.IntFlag{
Name: "tokens",
EnvVars: []string{"TOKENS"},
Value: 128,
},
&cli.IntFlag{
Name: "threads",
EnvVars: []string{"THREADS"},
Value: runtime.NumCPU(),
},
&cli.Float64Flag{
Name: "temperature",
EnvVars: []string{"TEMPERATURE"},
Value: 0.95,
},
&cli.Float64Flag{
Name: "topp",
EnvVars: []string{"TOP_P"},
Value: 0.85,
},
&cli.IntFlag{
Name: "topk",
EnvVars: []string{"TOP_K"},
Value: 20,
},
},
}),
Description: `Run llama.cpp inference`,
UsageText: `
llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
@@ -107,25 +117,35 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
Copyright: "go-skynet authors",
Commands: []*cli.Command{
{
Name: "api",
Flags: []cli.Flag{
&cli.BoolFlag{
Name: "f16",
EnvVars: []string{"F16"},
},
&cli.IntFlag{
Name: "threads",
EnvVars: []string{"THREADS"},
Value: runtime.NumCPU(),
},
&cli.StringFlag{
Name: "model",
EnvVars: []string{"MODEL_PATH"},
Name: "models-path",
EnvVars: []string{"MODELS_PATH"},
},
&cli.StringFlag{
Name: "address",
EnvVars: []string{"ADDRESS"},
Value: ":8080",
},
&cli.IntFlag{
Name: "context-size",
EnvVars: []string{"CONTEXT_SIZE"},
Value: 512,
},
},
Action: func(ctx *cli.Context) error {
return api(ctx.String("model"), ctx.String("address"), ctx.Int("threads"))
return api.Start(model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"), ctx.Int("context-size"), ctx.Bool("f16"))
},
},
},
@@ -179,11 +199,14 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
fmt.Println("Templating the input failed:", err.Error())
os.Exit(1)
}
l, err := llama.New(ctx.String("model"))
opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
l, err := llama.New(ctx.String("model"), opts...)
if err != nil {
fmt.Println("Loading the model failed:", err.Error())
os.Exit(1)
}
res, err := l.Predict(
str,
llama.SetTemperature(ctx.Float64("temperature")),

0
models/.keep Normal file
View File

168
pkg/model/loader.go Normal file
View File

@@ -0,0 +1,168 @@
package model
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"sync"
"text/template"
gptj "github.com/go-skynet/go-gpt4all-j.cpp"
llama "github.com/go-skynet/go-llama.cpp"
)
type ModelLoader struct {
modelPath string
mu sync.Mutex
models map[string]*llama.LLama
gptmodels map[string]*gptj.GPTJ
promptsTemplates map[string]*template.Template
}
func NewModelLoader(modelPath string) *ModelLoader {
return &ModelLoader{modelPath: modelPath, gptmodels: make(map[string]*gptj.GPTJ), models: make(map[string]*llama.LLama), promptsTemplates: make(map[string]*template.Template)}
}
func (ml *ModelLoader) ListModels() ([]string, error) {
files, err := ioutil.ReadDir(ml.modelPath)
if err != nil {
return []string{}, err
}
models := []string{}
for _, file := range files {
if strings.HasSuffix(file.Name(), ".bin") {
models = append(models, strings.TrimRight(file.Name(), ".bin"))
}
}
return models, nil
}
func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
m, ok := ml.promptsTemplates[modelName]
if !ok {
// try to find a s.bin
modelBin := fmt.Sprintf("%s.bin", modelName)
m, ok = ml.promptsTemplates[modelBin]
if !ok {
return "", fmt.Errorf("no prompt template available")
}
}
var buf bytes.Buffer
if err := m.Execute(&buf, in); err != nil {
return "", err
}
return buf.String(), nil
}
func (ml *ModelLoader) loadTemplate(modelName, modelFile string) error {
modelTemplateFile := fmt.Sprintf("%s.tmpl", modelFile)
// Check if the model path exists
if _, err := os.Stat(modelTemplateFile); err != nil {
return nil
}
dat, err := os.ReadFile(modelTemplateFile)
if err != nil {
return err
}
// Parse the template
tmpl, err := template.New("prompt").Parse(string(dat))
if err != nil {
return err
}
ml.promptsTemplates[modelName] = tmpl
return nil
}
func (ml *ModelLoader) LoadGPTJModel(modelName string) (*gptj.GPTJ, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
modelFile := filepath.Join(ml.modelPath, modelName)
if m, ok := ml.gptmodels[modelFile]; ok {
return m, nil
}
// Check if the model path exists
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
// try to find a s.bin
modelBin := fmt.Sprintf("%s.bin", modelFile)
if _, err := os.Stat(modelBin); os.IsNotExist(err) {
return nil, err
} else {
modelName = fmt.Sprintf("%s.bin", modelName)
modelFile = modelBin
}
}
// Load the model and keep it in memory for later use
model, err := gptj.New(modelFile)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplate(modelName, modelFile); err != nil {
return nil, err
}
ml.gptmodels[modelFile] = model
return model, err
}
func (ml *ModelLoader) LoadLLaMAModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
ml.mu.Lock()
defer ml.mu.Unlock()
// Check if we already have a loaded model
modelFile := filepath.Join(ml.modelPath, modelName)
if m, ok := ml.models[modelFile]; ok {
return m, nil
}
// TODO: This needs refactoring, it's really bad to have it in here
// Check if we have a GPTJ model loaded instead
if _, ok := ml.gptmodels[modelFile]; ok {
return nil, fmt.Errorf("this model is a GPTJ one")
}
// Check if the model path exists
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
// try to find a s.bin
modelBin := fmt.Sprintf("%s.bin", modelFile)
if _, err := os.Stat(modelBin); os.IsNotExist(err) {
return nil, err
} else {
modelName = fmt.Sprintf("%s.bin", modelName)
modelFile = modelBin
}
}
// Load the model and keep it in memory for later use
model, err := llama.New(modelFile, opts...)
if err != nil {
return nil, err
}
// If there is a prompt template, load it
if err := ml.loadTemplate(modelName, modelFile); err != nil {
return nil, err
}
ml.models[modelFile] = model
return model, err
}

View File

@@ -0,0 +1,6 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response:

View File

@@ -0,0 +1,4 @@
The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
### Prompt:
{{.Input}}
### Response:

View File

@@ -0,0 +1 @@
BEGINNING OF CONVERSATION: USER: {{.Input}} GPT:

View File

@@ -0,0 +1,6 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response: