mirror of
https://github.com/mudler/LocalAI.git
synced 2026-02-03 11:13:31 -05:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a9cd6b3ca3 | ||
|
|
e786576b95 | ||
|
|
d426571789 | ||
|
|
a896a2b5ad | ||
|
|
8273cd5c04 | ||
|
|
16f1281d38 | ||
|
|
8042e9a2d6 | ||
|
|
624092cb99 | ||
|
|
a422a883ac | ||
|
|
7858a97254 | ||
|
|
5556aa46dd | ||
|
|
eb4257f946 | ||
|
|
ae30bd346d |
1
.dockerignore
Normal file
1
.dockerignore
Normal file
@@ -0,0 +1 @@
|
||||
models/*.bin
|
||||
69
.github/workflows/image.yml
vendored
69
.github/workflows/image.yml
vendored
@@ -2,6 +2,7 @@
|
||||
name: 'build container images'
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
@@ -12,68 +13,42 @@ jobs:
|
||||
docker:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Release space from worker
|
||||
run: |
|
||||
echo "Listing top largest packages"
|
||||
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
|
||||
head -n 30 <<< "${pkgs}"
|
||||
echo
|
||||
df -h
|
||||
echo
|
||||
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
|
||||
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
|
||||
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
|
||||
sudo rm -rf /usr/local/lib/android
|
||||
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
|
||||
sudo rm -rf /usr/share/dotnet
|
||||
sudo apt-get remove -y '^mono-.*' || true
|
||||
sudo apt-get remove -y '^ghc-.*' || true
|
||||
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
|
||||
sudo apt-get remove -y 'php.*' || true
|
||||
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
|
||||
sudo apt-get remove -y '^google-.*' || true
|
||||
sudo apt-get remove -y azure-cli || true
|
||||
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
|
||||
sudo apt-get remove -y '^gfortran-.*' || true
|
||||
sudo apt-get autoremove -y
|
||||
sudo apt-get clean
|
||||
echo
|
||||
echo "Listing top largest packages"
|
||||
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
|
||||
head -n 30 <<< "${pkgs}"
|
||||
echo
|
||||
sudo rm -rfv build || true
|
||||
df -h
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Prepare
|
||||
id: prep
|
||||
run: |
|
||||
DOCKER_IMAGE=quay.io/go-skynet/llama-cli
|
||||
VERSION=latest
|
||||
VERSION=master
|
||||
SHORTREF=${GITHUB_SHA::8}
|
||||
|
||||
# If this is git tag, use the tag name as a docker tag
|
||||
if [[ $GITHUB_REF == refs/tags/* ]]; then
|
||||
VERSION=${GITHUB_REF#refs/tags/}
|
||||
fi
|
||||
TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}"
|
||||
|
||||
# If the VERSION looks like a version number, assume that
|
||||
# this is the most recent version of the image and also
|
||||
# tag it 'latest'.
|
||||
if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
|
||||
if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
|
||||
TAGS="$TAGS,${DOCKER_IMAGE}:latest"
|
||||
fi
|
||||
|
||||
# Set output parameters.
|
||||
echo ::set-output name=tags::${TAGS}
|
||||
echo ::set-output name=docker_image::${DOCKER_IMAGE}
|
||||
echo ::set-output name=image::${DOCKER_IMAGE}:${VERSION}
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@master
|
||||
with:
|
||||
platforms: all
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@master
|
||||
|
||||
- name: Login to DockerHub
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v2
|
||||
@@ -81,9 +56,23 @@ jobs:
|
||||
registry: quay.io
|
||||
username: ${{ secrets.QUAY_USERNAME }}
|
||||
password: ${{ secrets.QUAY_PASSWORD }}
|
||||
- uses: earthly/actions/setup-earthly@v1
|
||||
- name: Build
|
||||
run: |
|
||||
earthly config "global.conversion_parallelism" "1"
|
||||
earthly config "global.buildkit_max_parallelism" "1"
|
||||
earthly --push +image-all --IMAGE=${{ steps.prep.outputs.image }}
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
builder: ${{ steps.buildx.outputs.name }}
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: true
|
||||
tags: ${{ steps.prep.outputs.tags }}
|
||||
- name: Build PRs
|
||||
if: github.event_name == 'pull_request'
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
builder: ${{ steps.buildx.outputs.name }}
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
platforms: linux/amd64
|
||||
push: false
|
||||
tags: ${{ steps.prep.outputs.tags }}
|
||||
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
llama-cli
|
||||
models/*.bin
|
||||
19
Dockerfile
Normal file
19
Dockerfile
Normal file
@@ -0,0 +1,19 @@
|
||||
ARG GO_VERSION=1.20
|
||||
ARG DEBIAN_VERSION=11
|
||||
|
||||
FROM golang:$GO_VERSION as builder
|
||||
|
||||
WORKDIR /build
|
||||
RUN git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp
|
||||
RUN cd go-llama.cpp && make libbinding.a
|
||||
COPY go.mod ./
|
||||
COPY go.sum ./
|
||||
RUN go mod download
|
||||
RUN apt-get update
|
||||
COPY . .
|
||||
RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
|
||||
RUN C_INCLUDE_PATH=/build/go-llama.cpp LIBRARY_PATH=/build/go-llama.cpp go build -o llama-cli ./
|
||||
|
||||
FROM debian:$DEBIAN_VERSION
|
||||
COPY --from=builder /build/llama-cli /usr/bin/llama-cli
|
||||
ENTRYPOINT [ "/usr/bin/llama-cli" ]
|
||||
31
Earthfile
31
Earthfile
@@ -1,32 +1,5 @@
|
||||
VERSION 0.7
|
||||
|
||||
go-deps:
|
||||
ARG GO_VERSION=1.20
|
||||
FROM golang:$GO_VERSION
|
||||
WORKDIR /build
|
||||
COPY go.mod ./
|
||||
COPY go.sum ./
|
||||
RUN go mod download
|
||||
RUN apt-get update
|
||||
SAVE ARTIFACT go.mod AS LOCAL go.mod
|
||||
SAVE ARTIFACT go.sum AS LOCAL go.sum
|
||||
|
||||
build:
|
||||
FROM +go-deps
|
||||
WORKDIR /build
|
||||
RUN git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp
|
||||
RUN cd go-llama.cpp && make libbinding.a
|
||||
COPY . .
|
||||
RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
|
||||
RUN C_INCLUDE_PATH=$GOPATH/src/github.com/go-skynet/go-llama.cpp LIBRARY_PATH=$GOPATH/src/github.com/go-skynet/go-llama.cpp go build -o llama-cli ./
|
||||
SAVE ARTIFACT llama-cli AS LOCAL llama-cli
|
||||
|
||||
image:
|
||||
FROM +go-deps
|
||||
ARG IMAGE=alpaca-cli-nomodel
|
||||
COPY +build/llama-cli /llama-cli
|
||||
ENTRYPOINT [ "/llama-cli" ]
|
||||
SAVE IMAGE --push $IMAGE
|
||||
|
||||
image-all:
|
||||
BUILD --platform=linux/amd64 --platform=linux/arm64 +image
|
||||
FROM DOCKERFILE -f Dockerfile .
|
||||
SAVE ARTIFACT /usr/bin/llama-cli AS LOCAL llama-cli
|
||||
|
||||
148
README.md
148
README.md
@@ -1,16 +1,70 @@
|
||||
## :camel: llama-cli
|
||||
|
||||
|
||||
llama-cli is a straightforward golang CLI interface for [llama.cpp](https://github.com/ggerganov/llama.cpp), providing a simple API and a command line interface that allows text generation using a GPT-based model like llama directly from the terminal. It is also compatible with [gpt4all](https://github.com/nomic-ai/gpt4all) and [alpaca](https://github.com/tatsu-lab/stanford_alpaca).
|
||||
llama-cli is a straightforward golang CLI interface and API compatible with OpenAI for [llama.cpp](https://github.com/ggerganov/llama.cpp), it supports multiple-models and also provides a simple command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
|
||||
|
||||
`llama-cli` uses https://github.com/go-skynet/llama, which is a fork of [llama.cpp](https://github.com/ggerganov/llama.cpp) providing golang binding.
|
||||
It is compatible with the models supported by `llama.cpp`. You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
|
||||
|
||||
`llama-cli` doesn't shell-out, it uses https://github.com/go-skynet/go-llama.cpp, which is a golang binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
|
||||
|
||||
## Usage
|
||||
|
||||
You can use `docker-compose`:
|
||||
|
||||
```bash
|
||||
|
||||
git clone https://github.com/go-skynet/llama-cli
|
||||
cd llama-cli
|
||||
|
||||
# copy your models to models/
|
||||
cp your-model.bin models/
|
||||
|
||||
# (optional) Edit the .env file to set the number of concurrent threads used for inference
|
||||
# echo "THREADS=14" > .env
|
||||
|
||||
# start with docker-compose
|
||||
docker compose up -d --build
|
||||
|
||||
# Now API is accessible at localhost:8080
|
||||
curl http://localhost:8080/v1/models
|
||||
# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
|
||||
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "your-model.bin",
|
||||
"prompt": "A long time ago in a galaxy far, far away",
|
||||
"temperature": 0.7
|
||||
}'
|
||||
|
||||
|
||||
```
|
||||
|
||||
Note: You can use a use a default template for every model in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibiling file, `foo.bin.tmpl` which will be used as a default prompt, for instance this can be used with alpaca:
|
||||
|
||||
```
|
||||
Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
|
||||
### Instruction:
|
||||
{{.Input}}
|
||||
|
||||
### Response:
|
||||
```
|
||||
|
||||
## Container images
|
||||
|
||||
`llama-cli` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest)
|
||||
|
||||
To begin, run:
|
||||
|
||||
```
|
||||
docker run -ti --rm quay.io/go-skynet/llama-cli:v0.4 --instruction "What's an alpaca?" --topk 10000 --model ...
|
||||
docker run -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model ...
|
||||
```
|
||||
|
||||
Where `--model` is the path of the model you want to use.
|
||||
|
||||
Note: you need to mount a volume to the docker container in order to load a model, for instance:
|
||||
|
||||
```
|
||||
# assuming your model is in /path/to/your/models/foo.bin
|
||||
docker run -v /path/to/your/models:/models -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model /models/foo.bin
|
||||
```
|
||||
|
||||
You will receive a response like the following:
|
||||
@@ -39,8 +93,6 @@ llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--
|
||||
| top_p | TOP_P | 0.85 | The cumulative probability for top-p sampling. |
|
||||
| top_k | TOP_K | 20 | The number of top-k tokens to consider for text generation. |
|
||||
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
|
||||
| alpaca | ALPACA | true | Set to true for alpaca models. |
|
||||
| gpt4all | GPT4ALL | false | Set to true for gpt4all models. |
|
||||
|
||||
Here's an example of using `llama-cli`:
|
||||
|
||||
@@ -50,14 +102,14 @@ llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
|
||||
|
||||
This will generate text based on the given model and instruction.
|
||||
|
||||
## Advanced usage
|
||||
## API
|
||||
|
||||
`llama-cli` also provides an API for running text generation as a service. The model will be pre-loaded and kept in memory.
|
||||
`llama-cli` also provides an API for running text generation as a service. The models once loaded the first time will be kept in memory.
|
||||
|
||||
Example of starting the API with `docker`:
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.4 api --context-size 700 --threads 4
|
||||
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:latest api --models-path /path/to/models --context-size 700 --threads 4
|
||||
```
|
||||
|
||||
And you'll see:
|
||||
@@ -72,36 +124,68 @@ And you'll see:
|
||||
└───────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Note: Models have to end up with `.bin`.
|
||||
|
||||
You can control the API server options with command line arguments:
|
||||
|
||||
```
|
||||
llama-cli api --model <model_path> [--address <address>] [--threads <num_threads>]
|
||||
llama-cli api --models-path <model_path> [--address <address>] [--threads <num_threads>]
|
||||
```
|
||||
|
||||
The API takes takes the following:
|
||||
|
||||
| Parameter | Environment Variable | Default Value | Description |
|
||||
| ------------ | -------------------- | ------------- | -------------------------------------- |
|
||||
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
|
||||
| models-path | MODELS_PATH | | The path where you have models (ending with `.bin`). |
|
||||
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
|
||||
| address | ADDRESS | :8080 | The address and port to listen on. |
|
||||
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
|
||||
| alpaca | ALPACA | true | Set to true for alpaca models. |
|
||||
| gpt4all | GPT4ALL | false | Set to true for gpt4all models. |
|
||||
|
||||
Once the server is running, you can start making requests to it using HTTP, using the OpenAI API.
|
||||
|
||||
Once the server is running, you can start making requests to it using HTTP. For example, to generate text based on an instruction, you can send a POST request to the `/predict` endpoint with the instruction as the request body:
|
||||
### Supported OpenAI API endpoints
|
||||
|
||||
You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create).
|
||||
|
||||
Following the list of endpoints/parameters supported.
|
||||
|
||||
#### Chat completions
|
||||
|
||||
For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
|
||||
|
||||
```
|
||||
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
|
||||
"text": "What is an alpaca?",
|
||||
"topP": 0.8,
|
||||
"topK": 50,
|
||||
"temperature": 0.7,
|
||||
"tokens": 100
|
||||
}'
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "ggml-koala-7b-model-q4_0-r2.bin",
|
||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
Available additional parameters: `top_p`, `top_k`, `max_tokens`
|
||||
|
||||
#### Completions
|
||||
|
||||
For example, to generate a comletion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
|
||||
```
|
||||
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "ggml-koala-7b-model-q4_0-r2.bin",
|
||||
"prompt": "A long time ago in a galaxy far, far away",
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
Available additional parameters: `top_p`, `top_k`, `max_tokens`
|
||||
|
||||
#### List models
|
||||
|
||||
You can list all the models available with:
|
||||
|
||||
```
|
||||
curl http://localhost:8080/v1/models
|
||||
```
|
||||
|
||||
## Web interface
|
||||
|
||||
There is also available a simple web interface (for instance, http://localhost:8080/) which can be used as a playground.
|
||||
|
||||
Note: The API doesn't inject a template for talking to the instance, while the CLI does. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release, for instance:
|
||||
@@ -115,18 +199,9 @@ Below is an instruction that describes a task. Write a response that appropriate
|
||||
### Response:
|
||||
```
|
||||
|
||||
|
||||
## Using other models
|
||||
|
||||
You can specify a model binary to be used for inference with `--model`.
|
||||
|
||||
13B and 30B alpaca models are known to work:
|
||||
|
||||
```
|
||||
# Download the model image, extract the model
|
||||
# Use the model with llama-cli
|
||||
docker run -v $PWD:/models -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.4 api --model /models/model.bin
|
||||
```
|
||||
|
||||
gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted (same applies for old alpaca models, too):
|
||||
|
||||
```bash
|
||||
@@ -154,7 +229,7 @@ import (
|
||||
|
||||
func main() {
|
||||
|
||||
cli := client.NewClient("http://ip:30007")
|
||||
cli := client.NewClient("http://ip:port")
|
||||
|
||||
out, err := cli.Predict("What's an alpaca?")
|
||||
if err != nil {
|
||||
@@ -185,9 +260,8 @@ In order to build the `llama-cli` container image locally you can use `docker`:
|
||||
|
||||
```
|
||||
# build the image as "alpaca-image"
|
||||
docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v "$(pwd)":/workspace -v earthly-tmp:/tmp/earthly:rw earthly/earthly:v0.7.2 +image --IMAGE=alpaca-image
|
||||
# run the image
|
||||
docker run alpaca-image --instruction "What's an alpaca?"
|
||||
docker build -t llama-cli .
|
||||
docker run llama-cli --instruction "What's an alpaca?"
|
||||
```
|
||||
|
||||
Or build the binary with:
|
||||
@@ -201,11 +275,11 @@ docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v
|
||||
|
||||
## Short-term roadmap
|
||||
|
||||
- Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
|
||||
- [x] Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
|
||||
- Binary releases (https://github.com/go-skynet/llama-cli/issues/6)
|
||||
- Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
|
||||
- Multi-model support
|
||||
- Full Deployment and compatibility with https://github.com/mckaywrigley/chatbot-ui
|
||||
- [x] Multi-model support
|
||||
- Have a webUI!
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package main
|
||||
package api
|
||||
|
||||
import (
|
||||
"embed"
|
||||
@@ -8,6 +8,8 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
model "github.com/go-skynet/llama-cli/pkg/model"
|
||||
|
||||
llama "github.com/go-skynet/go-llama.cpp"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/gofiber/fiber/v2/middleware/cors"
|
||||
@@ -40,20 +42,31 @@ type OpenAIModel struct {
|
||||
Object string `json:"object"`
|
||||
}
|
||||
|
||||
type OpenAIRequest struct {
|
||||
Model string `json:"model"`
|
||||
|
||||
// Prompt is read only by completion API calls
|
||||
Prompt string `json:"prompt"`
|
||||
// Messages is readh only by chat/completion API calls
|
||||
Messages []Message `json:"messages"`
|
||||
|
||||
// Common options between all the API calls
|
||||
TopP float64 `json:"top_p"`
|
||||
TopK int `json:"top_k"`
|
||||
Temperature float64 `json:"temperature"`
|
||||
Maxtokens int `json:"max_tokens"`
|
||||
}
|
||||
|
||||
//go:embed index.html
|
||||
var indexHTML embed.FS
|
||||
|
||||
func completionEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
|
||||
func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
var err error
|
||||
var model *llama.LLama
|
||||
|
||||
input := new(OpenAIRequest)
|
||||
// Get input data from the request body
|
||||
input := new(struct {
|
||||
Model string `json:"model"`
|
||||
Prompt string `json:"prompt"`
|
||||
})
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -88,128 +101,35 @@ func completionEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads
|
||||
}
|
||||
|
||||
// Set the parameters for the language model prediction
|
||||
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
|
||||
if err != nil {
|
||||
return err
|
||||
topP := input.TopP
|
||||
if topP == 0 {
|
||||
topP = 0.7
|
||||
}
|
||||
topK := input.TopK
|
||||
if topK == 0 {
|
||||
topK = 80
|
||||
}
|
||||
|
||||
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
|
||||
if err != nil {
|
||||
return err
|
||||
temperature := input.Temperature
|
||||
if temperature == 0 {
|
||||
temperature = 0.9
|
||||
}
|
||||
|
||||
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
|
||||
if err != nil {
|
||||
return err
|
||||
tokens := input.Maxtokens
|
||||
if tokens == 0 {
|
||||
tokens = 512
|
||||
}
|
||||
|
||||
predInput := input.Prompt
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := loader.TemplatePrefix(input.Model, struct {
|
||||
Input string
|
||||
}{Input: input.Prompt})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
}
|
||||
|
||||
// Generate the prediction using the language model
|
||||
prediction, err := model.Predict(
|
||||
predInput,
|
||||
llama.SetTemperature(temperature),
|
||||
llama.SetTopP(topP),
|
||||
llama.SetTopK(topK),
|
||||
llama.SetTokens(tokens),
|
||||
llama.SetThreads(threads),
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(OpenAIResponse{
|
||||
Model: input.Model,
|
||||
Choices: []Choice{{Text: prediction}},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func chatEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
var err error
|
||||
var model *llama.LLama
|
||||
|
||||
// Get input data from the request body
|
||||
input := new(struct {
|
||||
Messages []Message `json:"messages"`
|
||||
Model string `json:"model"`
|
||||
})
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if input.Model == "" {
|
||||
if defaultModel == nil {
|
||||
return fmt.Errorf("no default model loaded, and no model specified")
|
||||
if chat {
|
||||
mess := []string{}
|
||||
for _, i := range input.Messages {
|
||||
mess = append(mess, i.Content)
|
||||
}
|
||||
model = defaultModel
|
||||
} else {
|
||||
model, err = loader.LoadModel(input.Model)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
|
||||
if input.Model != "" {
|
||||
mutexMap.Lock()
|
||||
l, ok := mutexes[input.Model]
|
||||
if !ok {
|
||||
m := &sync.Mutex{}
|
||||
mutexes[input.Model] = m
|
||||
l = m
|
||||
}
|
||||
mutexMap.Unlock()
|
||||
l.Lock()
|
||||
defer l.Unlock()
|
||||
} else {
|
||||
defaultMutex.Lock()
|
||||
defer defaultMutex.Unlock()
|
||||
predInput = strings.Join(mess, "\n")
|
||||
}
|
||||
|
||||
// Set the parameters for the language model prediction
|
||||
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
mess := []string{}
|
||||
for _, i := range input.Messages {
|
||||
mess = append(mess, i.Content)
|
||||
}
|
||||
|
||||
predInput := strings.Join(mess, "\n")
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := loader.TemplatePrefix(input.Model, struct {
|
||||
Input string
|
||||
@@ -231,15 +151,23 @@ func chatEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, d
|
||||
return err
|
||||
}
|
||||
|
||||
if chat {
|
||||
// Return the chat prediction in the response body
|
||||
return c.JSON(OpenAIResponse{
|
||||
Model: input.Model,
|
||||
Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
|
||||
})
|
||||
}
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(OpenAIResponse{
|
||||
Model: input.Model,
|
||||
Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
|
||||
Choices: []Choice{{Text: prediction}},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, threads int) error {
|
||||
func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr string, threads int) error {
|
||||
app := fiber.New()
|
||||
|
||||
// Default middleware config
|
||||
@@ -252,8 +180,8 @@ func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, thre
|
||||
var mumutex = &sync.Mutex{}
|
||||
|
||||
// openAI compatible API endpoint
|
||||
app.Post("/v1/chat/completions", chatEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
|
||||
app.Post("/v1/completions", completionEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
|
||||
app.Post("/v1/chat/completions", openAIEndpoint(true, defaultModel, loader, threads, mutex, mumutex, mu))
|
||||
app.Post("/v1/completions", openAIEndpoint(false, defaultModel, loader, threads, mutex, mumutex, mu))
|
||||
app.Get("/v1/models", func(c *fiber.Ctx) error {
|
||||
models, err := loader.ListModels()
|
||||
if err != nil {
|
||||
15
docker-compose.yaml
Normal file
15
docker-compose.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
version: '3.6'
|
||||
|
||||
services:
|
||||
api:
|
||||
image: quay.io/go-skynet/llama-cli:latest
|
||||
build: .
|
||||
volumes:
|
||||
- ./models:/models
|
||||
ports:
|
||||
- 8080:8080
|
||||
environment:
|
||||
- MODELS_PATH=/models
|
||||
- CONTEXT_SIZE=700
|
||||
- THREADS=$THREADS
|
||||
command: api
|
||||
@@ -25,7 +25,7 @@ spec:
|
||||
- name: llama
|
||||
args:
|
||||
- api
|
||||
image: quay.io/go-skynet/llama-cli:v0.3
|
||||
image: quay.io/go-skynet/llama-cli:latest
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
|
||||
5
main.go
5
main.go
@@ -9,6 +9,9 @@ import (
|
||||
"text/template"
|
||||
|
||||
llama "github.com/go-skynet/go-llama.cpp"
|
||||
api "github.com/go-skynet/llama-cli/api"
|
||||
model "github.com/go-skynet/llama-cli/pkg/model"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
)
|
||||
|
||||
@@ -159,7 +162,7 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
|
||||
}
|
||||
}
|
||||
|
||||
return api(defaultModel, NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
|
||||
return api.Start(defaultModel, model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
0
models/.keep
Normal file
0
models/.keep
Normal file
@@ -1,4 +1,4 @@
|
||||
package main
|
||||
package model
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
Reference in New Issue
Block a user