Compare commits

..

13 Commits
v0.6 ... v0.8.1

Author SHA1 Message Date
mudler
a9cd6b3ca3 ci: Fix tag detection for 'latest' 2023-04-13 01:37:09 +02:00
mudler
e786576b95 Update README 2023-04-13 01:28:15 +02:00
Ettore Di Giacinto
d426571789 Merge pull request #16 from go-skynet/fix_arm
Drop armv7 builds
2023-04-13 01:21:58 +02:00
mudler
a896a2b5ad Drop armv7 builds 2023-04-13 01:21:40 +02:00
Ettore Di Giacinto
8273cd5c04 Merge pull request #15 from go-skynet/docker-compose
Add docker-compose file
2023-04-13 01:17:44 +02:00
mudler
16f1281d38 Minor workflow fixes 2023-04-13 01:16:13 +02:00
mudler
8042e9a2d6 Add docker-compose
Fixes #14

Signed-off-by: mudler <mudler@c3os.io>
2023-04-13 01:13:14 +02:00
mudler
624092cb99 Update README 2023-04-12 00:07:30 +02:00
mudler
a422a883ac Minor rephrasing 2023-04-12 00:04:15 +02:00
mudler
7858a97254 Update README 2023-04-12 00:02:47 +02:00
mudler
5556aa46dd Small refinements and refactors 2023-04-12 00:02:39 +02:00
mudler
eb4257f946 Add .gitignore 2023-04-11 23:44:00 +02:00
mudler
ae30bd346d Reorganize repository layout 2023-04-11 23:43:43 +02:00
14 changed files with 236 additions and 231 deletions

1
.dockerignore Normal file
View File

@@ -0,0 +1 @@
models/*.bin

1
.env Normal file
View File

@@ -0,0 +1 @@
THREADS=14

View File

@@ -2,6 +2,7 @@
name: 'build container images'
on:
pull_request:
push:
branches:
- master
@@ -12,68 +13,42 @@ jobs:
docker:
runs-on: ubuntu-latest
steps:
- name: Release space from worker
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
df -h
- name: Checkout
uses: actions/checkout@v3
- name: Prepare
id: prep
run: |
DOCKER_IMAGE=quay.io/go-skynet/llama-cli
VERSION=latest
VERSION=master
SHORTREF=${GITHUB_SHA::8}
# If this is git tag, use the tag name as a docker tag
if [[ $GITHUB_REF == refs/tags/* ]]; then
VERSION=${GITHUB_REF#refs/tags/}
fi
TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}"
# If the VERSION looks like a version number, assume that
# this is the most recent version of the image and also
# tag it 'latest'.
if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
TAGS="$TAGS,${DOCKER_IMAGE}:latest"
fi
# Set output parameters.
echo ::set-output name=tags::${TAGS}
echo ::set-output name=docker_image::${DOCKER_IMAGE}
echo ::set-output name=image::${DOCKER_IMAGE}:${VERSION}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v2
@@ -81,9 +56,23 @@ jobs:
registry: quay.io
username: ${{ secrets.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- uses: earthly/actions/setup-earthly@v1
- name: Build
run: |
earthly config "global.conversion_parallelism" "1"
earthly config "global.buildkit_max_parallelism" "1"
earthly --push +image-all --IMAGE=${{ steps.prep.outputs.image }}
if: github.event_name != 'pull_request'
uses: docker/build-push-action@v4
with:
builder: ${{ steps.buildx.outputs.name }}
context: .
file: ./Dockerfile
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.prep.outputs.tags }}
- name: Build PRs
if: github.event_name == 'pull_request'
uses: docker/build-push-action@v4
with:
builder: ${{ steps.buildx.outputs.name }}
context: .
file: ./Dockerfile
platforms: linux/amd64
push: false
tags: ${{ steps.prep.outputs.tags }}

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
llama-cli
models/*.bin

19
Dockerfile Normal file
View File

@@ -0,0 +1,19 @@
ARG GO_VERSION=1.20
ARG DEBIAN_VERSION=11
FROM golang:$GO_VERSION as builder
WORKDIR /build
RUN git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp
RUN cd go-llama.cpp && make libbinding.a
COPY go.mod ./
COPY go.sum ./
RUN go mod download
RUN apt-get update
COPY . .
RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
RUN C_INCLUDE_PATH=/build/go-llama.cpp LIBRARY_PATH=/build/go-llama.cpp go build -o llama-cli ./
FROM debian:$DEBIAN_VERSION
COPY --from=builder /build/llama-cli /usr/bin/llama-cli
ENTRYPOINT [ "/usr/bin/llama-cli" ]

View File

@@ -1,32 +1,5 @@
VERSION 0.7
go-deps:
ARG GO_VERSION=1.20
FROM golang:$GO_VERSION
WORKDIR /build
COPY go.mod ./
COPY go.sum ./
RUN go mod download
RUN apt-get update
SAVE ARTIFACT go.mod AS LOCAL go.mod
SAVE ARTIFACT go.sum AS LOCAL go.sum
build:
FROM +go-deps
WORKDIR /build
RUN git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp
RUN cd go-llama.cpp && make libbinding.a
COPY . .
RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
RUN C_INCLUDE_PATH=$GOPATH/src/github.com/go-skynet/go-llama.cpp LIBRARY_PATH=$GOPATH/src/github.com/go-skynet/go-llama.cpp go build -o llama-cli ./
SAVE ARTIFACT llama-cli AS LOCAL llama-cli
image:
FROM +go-deps
ARG IMAGE=alpaca-cli-nomodel
COPY +build/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]
SAVE IMAGE --push $IMAGE
image-all:
BUILD --platform=linux/amd64 --platform=linux/arm64 +image
FROM DOCKERFILE -f Dockerfile .
SAVE ARTIFACT /usr/bin/llama-cli AS LOCAL llama-cli

148
README.md
View File

@@ -1,16 +1,70 @@
## :camel: llama-cli
llama-cli is a straightforward golang CLI interface for [llama.cpp](https://github.com/ggerganov/llama.cpp), providing a simple API and a command line interface that allows text generation using a GPT-based model like llama directly from the terminal. It is also compatible with [gpt4all](https://github.com/nomic-ai/gpt4all) and [alpaca](https://github.com/tatsu-lab/stanford_alpaca).
llama-cli is a straightforward golang CLI interface and API compatible with OpenAI for [llama.cpp](https://github.com/ggerganov/llama.cpp), it supports multiple-models and also provides a simple command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
`llama-cli` uses https://github.com/go-skynet/llama, which is a fork of [llama.cpp](https://github.com/ggerganov/llama.cpp) providing golang binding.
It is compatible with the models supported by `llama.cpp`. You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
`llama-cli` doesn't shell-out, it uses https://github.com/go-skynet/go-llama.cpp, which is a golang binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
## Usage
You can use `docker-compose`:
```bash
git clone https://github.com/go-skynet/llama-cli
cd llama-cli
# copy your models to models/
cp your-model.bin models/
# (optional) Edit the .env file to set the number of concurrent threads used for inference
# echo "THREADS=14" > .env
# start with docker-compose
docker compose up -d --build
# Now API is accessible at localhost:8080
curl http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "your-model.bin",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
```
Note: You can use a use a default template for every model in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibiling file, `foo.bin.tmpl` which will be used as a default prompt, for instance this can be used with alpaca:
```
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{{.Input}}
### Response:
```
## Container images
`llama-cli` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest)
To begin, run:
```
docker run -ti --rm quay.io/go-skynet/llama-cli:v0.4 --instruction "What's an alpaca?" --topk 10000 --model ...
docker run -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model ...
```
Where `--model` is the path of the model you want to use.
Note: you need to mount a volume to the docker container in order to load a model, for instance:
```
# assuming your model is in /path/to/your/models/foo.bin
docker run -v /path/to/your/models:/models -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model /models/foo.bin
```
You will receive a response like the following:
@@ -39,8 +93,6 @@ llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--
| top_p | TOP_P | 0.85 | The cumulative probability for top-p sampling. |
| top_k | TOP_K | 20 | The number of top-k tokens to consider for text generation. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
| alpaca | ALPACA | true | Set to true for alpaca models. |
| gpt4all | GPT4ALL | false | Set to true for gpt4all models. |
Here's an example of using `llama-cli`:
@@ -50,14 +102,14 @@ llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
This will generate text based on the given model and instruction.
## Advanced usage
## API
`llama-cli` also provides an API for running text generation as a service. The model will be pre-loaded and kept in memory.
`llama-cli` also provides an API for running text generation as a service. The models once loaded the first time will be kept in memory.
Example of starting the API with `docker`:
```bash
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.4 api --context-size 700 --threads 4
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:latest api --models-path /path/to/models --context-size 700 --threads 4
```
And you'll see:
@@ -72,36 +124,68 @@ And you'll see:
└───────────────────────────────────────────────────┘
```
Note: Models have to end up with `.bin`.
You can control the API server options with command line arguments:
```
llama-cli api --model <model_path> [--address <address>] [--threads <num_threads>]
llama-cli api --models-path <model_path> [--address <address>] [--threads <num_threads>]
```
The API takes takes the following:
| Parameter | Environment Variable | Default Value | Description |
| ------------ | -------------------- | ------------- | -------------------------------------- |
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
| models-path | MODELS_PATH | | The path where you have models (ending with `.bin`). |
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
| address | ADDRESS | :8080 | The address and port to listen on. |
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
| alpaca | ALPACA | true | Set to true for alpaca models. |
| gpt4all | GPT4ALL | false | Set to true for gpt4all models. |
Once the server is running, you can start making requests to it using HTTP, using the OpenAI API.
Once the server is running, you can start making requests to it using HTTP. For example, to generate text based on an instruction, you can send a POST request to the `/predict` endpoint with the instruction as the request body:
### Supported OpenAI API endpoints
You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create).
Following the list of endpoints/parameters supported.
#### Chat completions
For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
```
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
"text": "What is an alpaca?",
"topP": 0.8,
"topK": 50,
"temperature": 0.7,
"tokens": 100
}'
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'
```
Available additional parameters: `top_p`, `top_k`, `max_tokens`
#### Completions
For example, to generate a comletion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
```
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "ggml-koala-7b-model-q4_0-r2.bin",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
```
Available additional parameters: `top_p`, `top_k`, `max_tokens`
#### List models
You can list all the models available with:
```
curl http://localhost:8080/v1/models
```
## Web interface
There is also available a simple web interface (for instance, http://localhost:8080/) which can be used as a playground.
Note: The API doesn't inject a template for talking to the instance, while the CLI does. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release, for instance:
@@ -115,18 +199,9 @@ Below is an instruction that describes a task. Write a response that appropriate
### Response:
```
## Using other models
You can specify a model binary to be used for inference with `--model`.
13B and 30B alpaca models are known to work:
```
# Download the model image, extract the model
# Use the model with llama-cli
docker run -v $PWD:/models -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:v0.4 api --model /models/model.bin
```
gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted (same applies for old alpaca models, too):
```bash
@@ -154,7 +229,7 @@ import (
func main() {
cli := client.NewClient("http://ip:30007")
cli := client.NewClient("http://ip:port")
out, err := cli.Predict("What's an alpaca?")
if err != nil {
@@ -185,9 +260,8 @@ In order to build the `llama-cli` container image locally you can use `docker`:
```
# build the image as "alpaca-image"
docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v "$(pwd)":/workspace -v earthly-tmp:/tmp/earthly:rw earthly/earthly:v0.7.2 +image --IMAGE=alpaca-image
# run the image
docker run alpaca-image --instruction "What's an alpaca?"
docker build -t llama-cli .
docker run llama-cli --instruction "What's an alpaca?"
```
Or build the binary with:
@@ -201,11 +275,11 @@ docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v
## Short-term roadmap
- Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
- [x] Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
- Binary releases (https://github.com/go-skynet/llama-cli/issues/6)
- Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
- Multi-model support
- Full Deployment and compatibility with https://github.com/mckaywrigley/chatbot-ui
- [x] Multi-model support
- Have a webUI!
## License

View File

@@ -1,4 +1,4 @@
package main
package api
import (
"embed"
@@ -8,6 +8,8 @@ import (
"strings"
"sync"
model "github.com/go-skynet/llama-cli/pkg/model"
llama "github.com/go-skynet/go-llama.cpp"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
@@ -40,20 +42,31 @@ type OpenAIModel struct {
Object string `json:"object"`
}
type OpenAIRequest struct {
Model string `json:"model"`
// Prompt is read only by completion API calls
Prompt string `json:"prompt"`
// Messages is readh only by chat/completion API calls
Messages []Message `json:"messages"`
// Common options between all the API calls
TopP float64 `json:"top_p"`
TopK int `json:"top_k"`
Temperature float64 `json:"temperature"`
Maxtokens int `json:"max_tokens"`
}
//go:embed index.html
var indexHTML embed.FS
func completionEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
var err error
var model *llama.LLama
input := new(OpenAIRequest)
// Get input data from the request body
input := new(struct {
Model string `json:"model"`
Prompt string `json:"prompt"`
})
if err := c.BodyParser(input); err != nil {
return err
}
@@ -88,128 +101,35 @@ func completionEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads
}
// Set the parameters for the language model prediction
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
if err != nil {
return err
topP := input.TopP
if topP == 0 {
topP = 0.7
}
topK := input.TopK
if topK == 0 {
topK = 80
}
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
if err != nil {
return err
temperature := input.Temperature
if temperature == 0 {
temperature = 0.9
}
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
if err != nil {
return err
}
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
if err != nil {
return err
tokens := input.Maxtokens
if tokens == 0 {
tokens = 512
}
predInput := input.Prompt
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
templatedInput, err := loader.TemplatePrefix(input.Model, struct {
Input string
}{Input: input.Prompt})
if err == nil {
predInput = templatedInput
}
// Generate the prediction using the language model
prediction, err := model.Predict(
predInput,
llama.SetTemperature(temperature),
llama.SetTopP(topP),
llama.SetTopK(topK),
llama.SetTokens(tokens),
llama.SetThreads(threads),
)
if err != nil {
return err
}
// Return the prediction in the response body
return c.JSON(OpenAIResponse{
Model: input.Model,
Choices: []Choice{{Text: prediction}},
})
}
}
func chatEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
var err error
var model *llama.LLama
// Get input data from the request body
input := new(struct {
Messages []Message `json:"messages"`
Model string `json:"model"`
})
if err := c.BodyParser(input); err != nil {
return err
}
if input.Model == "" {
if defaultModel == nil {
return fmt.Errorf("no default model loaded, and no model specified")
if chat {
mess := []string{}
for _, i := range input.Messages {
mess = append(mess, i.Content)
}
model = defaultModel
} else {
model, err = loader.LoadModel(input.Model)
if err != nil {
return err
}
}
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
if input.Model != "" {
mutexMap.Lock()
l, ok := mutexes[input.Model]
if !ok {
m := &sync.Mutex{}
mutexes[input.Model] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
} else {
defaultMutex.Lock()
defer defaultMutex.Unlock()
predInput = strings.Join(mess, "\n")
}
// Set the parameters for the language model prediction
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
if err != nil {
return err
}
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
if err != nil {
return err
}
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
if err != nil {
return err
}
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
if err != nil {
return err
}
mess := []string{}
for _, i := range input.Messages {
mess = append(mess, i.Content)
}
predInput := strings.Join(mess, "\n")
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
templatedInput, err := loader.TemplatePrefix(input.Model, struct {
Input string
@@ -231,15 +151,23 @@ func chatEndpoint(defaultModel *llama.LLama, loader *ModelLoader, threads int, d
return err
}
if chat {
// Return the chat prediction in the response body
return c.JSON(OpenAIResponse{
Model: input.Model,
Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
})
}
// Return the prediction in the response body
return c.JSON(OpenAIResponse{
Model: input.Model,
Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
Choices: []Choice{{Text: prediction}},
})
}
}
func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, threads int) error {
func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr string, threads int) error {
app := fiber.New()
// Default middleware config
@@ -252,8 +180,8 @@ func api(defaultModel *llama.LLama, loader *ModelLoader, listenAddr string, thre
var mumutex = &sync.Mutex{}
// openAI compatible API endpoint
app.Post("/v1/chat/completions", chatEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
app.Post("/v1/completions", completionEndpoint(defaultModel, loader, threads, mutex, mumutex, mu))
app.Post("/v1/chat/completions", openAIEndpoint(true, defaultModel, loader, threads, mutex, mumutex, mu))
app.Post("/v1/completions", openAIEndpoint(false, defaultModel, loader, threads, mutex, mumutex, mu))
app.Get("/v1/models", func(c *fiber.Ctx) error {
models, err := loader.ListModels()
if err != nil {

View File

15
docker-compose.yaml Normal file
View File

@@ -0,0 +1,15 @@
version: '3.6'
services:
api:
image: quay.io/go-skynet/llama-cli:latest
build: .
volumes:
- ./models:/models
ports:
- 8080:8080
environment:
- MODELS_PATH=/models
- CONTEXT_SIZE=700
- THREADS=$THREADS
command: api

View File

@@ -25,7 +25,7 @@ spec:
- name: llama
args:
- api
image: quay.io/go-skynet/llama-cli:v0.3
image: quay.io/go-skynet/llama-cli:latest
---
apiVersion: v1
kind: Service

View File

@@ -9,6 +9,9 @@ import (
"text/template"
llama "github.com/go-skynet/go-llama.cpp"
api "github.com/go-skynet/llama-cli/api"
model "github.com/go-skynet/llama-cli/pkg/model"
"github.com/urfave/cli/v2"
)
@@ -159,7 +162,7 @@ echo "An Alpaca (Vicugna pacos) is a domesticated species of South American came
}
}
return api(defaultModel, NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
return api.Start(defaultModel, model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
},
},
},

0
models/.keep Normal file
View File

View File

@@ -1,4 +1,4 @@
package main
package model
import (
"bytes"