mirror of
https://github.com/mudler/LocalAI.git
synced 2026-02-03 11:13:31 -05:00
Compare commits
37 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eb137c8a84 | ||
|
|
4e2061636e | ||
|
|
e3ef171968 | ||
|
|
12d83a4184 | ||
|
|
045412e8dd | ||
|
|
9896a9a58b | ||
|
|
b9011bda59 | ||
|
|
2b2f5fa36a | ||
|
|
43c557dc5c | ||
|
|
7abb2c9bd7 | ||
|
|
7a9ea4480a | ||
|
|
31bcc558de | ||
|
|
676e15f785 | ||
|
|
3e71c90949 | ||
|
|
550ae9c968 | ||
|
|
1c872ec326 | ||
|
|
05f35b182c | ||
|
|
79791438fe | ||
|
|
bf20cc34f6 | ||
|
|
5cba71de70 | ||
|
|
4b7e83056d | ||
|
|
ed954d66c3 | ||
|
|
f816dfae65 | ||
|
|
142bcd66ca | ||
|
|
1c4fbaae20 | ||
|
|
d517a54e28 | ||
|
|
c905512bb0 | ||
|
|
1254951fab | ||
|
|
80f50e6ccd | ||
|
|
7fec26f5d3 | ||
|
|
a9a875ee2b | ||
|
|
db5ac715f3 | ||
|
|
0b330d90ad | ||
|
|
63601fabd1 | ||
|
|
1370b4482f | ||
|
|
b062f3142b | ||
|
|
c37175271f |
3
.devcontainer/Dockerfile
Normal file
3
.devcontainer/Dockerfile
Normal file
@@ -0,0 +1,3 @@
|
||||
ARG GO_VERSION=1.20
|
||||
FROM mcr.microsoft.com/devcontainers/go:0-$GO_VERSION-bullseye
|
||||
RUN apt-get update && apt-get install -y cmake
|
||||
46
.devcontainer/devcontainer.json
Normal file
46
.devcontainer/devcontainer.json
Normal file
@@ -0,0 +1,46 @@
|
||||
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
|
||||
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-docker-compose
|
||||
{
|
||||
"name": "Existing Docker Compose (Extend)",
|
||||
|
||||
// Update the 'dockerComposeFile' list if you have more compose files or use different names.
|
||||
// The .devcontainer/docker-compose.yml file contains any overrides you need/want to make.
|
||||
"dockerComposeFile": [
|
||||
"../docker-compose.yaml",
|
||||
"docker-compose.yml"
|
||||
],
|
||||
|
||||
// The 'service' property is the name of the service for the container that VS Code should
|
||||
// use. Update this value and .devcontainer/docker-compose.yml to the real service name.
|
||||
"service": "api",
|
||||
|
||||
// The optional 'workspaceFolder' property is the path VS Code should open by default when
|
||||
// connected. This is typically a file mount in .devcontainer/docker-compose.yml
|
||||
"workspaceFolder": "/workspace",
|
||||
|
||||
"features": {
|
||||
"ghcr.io/devcontainers/features/go:1": {},
|
||||
"ghcr.io/azutake/devcontainer-features/go-packages-install:0": {}
|
||||
},
|
||||
|
||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
||||
// "features": {},
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
// "forwardPorts": [],
|
||||
|
||||
// Uncomment the next line if you want start specific services in your Docker Compose config.
|
||||
// "runServices": [],
|
||||
|
||||
// Uncomment the next line if you want to keep your containers running after VS Code shuts down.
|
||||
// "shutdownAction": "none",
|
||||
|
||||
// Uncomment the next line to run commands after the container is created.
|
||||
"postCreateCommand": "make prepare"
|
||||
|
||||
// Configure tool-specific properties.
|
||||
// "customizations": {},
|
||||
|
||||
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
|
||||
// "remoteUser": "devcontainer"
|
||||
}
|
||||
26
.devcontainer/docker-compose.yml
Normal file
26
.devcontainer/docker-compose.yml
Normal file
@@ -0,0 +1,26 @@
|
||||
version: '3.6'
|
||||
services:
|
||||
# Update this to the name of the service you want to work with in your docker-compose.yml file
|
||||
api:
|
||||
# Uncomment if you want to override the service's Dockerfile to one in the .devcontainer
|
||||
# folder. Note that the path of the Dockerfile and context is relative to the *primary*
|
||||
# docker-compose.yml file (the first in the devcontainer.json "dockerComposeFile"
|
||||
# array). The sample below assumes your primary file is in the root of your project.
|
||||
#
|
||||
build:
|
||||
context: .
|
||||
dockerfile: .devcontainer/Dockerfile
|
||||
|
||||
volumes:
|
||||
# Update this to wherever you want VS Code to mount the folder of your project
|
||||
- .:/workspace:cached
|
||||
|
||||
# Uncomment the next four lines if you will use a ptrace-based debugger like C++, Go, and Rust.
|
||||
# cap_add:
|
||||
# - SYS_PTRACE
|
||||
# security_opt:
|
||||
# - seccomp:unconfined
|
||||
|
||||
# Overrides default command so things don't shut down after the process ends.
|
||||
command: /bin/sh -c "while sleep 1000; do :; done"
|
||||
|
||||
@@ -1 +1 @@
|
||||
models/*.bin
|
||||
models
|
||||
|
||||
6
.env
6
.env
@@ -1 +1,5 @@
|
||||
THREADS=14
|
||||
# THREADS=14
|
||||
# CONTEXT_SIZE=512
|
||||
MODELS_PATH=/models
|
||||
# DEBUG=true
|
||||
# BUILD_TYPE=generic
|
||||
|
||||
2
.github/workflows/image.yml
vendored
2
.github/workflows/image.yml
vendored
@@ -19,7 +19,7 @@ jobs:
|
||||
- name: Prepare
|
||||
id: prep
|
||||
run: |
|
||||
DOCKER_IMAGE=quay.io/go-skynet/llama-cli
|
||||
DOCKER_IMAGE=quay.io/go-skynet/local-ai
|
||||
VERSION=master
|
||||
SHORTREF=${GITHUB_SHA::8}
|
||||
|
||||
|
||||
44
.github/workflows/test.yml
vendored
Normal file
44
.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
---
|
||||
name: 'tests'
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
tags:
|
||||
- '*'
|
||||
|
||||
jobs:
|
||||
ubuntu-latest:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
- name: Test
|
||||
run: |
|
||||
make test
|
||||
|
||||
macOS-latest:
|
||||
runs-on: macOS-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
brew update
|
||||
brew install sdl2
|
||||
- name: Test
|
||||
run: |
|
||||
make test
|
||||
17
.gitignore
vendored
17
.gitignore
vendored
@@ -1,2 +1,15 @@
|
||||
llama-cli
|
||||
models/*.bin
|
||||
# go-llama build artifacts
|
||||
go-llama
|
||||
go-gpt4all-j
|
||||
go-gpt2
|
||||
|
||||
# LocalAI build binary
|
||||
LocalAI
|
||||
local-ai
|
||||
# prevent above rules from omitting the helm chart
|
||||
!charts/*
|
||||
|
||||
# Ignore models
|
||||
models/*.bin
|
||||
models/ggml-*
|
||||
test-models/
|
||||
@@ -1,5 +1,5 @@
|
||||
# Make sure to check the documentation at http://goreleaser.com
|
||||
project_name: llama-cli
|
||||
project_name: local-ai
|
||||
builds:
|
||||
- ldflags:
|
||||
- -w -s
|
||||
|
||||
20
.vscode/launch.json
vendored
Normal file
20
.vscode/launch.json
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Launch Go",
|
||||
"type": "go",
|
||||
"request": "launch",
|
||||
"mode": "debug",
|
||||
"program": "${workspaceFolder}/main.go",
|
||||
"args": [
|
||||
"api"
|
||||
],
|
||||
"env": {
|
||||
"C_INCLUDE_PATH": "/workspace/go-llama:/workspace/go-gpt4all-j:/workspace/go-gpt2",
|
||||
"LIBRARY_PATH": "/workspace/go-llama:/workspace/go-gpt4all-j:/workspace/go-gpt2",
|
||||
"DEBUG": "true"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
18
Dockerfile
18
Dockerfile
@@ -1,18 +0,0 @@
|
||||
ARG GO_VERSION=1.20
|
||||
ARG DEBIAN_VERSION=11
|
||||
FROM golang:$GO_VERSION as builder
|
||||
WORKDIR /build
|
||||
ARG GO_LLAMA_CPP_TAG=llama.cpp-2f7c8e0
|
||||
RUN git clone -b $GO_LLAMA_CPP_TAG --recurse-submodules https://github.com/go-skynet/go-llama.cpp
|
||||
RUN cd go-llama.cpp && make libbinding.a
|
||||
COPY go.mod ./
|
||||
COPY go.sum ./
|
||||
RUN go mod download
|
||||
RUN apt-get update
|
||||
COPY . .
|
||||
RUN go mod edit -replace github.com/go-skynet/go-llama.cpp=/build/go-llama.cpp
|
||||
RUN C_INCLUDE_PATH=/build/go-llama.cpp LIBRARY_PATH=/build/go-llama.cpp go build -o llama-cli ./
|
||||
|
||||
FROM debian:$DEBIAN_VERSION
|
||||
COPY --from=builder /build/llama-cli /usr/bin/llama-cli
|
||||
ENTRYPOINT [ "/usr/bin/llama-cli" ]
|
||||
@@ -1,5 +0,0 @@
|
||||
VERSION 0.7
|
||||
|
||||
build:
|
||||
FROM DOCKERFILE -f Dockerfile .
|
||||
SAVE ARTIFACT /usr/bin/llama-cli AS LOCAL llama-cli
|
||||
21
LICENSE
21
LICENSE
@@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023 go-skynet authors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
293
README.md
293
README.md
@@ -1,293 +0,0 @@
|
||||
## :camel: llama-cli
|
||||
|
||||
|
||||
llama-cli is a straightforward golang CLI interface and API compatible with OpenAI for [llama.cpp](https://github.com/ggerganov/llama.cpp), it supports multiple-models and also provides a simple command line interface that allows text generation using a GPT-based model like llama directly from the terminal.
|
||||
|
||||
It is compatible with the models supported by `llama.cpp`. You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
|
||||
|
||||
`llama-cli` doesn't shell-out, it uses https://github.com/go-skynet/go-llama.cpp, which is a golang binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
|
||||
|
||||
## Usage
|
||||
|
||||
You can use `docker-compose`:
|
||||
|
||||
```bash
|
||||
|
||||
git clone https://github.com/go-skynet/llama-cli
|
||||
cd llama-cli
|
||||
|
||||
# copy your models to models/
|
||||
cp your-model.bin models/
|
||||
|
||||
# (optional) Edit the .env file to set the number of concurrent threads used for inference
|
||||
# echo "THREADS=14" > .env
|
||||
|
||||
# start with docker-compose
|
||||
docker compose up -d --build
|
||||
|
||||
# Now API is accessible at localhost:8080
|
||||
curl http://localhost:8080/v1/models
|
||||
# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
|
||||
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "your-model.bin",
|
||||
"prompt": "A long time ago in a galaxy far, far away",
|
||||
"temperature": 0.7
|
||||
}'
|
||||
|
||||
|
||||
```
|
||||
|
||||
Note: You can use a default template for every model in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibiling file, `foo.bin.tmpl` which will be used as a default prompt, for instance this can be used with alpaca:
|
||||
|
||||
```
|
||||
Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
|
||||
### Instruction:
|
||||
{{.Input}}
|
||||
|
||||
### Response:
|
||||
```
|
||||
|
||||
## Container images
|
||||
|
||||
`llama-cli` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/llama-cli?tab=tags&tag=latest)
|
||||
|
||||
To begin, run:
|
||||
|
||||
```
|
||||
docker run -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model ...
|
||||
```
|
||||
|
||||
Where `--model` is the path of the model you want to use.
|
||||
|
||||
Note: you need to mount a volume to the docker container in order to load a model, for instance:
|
||||
|
||||
```
|
||||
# assuming your model is in /path/to/your/models/foo.bin
|
||||
docker run -v /path/to/your/models:/models -ti --rm quay.io/go-skynet/llama-cli:latest --instruction "What's an alpaca?" --topk 10000 --model /models/foo.bin
|
||||
```
|
||||
|
||||
You will receive a response like the following:
|
||||
|
||||
```
|
||||
An alpaca is a member of the South American Camelid family, which includes the llama, guanaco and vicuña. It is a domesticated species that originates from the Andes mountain range in South America. Alpacas are used in the textile industry for their fleece, which is much softer than wool. Alpacas are also used for meat, milk, and fiber.
|
||||
```
|
||||
|
||||
## Basic usage
|
||||
|
||||
To use llama-cli, specify a pre-trained GPT-based model, an input text, and an instruction for text generation. llama-cli takes the following arguments when running from the CLI:
|
||||
|
||||
```
|
||||
llama-cli --model <model_path> --instruction <instruction> [--input <input>] [--template <template_path>] [--tokens <num_tokens>] [--threads <num_threads>] [--temperature <temperature>] [--topp <top_p>] [--topk <top_k>]
|
||||
```
|
||||
|
||||
| Parameter | Environment Variable | Default Value | Description |
|
||||
| ------------ | -------------------- | ------------- | -------------------------------------- |
|
||||
| template | TEMPLATE | | A file containing a template for output formatting (optional). |
|
||||
| instruction | INSTRUCTION | | Input prompt text or instruction. "-" for STDIN. |
|
||||
| input | INPUT | - | Path to text or "-" for STDIN. |
|
||||
| model | MODEL_PATH | | The path to the pre-trained GPT-based model. |
|
||||
| tokens | TOKENS | 128 | The maximum number of tokens to generate. |
|
||||
| threads | THREADS | NumCPU() | The number of threads to use for text generation. |
|
||||
| temperature | TEMPERATURE | 0.95 | Sampling temperature for model output. ( values between `0.1` and `1.0` ) |
|
||||
| top_p | TOP_P | 0.85 | The cumulative probability for top-p sampling. |
|
||||
| top_k | TOP_K | 20 | The number of top-k tokens to consider for text generation. |
|
||||
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
|
||||
|
||||
Here's an example of using `llama-cli`:
|
||||
|
||||
```
|
||||
llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
|
||||
```
|
||||
|
||||
This will generate text based on the given model and instruction.
|
||||
|
||||
## API
|
||||
|
||||
`llama-cli` also provides an API for running text generation as a service. The models once loaded the first time will be kept in memory.
|
||||
|
||||
Example of starting the API with `docker`:
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -ti --rm quay.io/go-skynet/llama-cli:latest api --models-path /path/to/models --context-size 700 --threads 4
|
||||
```
|
||||
|
||||
And you'll see:
|
||||
```
|
||||
┌───────────────────────────────────────────────────┐
|
||||
│ Fiber v2.42.0 │
|
||||
│ http://127.0.0.1:8080 │
|
||||
│ (bound on host 0.0.0.0 and port 8080) │
|
||||
│ │
|
||||
│ Handlers ............. 1 Processes ........... 1 │
|
||||
│ Prefork ....... Disabled PID ................. 1 │
|
||||
└───────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Note: Models have to end up with `.bin`.
|
||||
|
||||
You can control the API server options with command line arguments:
|
||||
|
||||
```
|
||||
llama-cli api --models-path <model_path> [--address <address>] [--threads <num_threads>]
|
||||
```
|
||||
|
||||
The API takes takes the following:
|
||||
|
||||
| Parameter | Environment Variable | Default Value | Description |
|
||||
| ------------ | -------------------- | ------------- | -------------------------------------- |
|
||||
| models-path | MODELS_PATH | | The path where you have models (ending with `.bin`). |
|
||||
| threads | THREADS | CPU cores | The number of threads to use for text generation. |
|
||||
| address | ADDRESS | :8080 | The address and port to listen on. |
|
||||
| context-size | CONTEXT_SIZE | 512 | Default token context size. |
|
||||
|
||||
Once the server is running, you can start making requests to it using HTTP, using the OpenAI API.
|
||||
|
||||
### Supported OpenAI API endpoints
|
||||
|
||||
You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create).
|
||||
|
||||
Following the list of endpoints/parameters supported.
|
||||
|
||||
#### Chat completions
|
||||
|
||||
For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
|
||||
|
||||
```
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "ggml-koala-7b-model-q4_0-r2.bin",
|
||||
"messages": [{"role": "user", "content": "Say this is a test!"}],
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
Available additional parameters: `top_p`, `top_k`, `max_tokens`
|
||||
|
||||
#### Completions
|
||||
|
||||
For example, to generate a comletion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
|
||||
```
|
||||
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "ggml-koala-7b-model-q4_0-r2.bin",
|
||||
"prompt": "A long time ago in a galaxy far, far away",
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
Available additional parameters: `top_p`, `top_k`, `max_tokens`
|
||||
|
||||
#### List models
|
||||
|
||||
You can list all the models available with:
|
||||
|
||||
```
|
||||
curl http://localhost:8080/v1/models
|
||||
```
|
||||
|
||||
## Web interface
|
||||
|
||||
There is also available a simple web interface (for instance, http://localhost:8080/) which can be used as a playground.
|
||||
|
||||
Note: The API doesn't inject a template for talking to the instance, while the CLI does. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release, for instance:
|
||||
|
||||
```
|
||||
Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
|
||||
### Instruction:
|
||||
{instruction}
|
||||
|
||||
### Response:
|
||||
```
|
||||
|
||||
|
||||
## Using other models
|
||||
|
||||
gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted (same applies for old alpaca models, too):
|
||||
|
||||
```bash
|
||||
wget -O tokenizer.model https://huggingface.co/decapoda-research/llama-30b-hf/resolve/main/tokenizer.model
|
||||
mkdir models
|
||||
cp gpt4all.. models/
|
||||
git clone https://gist.github.com/eiz/828bddec6162a023114ce19146cb2b82
|
||||
pip install sentencepiece
|
||||
python 828bddec6162a023114ce19146cb2b82/gistfile1.txt models tokenizer.model
|
||||
# There will be a new model with the ".tmp" extension, you have to use that one!
|
||||
```
|
||||
|
||||
### Golang client API
|
||||
|
||||
The `llama-cli` codebase has also a small client in go that can be used alongside with the api:
|
||||
|
||||
```golang
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
client "github.com/go-skynet/llama-cli/client"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
cli := client.NewClient("http://ip:port")
|
||||
|
||||
out, err := cli.Predict("What's an alpaca?")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
fmt.Println(out)
|
||||
}
|
||||
```
|
||||
|
||||
### Windows compatibility
|
||||
|
||||
It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/llama-cli/issues/2
|
||||
|
||||
### Kubernetes
|
||||
|
||||
You can run the API directly in Kubernetes:
|
||||
|
||||
```bash
|
||||
kubectl apply -f https://raw.githubusercontent.com/go-skynet/llama-cli/master/kubernetes/deployment.yaml
|
||||
```
|
||||
|
||||
### Build locally
|
||||
|
||||
Pre-built images might fit well for most of the modern hardware, however you can and might need to build the images manually.
|
||||
|
||||
In order to build the `llama-cli` container image locally you can use `docker`:
|
||||
|
||||
```
|
||||
# build the image as "alpaca-image"
|
||||
docker build -t llama-cli .
|
||||
docker run llama-cli --instruction "What's an alpaca?"
|
||||
```
|
||||
|
||||
Or build the binary with:
|
||||
|
||||
```
|
||||
# build the image as "alpaca-image"
|
||||
docker run --privileged -v /var/run/docker.sock:/var/run/docker.sock --rm -t -v "$(pwd)":/workspace -v earthly-tmp:/tmp/earthly:rw earthly/earthly:v0.7.2 +build
|
||||
# run the binary
|
||||
./llama-cli --instruction "What's an alpaca?"
|
||||
```
|
||||
|
||||
## Short-term roadmap
|
||||
|
||||
- [x] Mimic OpenAI API (https://github.com/go-skynet/llama-cli/issues/10)
|
||||
- Binary releases (https://github.com/go-skynet/llama-cli/issues/6)
|
||||
- Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
|
||||
- [x] Multi-model support
|
||||
- Have a webUI!
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
- [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
||||
- https://github.com/tatsu-lab/stanford_alpaca
|
||||
- https://github.com/cornelk/llama-go for the initial ideas
|
||||
- https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!)
|
||||
276
api/api.go
276
api/api.go
@@ -1,276 +0,0 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
model "github.com/go-skynet/llama-cli/pkg/model"
|
||||
|
||||
llama "github.com/go-skynet/go-llama.cpp"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/gofiber/fiber/v2/middleware/cors"
|
||||
"github.com/gofiber/fiber/v2/middleware/filesystem"
|
||||
"github.com/gofiber/fiber/v2/middleware/recover"
|
||||
)
|
||||
|
||||
type OpenAIResponse struct {
|
||||
Created int `json:"created,omitempty"`
|
||||
Object string `json:"chat.completion,omitempty"`
|
||||
ID string `json:"id,omitempty"`
|
||||
Model string `json:"model,omitempty"`
|
||||
Choices []Choice `json:"choices,omitempty"`
|
||||
}
|
||||
|
||||
type Choice struct {
|
||||
Index int `json:"index,omitempty"`
|
||||
FinishReason string `json:"finish_reason,omitempty"`
|
||||
Message Message `json:"message,omitempty"`
|
||||
Text string `json:"text,omitempty"`
|
||||
}
|
||||
|
||||
type Message struct {
|
||||
Role string `json:"role,omitempty"`
|
||||
Content string `json:"content,omitempty"`
|
||||
}
|
||||
|
||||
type OpenAIModel struct {
|
||||
ID string `json:"id"`
|
||||
Object string `json:"object"`
|
||||
}
|
||||
|
||||
type OpenAIRequest struct {
|
||||
Model string `json:"model"`
|
||||
|
||||
// Prompt is read only by completion API calls
|
||||
Prompt string `json:"prompt"`
|
||||
|
||||
// Messages is read only by chat/completion API calls
|
||||
Messages []Message `json:"messages"`
|
||||
|
||||
// Common options between all the API calls
|
||||
TopP float64 `json:"top_p"`
|
||||
TopK int `json:"top_k"`
|
||||
Temperature float64 `json:"temperature"`
|
||||
Maxtokens int `json:"max_tokens"`
|
||||
}
|
||||
|
||||
//go:embed index.html
|
||||
var indexHTML embed.FS
|
||||
|
||||
func openAIEndpoint(chat bool, defaultModel *llama.LLama, loader *model.ModelLoader, threads int, defaultMutex *sync.Mutex, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
var err error
|
||||
var model *llama.LLama
|
||||
|
||||
input := new(OpenAIRequest)
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if input.Model == "" {
|
||||
if defaultModel == nil {
|
||||
return fmt.Errorf("no default model loaded, and no model specified")
|
||||
}
|
||||
model = defaultModel
|
||||
} else {
|
||||
model, err = loader.LoadModel(input.Model)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
|
||||
if input.Model != "" {
|
||||
mutexMap.Lock()
|
||||
l, ok := mutexes[input.Model]
|
||||
if !ok {
|
||||
m := &sync.Mutex{}
|
||||
mutexes[input.Model] = m
|
||||
l = m
|
||||
}
|
||||
mutexMap.Unlock()
|
||||
l.Lock()
|
||||
defer l.Unlock()
|
||||
} else {
|
||||
defaultMutex.Lock()
|
||||
defer defaultMutex.Unlock()
|
||||
}
|
||||
|
||||
// Set the parameters for the language model prediction
|
||||
topP := input.TopP
|
||||
if topP == 0 {
|
||||
topP = 0.7
|
||||
}
|
||||
topK := input.TopK
|
||||
if topK == 0 {
|
||||
topK = 80
|
||||
}
|
||||
|
||||
temperature := input.Temperature
|
||||
if temperature == 0 {
|
||||
temperature = 0.9
|
||||
}
|
||||
|
||||
tokens := input.Maxtokens
|
||||
if tokens == 0 {
|
||||
tokens = 512
|
||||
}
|
||||
|
||||
predInput := input.Prompt
|
||||
if chat {
|
||||
mess := []string{}
|
||||
for _, i := range input.Messages {
|
||||
mess = append(mess, i.Content)
|
||||
}
|
||||
|
||||
predInput = strings.Join(mess, "\n")
|
||||
}
|
||||
|
||||
// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
|
||||
templatedInput, err := loader.TemplatePrefix(input.Model, struct {
|
||||
Input string
|
||||
}{Input: predInput})
|
||||
if err == nil {
|
||||
predInput = templatedInput
|
||||
}
|
||||
|
||||
// Generate the prediction using the language model
|
||||
prediction, err := model.Predict(
|
||||
predInput,
|
||||
llama.SetTemperature(temperature),
|
||||
llama.SetTopP(topP),
|
||||
llama.SetTopK(topK),
|
||||
llama.SetTokens(tokens),
|
||||
llama.SetThreads(threads),
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if chat {
|
||||
// Return the chat prediction in the response body
|
||||
return c.JSON(OpenAIResponse{
|
||||
Model: input.Model,
|
||||
Choices: []Choice{{Message: Message{Role: "assistant", Content: prediction}}},
|
||||
})
|
||||
}
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(OpenAIResponse{
|
||||
Model: input.Model,
|
||||
Choices: []Choice{{Text: prediction}},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func Start(defaultModel *llama.LLama, loader *model.ModelLoader, listenAddr string, threads int) error {
|
||||
app := fiber.New()
|
||||
|
||||
// Default middleware config
|
||||
app.Use(recover.New())
|
||||
app.Use(cors.New())
|
||||
|
||||
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
|
||||
var mutex = &sync.Mutex{}
|
||||
mu := map[string]*sync.Mutex{}
|
||||
var mumutex = &sync.Mutex{}
|
||||
|
||||
// openAI compatible API endpoint
|
||||
app.Post("/v1/chat/completions", openAIEndpoint(true, defaultModel, loader, threads, mutex, mumutex, mu))
|
||||
app.Post("/v1/completions", openAIEndpoint(false, defaultModel, loader, threads, mutex, mumutex, mu))
|
||||
app.Get("/v1/models", func(c *fiber.Ctx) error {
|
||||
models, err := loader.ListModels()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dataModels := []OpenAIModel{}
|
||||
for _, m := range models {
|
||||
dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
|
||||
}
|
||||
return c.JSON(struct {
|
||||
Object string `json:"object"`
|
||||
Data []OpenAIModel `json:"data"`
|
||||
}{
|
||||
Object: "list",
|
||||
Data: dataModels,
|
||||
})
|
||||
})
|
||||
|
||||
app.Use("/", filesystem.New(filesystem.Config{
|
||||
Root: http.FS(indexHTML),
|
||||
NotFoundFile: "index.html",
|
||||
}))
|
||||
|
||||
/*
|
||||
curl --location --request POST 'http://localhost:8080/predict' --header 'Content-Type: application/json' --data-raw '{
|
||||
"text": "What is an alpaca?",
|
||||
"topP": 0.8,
|
||||
"topK": 50,
|
||||
"temperature": 0.7,
|
||||
"tokens": 100
|
||||
}'
|
||||
*/
|
||||
// Endpoint to generate the prediction
|
||||
app.Post("/predict", func(c *fiber.Ctx) error {
|
||||
mutex.Lock()
|
||||
defer mutex.Unlock()
|
||||
// Get input data from the request body
|
||||
input := new(struct {
|
||||
Text string `json:"text"`
|
||||
})
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Set the parameters for the language model prediction
|
||||
topP, err := strconv.ParseFloat(c.Query("topP", "0.9"), 64) // Default value of topP is 0.9
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
topK, err := strconv.Atoi(c.Query("topK", "40")) // Default value of topK is 40
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
temperature, err := strconv.ParseFloat(c.Query("temperature", "0.5"), 64) // Default value of temperature is 0.5
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tokens, err := strconv.Atoi(c.Query("tokens", "128")) // Default value of tokens is 128
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Generate the prediction using the language model
|
||||
prediction, err := defaultModel.Predict(
|
||||
input.Text,
|
||||
llama.SetTemperature(temperature),
|
||||
llama.SetTopP(topP),
|
||||
llama.SetTopK(topK),
|
||||
llama.SetTokens(tokens),
|
||||
llama.SetThreads(threads),
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(struct {
|
||||
Prediction string `json:"prediction"`
|
||||
}{
|
||||
Prediction: prediction,
|
||||
})
|
||||
})
|
||||
|
||||
// Start the server
|
||||
app.Listen(listenAddr)
|
||||
return nil
|
||||
}
|
||||
120
api/index.html
120
api/index.html
@@ -1,120 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>llama-cli</title>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css" crossorigin="anonymous" referrerpolicy="no-referrer" />
|
||||
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
|
||||
</head>
|
||||
<style>
|
||||
@keyframes rotating {
|
||||
from {
|
||||
transform: rotate(0deg);
|
||||
}
|
||||
to {
|
||||
transform: rotate(360deg);
|
||||
}
|
||||
}
|
||||
|
||||
.waiting {
|
||||
animation: rotating 1s linear infinite;
|
||||
}
|
||||
|
||||
</style>
|
||||
<body>
|
||||
|
||||
<div class="container mt-5" x-data="{ templates:[
|
||||
{
|
||||
name: 'Alpaca: Instruction without input',
|
||||
text: `Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
|
||||
### Instruction:
|
||||
{{.Instruction}}
|
||||
|
||||
### Response:`,
|
||||
},
|
||||
{
|
||||
name: 'Alpaca: Instruction with input',
|
||||
text: `Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
||||
|
||||
### Instruction:
|
||||
{{.Instruction}}
|
||||
|
||||
### Input:
|
||||
{{.Input}}
|
||||
|
||||
### Response:`,
|
||||
}
|
||||
], selectedTemplate: '', selectedTemplateText: '' }">
|
||||
<h1>llama-cli API</h1>
|
||||
<div class="form-group">
|
||||
<label for="inputText">Input Text:</label>
|
||||
<textarea class="form-control" id="inputText" rows="6" placeholder="Your text input here..." x-text="selectedTemplateText"></textarea>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="templateSelect">Select Template:</label>
|
||||
<select class="form-control" id="templateSelect" x-model="selectedTemplateText">
|
||||
<option value="">None</option>
|
||||
<template x-for="(template, index) in templates" :key="index">
|
||||
<option :value="template.text" x-text="template.name"></option>
|
||||
</template>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="topP">Top P:</label>
|
||||
<input type="range" step="0.01" min="0" max="1" class="form-control" id="topP" value="0.20" name="topP" onchange="this.nextElementSibling.value = this.value" required>
|
||||
<output>0.20</output>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="topK">Top K:</label>
|
||||
<input type="number" class="form-control" id="topK" value="10000" name="topK" required>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="temperature">Temperature:</label>
|
||||
<input type="range" step="0.01" min="0" max="1" value="0.9" class="form-control" id="temperature" name="temperature" onchange="this.nextElementSibling.value = this.value" required>
|
||||
<output>0.9</output>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="tokens">Tokens:</label>
|
||||
<input type="number" class="form-control" id="tokens" name="tokens" value="128" required>
|
||||
</div>
|
||||
<button class="btn btn-primary" x-on:click="submitRequest()">Submit <i class="fas fa-paper-plane"></i></button>
|
||||
<hr>
|
||||
<div class="form-group">
|
||||
<label for="outputText">Output Text:</label>
|
||||
<textarea class="form-control" id="outputText" rows="5" readonly></textarea>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
|
||||
<script>
|
||||
function submitRequest() {
|
||||
var button = document.querySelector("i.fa-paper-plane");
|
||||
button.classList.add("waiting");
|
||||
var text = document.getElementById("inputText").value;
|
||||
var url = "/predict";
|
||||
var data = {
|
||||
"text": text,
|
||||
"topP": document.getElementById("topP").value,
|
||||
"topK": document.getElementById("topK").value,
|
||||
"temperature": document.getElementById("temperature").value,
|
||||
"tokens": document.getElementById("tokens").value
|
||||
};
|
||||
fetch(url, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
body: JSON.stringify(data)
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
document.getElementById("outputText").value = data.prediction;
|
||||
button.classList.remove("waiting");
|
||||
})
|
||||
.catch(error => { console.error(error); button.classList.remove("waiting"); });
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,75 +0,0 @@
|
||||
package client
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
type Prediction struct {
|
||||
Prediction string `json:"prediction"`
|
||||
}
|
||||
|
||||
type Client struct {
|
||||
baseURL string
|
||||
client *http.Client
|
||||
endpoint string
|
||||
}
|
||||
|
||||
func NewClient(baseURL string) *Client {
|
||||
return &Client{
|
||||
baseURL: baseURL,
|
||||
client: &http.Client{},
|
||||
endpoint: "/predict",
|
||||
}
|
||||
}
|
||||
|
||||
type InputData struct {
|
||||
Text string `json:"text"`
|
||||
TopP float64 `json:"topP,omitempty"`
|
||||
TopK int `json:"topK,omitempty"`
|
||||
Temperature float64 `json:"temperature,omitempty"`
|
||||
Tokens int `json:"tokens,omitempty"`
|
||||
}
|
||||
|
||||
func (c *Client) Predict(text string, opts ...InputOption) (string, error) {
|
||||
input := NewInputData(opts...)
|
||||
input.Text = text
|
||||
|
||||
// encode input data to JSON format
|
||||
inputBytes, err := json.Marshal(input)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// create HTTP request
|
||||
url := c.baseURL + c.endpoint
|
||||
req, err := http.NewRequest("POST", url, bytes.NewBuffer(inputBytes))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// set request headers
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
// send request and get response
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("request failed with status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// decode response body to Prediction struct
|
||||
var prediction Prediction
|
||||
err = json.NewDecoder(resp.Body).Decode(&prediction)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return prediction.Prediction, nil
|
||||
}
|
||||
@@ -1,51 +0,0 @@
|
||||
package client
|
||||
|
||||
import "net/http"
|
||||
|
||||
type ClientOption func(c *Client)
|
||||
|
||||
func WithHTTPClient(httpClient *http.Client) ClientOption {
|
||||
return func(c *Client) {
|
||||
c.client = httpClient
|
||||
}
|
||||
}
|
||||
|
||||
func WithEndpoint(endpoint string) ClientOption {
|
||||
return func(c *Client) {
|
||||
c.endpoint = endpoint
|
||||
}
|
||||
}
|
||||
|
||||
type InputOption func(d *InputData)
|
||||
|
||||
func NewInputData(opts ...InputOption) *InputData {
|
||||
data := &InputData{}
|
||||
for _, opt := range opts {
|
||||
opt(data)
|
||||
}
|
||||
return data
|
||||
}
|
||||
|
||||
func WithTopP(topP float64) InputOption {
|
||||
return func(d *InputData) {
|
||||
d.TopP = topP
|
||||
}
|
||||
}
|
||||
|
||||
func WithTopK(topK int) InputOption {
|
||||
return func(d *InputData) {
|
||||
d.TopK = topK
|
||||
}
|
||||
}
|
||||
|
||||
func WithTemperature(temperature float64) InputOption {
|
||||
return func(d *InputData) {
|
||||
d.Temperature = temperature
|
||||
}
|
||||
}
|
||||
|
||||
func WithTokens(tokens int) InputOption {
|
||||
return func(d *InputData) {
|
||||
d.Tokens = tokens
|
||||
}
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
version: '3.6'
|
||||
|
||||
services:
|
||||
api:
|
||||
image: quay.io/go-skynet/llama-cli:latest
|
||||
build: .
|
||||
volumes:
|
||||
- ./models:/models
|
||||
ports:
|
||||
- 8080:8080
|
||||
environment:
|
||||
- MODELS_PATH=/models
|
||||
- CONTEXT_SIZE=700
|
||||
- THREADS=$THREADS
|
||||
command: api
|
||||
30
go.mod
30
go.mod
@@ -1,30 +0,0 @@
|
||||
module github.com/go-skynet/llama-cli
|
||||
|
||||
go 1.19
|
||||
|
||||
require (
|
||||
github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4
|
||||
github.com/gofiber/fiber/v2 v2.42.0
|
||||
github.com/urfave/cli/v2 v2.25.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/andybalholm/brotli v1.0.4 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
|
||||
github.com/google/uuid v1.3.0 // indirect
|
||||
github.com/klauspost/compress v1.15.9 // indirect
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-isatty v0.0.17 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.14 // indirect
|
||||
github.com/philhofer/fwd v1.1.1 // indirect
|
||||
github.com/rivo/uniseg v0.2.0 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 // indirect
|
||||
github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d // indirect
|
||||
github.com/tinylib/msgp v1.1.6 // indirect
|
||||
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||
github.com/valyala/fasthttp v1.44.0 // indirect
|
||||
github.com/valyala/tcplisten v1.0.0 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
|
||||
golang.org/x/sys v0.6.0 // indirect
|
||||
)
|
||||
86
go.sum
86
go.sum
@@ -1,86 +0,0 @@
|
||||
github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
|
||||
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
|
||||
github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4 h1:u/y9MlPHOeIj636IQmrf9ptMjjdgCVIcsfb7lMFh39M=
|
||||
github.com/go-skynet/go-llama.cpp v0.0.0-20230415155049-9260bfd28bc4/go.mod h1:35AKIEMY+YTKCBJIa/8GZcNGJ2J+nQk1hQiWo/OnEWw=
|
||||
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
|
||||
github.com/gofiber/fiber/v2 v2.42.0 h1:Fnp7ybWvS+sjNQsFvkhf4G8OhXswvB6Vee8hM/LyS+8=
|
||||
github.com/gofiber/fiber/v2 v2.42.0/go.mod h1:3+SGNjqMh5VQH5Vz2Wdi43zTIV16ktlFd3x3R6O1Zlc=
|
||||
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
|
||||
github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
|
||||
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
|
||||
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
|
||||
github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
|
||||
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng=
|
||||
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
|
||||
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
|
||||
github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
|
||||
github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ=
|
||||
github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU=
|
||||
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
|
||||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||
github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94 h1:rmMl4fXJhKMNWl+K+r/fq4FbbKI+Ia2m9hYBLm2h4G4=
|
||||
github.com/savsgio/dictpool v0.0.0-20221023140959-7bf2e61cea94/go.mod h1:90zrgN3D/WJsDd1iXHT96alCoN2KJo6/4x1DZC3wZs8=
|
||||
github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d h1:Q+gqLBOPkFGHyCJxXMRqtUgUbTjI8/Ze8vu8GGyNFwo=
|
||||
github.com/savsgio/gotils v0.0.0-20220530130905-52f3993e8d6d/go.mod h1:Gy+0tqhJvgGlqnTF8CVGP0AaGRjwBtXs/a5PA0Y3+A4=
|
||||
github.com/tinylib/msgp v1.1.6 h1:i+SbKraHhnrf9M5MYmvQhFnbLhAXSDWF8WWsuyRdocw=
|
||||
github.com/tinylib/msgp v1.1.6/go.mod h1:75BAfg2hauQhs3qedfdDZmWAPcFMAvJE5b9rGOMufyw=
|
||||
github.com/urfave/cli/v2 v2.25.0 h1:ykdZKuQey2zq0yin/l7JOm9Mh+pg72ngYMeB0ABn6q8=
|
||||
github.com/urfave/cli/v2 v2.25.0/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
|
||||
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
||||
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
|
||||
github.com/valyala/fasthttp v1.44.0 h1:R+gLUhldIsfg1HokMuQjdQ5bh9nuXHPIfvkYUu9eR5Q=
|
||||
github.com/valyala/fasthttp v1.44.0/go.mod h1:f6VbjjoI3z1NDOZOv17o6RvtRSWxC77seBFc2uWtgiY=
|
||||
github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
|
||||
github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
|
||||
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU=
|
||||
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8=
|
||||
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/crypto v0.0.0-20220214200702-86341886e292/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
|
||||
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
|
||||
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.8.0 h1:57P1ETyNKtuIjB4SRd15iJxuhj8Gc416Y78H3qgMh68=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20201022035929-9cf592e881e9/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
|
||||
golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
@@ -1,42 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: llama
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: llama
|
||||
namespace: llama
|
||||
labels:
|
||||
app: llama
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: llama
|
||||
replicas: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: llama
|
||||
name: llama
|
||||
spec:
|
||||
containers:
|
||||
- name: llama
|
||||
args:
|
||||
- api
|
||||
image: quay.io/go-skynet/llama-cli:latest
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: llama
|
||||
namespace: llama
|
||||
spec:
|
||||
selector:
|
||||
app: llama
|
||||
type: LoadBalancer
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8080
|
||||
targetPort: 8080
|
||||
248
main.go
248
main.go
@@ -1,248 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"runtime"
|
||||
"text/template"
|
||||
|
||||
llama "github.com/go-skynet/go-llama.cpp"
|
||||
api "github.com/go-skynet/llama-cli/api"
|
||||
model "github.com/go-skynet/llama-cli/pkg/model"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
)
|
||||
|
||||
// Define the template string
|
||||
var emptyInput string = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
|
||||
### Instruction:
|
||||
{{.Instruction}}
|
||||
|
||||
### Response:`
|
||||
|
||||
var nonEmptyInput string = `Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
||||
|
||||
### Instruction:
|
||||
{{.Instruction}}
|
||||
|
||||
### Input:
|
||||
{{.Input}}
|
||||
|
||||
### Response:
|
||||
`
|
||||
|
||||
func llamaFromOptions(ctx *cli.Context) (*llama.LLama, error) {
|
||||
opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
|
||||
return llama.New(ctx.String("model"), opts...)
|
||||
}
|
||||
|
||||
func templateString(t string, in interface{}) (string, error) {
|
||||
// Parse the template
|
||||
tmpl, err := template.New("prompt").Parse(t)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
err = tmpl.Execute(&buf, in)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
var modelFlags = []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "model",
|
||||
EnvVars: []string{"MODEL_PATH"},
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "tokens",
|
||||
EnvVars: []string{"TOKENS"},
|
||||
Value: 128,
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "context-size",
|
||||
EnvVars: []string{"CONTEXT_SIZE"},
|
||||
Value: 512,
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "threads",
|
||||
EnvVars: []string{"THREADS"},
|
||||
Value: runtime.NumCPU(),
|
||||
},
|
||||
&cli.Float64Flag{
|
||||
Name: "temperature",
|
||||
EnvVars: []string{"TEMPERATURE"},
|
||||
Value: 0.95,
|
||||
},
|
||||
&cli.Float64Flag{
|
||||
Name: "topp",
|
||||
EnvVars: []string{"TOP_P"},
|
||||
Value: 0.85,
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "topk",
|
||||
EnvVars: []string{"TOP_K"},
|
||||
Value: 20,
|
||||
},
|
||||
}
|
||||
|
||||
func main() {
|
||||
app := &cli.App{
|
||||
Name: "llama-cli",
|
||||
Version: "0.1",
|
||||
Usage: "llama-cli --model ... --instruction 'What is an alpaca?'",
|
||||
Flags: append(modelFlags,
|
||||
&cli.StringFlag{
|
||||
Name: "template",
|
||||
EnvVars: []string{"TEMPLATE"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "instruction",
|
||||
EnvVars: []string{"INSTRUCTION"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "input",
|
||||
EnvVars: []string{"INPUT"},
|
||||
}),
|
||||
Description: `Run llama.cpp inference`,
|
||||
UsageText: `
|
||||
llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "What's an alpaca?"
|
||||
|
||||
An Alpaca (Vicugna pacos) is a domesticated species of South American camelid, related to llamas and originally from Peru but now found throughout much of Andean region. They are bred for their fleeces which can be spun into wool or knitted items such as hats, sweaters, blankets etc
|
||||
|
||||
echo "An Alpaca (Vicugna pacos) is a domesticated species of South American camelid, related to llamas and originally from Peru but now found throughout much of Andean region. They are bred for their fleeces which can be spun into wool or knitted items such as hats, sweaters, blankets etc" | llama-cli --model ~/ggml-alpaca-7b-q4.bin --instruction "Proofread, improving clarity and flow" --input "-"
|
||||
|
||||
An Alpaca (Vicugna pacos) is a domesticated species from South America that's related to llamas. Originating in Peru but now found throughout the Andean region, they are bred for their fleeces which can be spun into wool or knitted items such as hats and sweaters—blankets too!
|
||||
`,
|
||||
Copyright: "go-skynet authors",
|
||||
Commands: []*cli.Command{
|
||||
{
|
||||
|
||||
Name: "api",
|
||||
Flags: []cli.Flag{
|
||||
&cli.IntFlag{
|
||||
Name: "threads",
|
||||
EnvVars: []string{"THREADS"},
|
||||
Value: runtime.NumCPU(),
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "models-path",
|
||||
EnvVars: []string{"MODELS_PATH"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "default-model",
|
||||
EnvVars: []string{"default-model"},
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "address",
|
||||
EnvVars: []string{"ADDRESS"},
|
||||
Value: ":8080",
|
||||
},
|
||||
&cli.IntFlag{
|
||||
Name: "context-size",
|
||||
EnvVars: []string{"CONTEXT_SIZE"},
|
||||
Value: 512,
|
||||
},
|
||||
},
|
||||
Action: func(ctx *cli.Context) error {
|
||||
|
||||
var defaultModel *llama.LLama
|
||||
defModel := ctx.String("default-model")
|
||||
if defModel != "" {
|
||||
opts := []llama.ModelOption{llama.SetContext(ctx.Int("context-size"))}
|
||||
var err error
|
||||
defaultModel, err = llama.New(ctx.String("default-model"), opts...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return api.Start(defaultModel, model.NewModelLoader(ctx.String("models-path")), ctx.String("address"), ctx.Int("threads"))
|
||||
},
|
||||
},
|
||||
},
|
||||
Action: func(ctx *cli.Context) error {
|
||||
|
||||
instruction := ctx.String("instruction")
|
||||
input := ctx.String("input")
|
||||
templ := ctx.String("template")
|
||||
|
||||
promptTemplate := ""
|
||||
|
||||
if input != "" {
|
||||
promptTemplate = nonEmptyInput
|
||||
} else {
|
||||
promptTemplate = emptyInput
|
||||
}
|
||||
|
||||
if templ != "" {
|
||||
dat, err := os.ReadFile(templ)
|
||||
if err != nil {
|
||||
fmt.Printf("Failed reading file: %s", err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
promptTemplate = string(dat)
|
||||
}
|
||||
|
||||
if instruction == "-" {
|
||||
dat, err := ioutil.ReadAll(os.Stdin)
|
||||
if err != nil {
|
||||
fmt.Printf("reading stdin failed: %s", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
instruction = string(dat)
|
||||
}
|
||||
|
||||
if input == "-" {
|
||||
dat, err := ioutil.ReadAll(os.Stdin)
|
||||
if err != nil {
|
||||
fmt.Printf("reading stdin failed: %s", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
input = string(dat)
|
||||
}
|
||||
|
||||
str, err := templateString(promptTemplate, struct {
|
||||
Instruction string
|
||||
Input string
|
||||
}{Instruction: instruction, Input: input})
|
||||
|
||||
if err != nil {
|
||||
fmt.Println("Templating the input failed:", err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
l, err := llamaFromOptions(ctx)
|
||||
if err != nil {
|
||||
fmt.Println("Loading the model failed:", err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
res, err := l.Predict(
|
||||
str,
|
||||
llama.SetTemperature(ctx.Float64("temperature")),
|
||||
llama.SetTopP(ctx.Float64("topp")),
|
||||
llama.SetTopK(ctx.Int("topk")),
|
||||
llama.SetTokens(ctx.Int("tokens")),
|
||||
llama.SetThreads(ctx.Int("threads")),
|
||||
)
|
||||
if err != nil {
|
||||
fmt.Printf("predicting failed: %s", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
fmt.Println(res)
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
err := app.Run(os.Args)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
@@ -1,114 +0,0 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"text/template"
|
||||
|
||||
llama "github.com/go-skynet/go-llama.cpp"
|
||||
)
|
||||
|
||||
type ModelLoader struct {
|
||||
modelPath string
|
||||
mu sync.Mutex
|
||||
models map[string]*llama.LLama
|
||||
promptsTemplates map[string]*template.Template
|
||||
}
|
||||
|
||||
func NewModelLoader(modelPath string) *ModelLoader {
|
||||
return &ModelLoader{modelPath: modelPath, models: make(map[string]*llama.LLama), promptsTemplates: make(map[string]*template.Template)}
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) ListModels() ([]string, error) {
|
||||
files, err := ioutil.ReadDir(ml.modelPath)
|
||||
if err != nil {
|
||||
return []string{}, err
|
||||
}
|
||||
|
||||
models := []string{}
|
||||
for _, file := range files {
|
||||
if strings.HasSuffix(file.Name(), ".bin") {
|
||||
models = append(models, strings.TrimRight(file.Name(), ".bin"))
|
||||
}
|
||||
}
|
||||
|
||||
return models, nil
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) TemplatePrefix(modelName string, in interface{}) (string, error) {
|
||||
ml.mu.Lock()
|
||||
defer ml.mu.Unlock()
|
||||
|
||||
m, ok := ml.promptsTemplates[modelName]
|
||||
if !ok {
|
||||
// try to find a s.bin
|
||||
modelBin := fmt.Sprintf("%s.bin", modelName)
|
||||
m, ok = ml.promptsTemplates[modelBin]
|
||||
if !ok {
|
||||
return "", fmt.Errorf("no prompt template available")
|
||||
}
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
|
||||
if err := m.Execute(&buf, in); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) LoadModel(modelName string, opts ...llama.ModelOption) (*llama.LLama, error) {
|
||||
ml.mu.Lock()
|
||||
defer ml.mu.Unlock()
|
||||
|
||||
// Check if we already have a loaded model
|
||||
modelFile := filepath.Join(ml.modelPath, modelName)
|
||||
|
||||
if m, ok := ml.models[modelFile]; ok {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// Check if the model path exists
|
||||
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
|
||||
// try to find a s.bin
|
||||
modelBin := fmt.Sprintf("%s.bin", modelFile)
|
||||
if _, err := os.Stat(modelBin); os.IsNotExist(err) {
|
||||
return nil, err
|
||||
} else {
|
||||
modelName = fmt.Sprintf("%s.bin", modelName)
|
||||
modelFile = modelBin
|
||||
}
|
||||
}
|
||||
|
||||
// Load the model and keep it in memory for later use
|
||||
model, err := llama.New(modelFile, opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// If there is a prompt template, load it
|
||||
|
||||
modelTemplateFile := fmt.Sprintf("%s.tmpl", modelFile)
|
||||
// Check if the model path exists
|
||||
if _, err := os.Stat(modelTemplateFile); err == nil {
|
||||
dat, err := os.ReadFile(modelTemplateFile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Parse the template
|
||||
tmpl, err := template.New("prompt").Parse(string(dat))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
ml.promptsTemplates[modelName] = tmpl
|
||||
}
|
||||
|
||||
ml.models[modelFile] = model
|
||||
return model, err
|
||||
}
|
||||
Reference in New Issue
Block a user