mirror of
https://github.com/mudler/LocalAI.git
synced 2026-02-03 11:13:31 -05:00
Compare commits
209 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a1634b219a | ||
|
|
6257e2f510 | ||
|
|
65ca754166 | ||
|
|
a0f0505f0d | ||
|
|
be6c4e6061 | ||
|
|
1996e6f4c9 | ||
|
|
671cd42917 | ||
|
|
568a01bf5c | ||
|
|
164abb8c9f | ||
|
|
ed2946feac | ||
|
|
bdd351b372 | ||
|
|
ad5e7d376a | ||
|
|
6e78d8cd9d | ||
|
|
614125f268 | ||
|
|
f41965bfb5 | ||
|
|
85a3cc8d8f | ||
|
|
ea8675d473 | ||
|
|
08a54c1812 | ||
|
|
8c7439b96e | ||
|
|
a9e42a76fa | ||
|
|
1a3b3d3e67 | ||
|
|
759d35e6b5 | ||
|
|
825e85bcc5 | ||
|
|
62165d556c | ||
|
|
78459889d8 | ||
|
|
0fdc6a92f6 | ||
|
|
8586a0167a | ||
|
|
f1d16a45c5 | ||
|
|
2023627d7f | ||
|
|
d5e1958a1f | ||
|
|
f9c58a01d3 | ||
|
|
4500650000 | ||
|
|
5674e671d0 | ||
|
|
0f44c3f69c | ||
|
|
f9069daf03 | ||
|
|
5f58841a3a | ||
|
|
287200e687 | ||
|
|
b653883c0a | ||
|
|
6b8a402353 | ||
|
|
d9b63fae7c | ||
|
|
377cdcabbf | ||
|
|
92a7f40141 | ||
|
|
e06daf437a | ||
|
|
d19bea4af2 | ||
|
|
fbca9f82fd | ||
|
|
04f284d202 | ||
|
|
cfd6112256 | ||
|
|
debc0974a6 | ||
|
|
03bbbea039 | ||
|
|
55af0b1c68 | ||
|
|
c8bfb72104 | ||
|
|
1b8a663001 | ||
|
|
a9abfa2b61 | ||
|
|
092bb0bd6b | ||
|
|
e28e80857b | ||
|
|
905473c739 | ||
|
|
aa0564a1c6 | ||
|
|
2553de0187 | ||
|
|
408dfe62ee | ||
|
|
648ffdf449 | ||
|
|
04c0841ca9 | ||
|
|
43144c4743 | ||
|
|
a778668bcd | ||
|
|
4b131a7090 | ||
|
|
d06a052d54 | ||
|
|
b5115903bf | ||
|
|
afaff175d0 | ||
|
|
4686877c6d | ||
|
|
e5586e8781 | ||
|
|
3acd767ac4 | ||
|
|
5488fc3bc1 | ||
|
|
0965c6cd68 | ||
|
|
db704199dc | ||
|
|
2cc3b7128e | ||
|
|
88b99d30bb | ||
|
|
307a835199 | ||
|
|
f84b55d1ef | ||
|
|
139209353f | ||
|
|
a30058b80f | ||
|
|
53f406dc35 | ||
|
|
2649407f44 | ||
|
|
0a8f627cce | ||
|
|
76d4e88e0c | ||
|
|
d4d2a76f8f | ||
|
|
7d306c6431 | ||
|
|
44bdacac61 | ||
|
|
6bd6e2bdeb | ||
|
|
2908ff3f6b | ||
|
|
f19277b8e2 | ||
|
|
32de75c683 | ||
|
|
164a9e972f | ||
|
|
d747f2c89b | ||
|
|
58662db48e | ||
|
|
078942fc9f | ||
|
|
6dfee99575 | ||
|
|
ad62156d54 | ||
|
|
1689740269 | ||
|
|
50a3b54e34 | ||
|
|
e94a50e9db | ||
|
|
4e0f3cc980 | ||
|
|
2a8cbad122 | ||
|
|
453c45d022 | ||
|
|
4550abbfce | ||
|
|
f2ba1cfb01 | ||
|
|
8c4196faf3 | ||
|
|
b0f4556c0f | ||
|
|
fa5c98549a | ||
|
|
3d12d2037c | ||
|
|
d6522e69ca | ||
|
|
ef1507d000 | ||
|
|
a3d69872e3 | ||
|
|
33b2d38dd0 | ||
|
|
74408bdc77 | ||
|
|
8c4f720fb5 | ||
|
|
8002ad27cb | ||
|
|
1b8a77433a | ||
|
|
a370a11115 | ||
|
|
aa87eff283 | ||
|
|
0d784f46e5 | ||
|
|
c54cfd3609 | ||
|
|
6555994060 | ||
|
|
0893d3cbbe | ||
|
|
90cacb9692 | ||
|
|
69d2902b0a | ||
|
|
c1752cbb83 | ||
|
|
b8e129f2a6 | ||
|
|
cc6fac1688 | ||
|
|
043cb94436 | ||
|
|
bbdf78615e | ||
|
|
e332ff8066 | ||
|
|
26d99ed1c7 | ||
|
|
1da8d8b9db | ||
|
|
bf8f8671d1 | ||
|
|
51cba89682 | ||
|
|
3e8e71f8b6 | ||
|
|
4edd8c80b4 | ||
|
|
fd70a22196 | ||
|
|
56f4deb938 | ||
|
|
9bd7f3f995 | ||
|
|
ee21b00a8d | ||
|
|
1f43678d53 | ||
|
|
20c0e128c0 | ||
|
|
5c3d1d81e6 | ||
|
|
c22b3187a7 | ||
|
|
54f2657870 | ||
|
|
cef7f8a014 | ||
|
|
bf8e50a11d | ||
|
|
6c6cd8bbe0 | ||
|
|
00d6c2a966 | ||
|
|
415cf31aa3 | ||
|
|
f55053bfba | ||
|
|
e24654ada0 | ||
|
|
c4cecba07f | ||
|
|
38cad0b8dc | ||
|
|
052af98dcd | ||
|
|
56d8f5163c | ||
|
|
b6af4f4467 | ||
|
|
a5b08f43ff | ||
|
|
c15f506fd5 | ||
|
|
a2a63460e9 | ||
|
|
2fcea486eb | ||
|
|
5c9d26e39b | ||
|
|
191bc2e50a | ||
|
|
fbb9facda4 | ||
|
|
c6a819e92f | ||
|
|
a50cde69a2 | ||
|
|
e5bd74878e | ||
|
|
dc98b2ea44 | ||
|
|
acf119828f | ||
|
|
a53392f919 | ||
|
|
eee1fb2c75 | ||
|
|
8826ca93b3 | ||
|
|
5049629381 | ||
|
|
92136a5d34 | ||
|
|
075e5015c0 | ||
|
|
46fd4ff6db | ||
|
|
4a4e44bf55 | ||
|
|
22247ad92c | ||
|
|
d0f2bf3181 | ||
|
|
0e4e101101 | ||
|
|
f4b1bd8f6d | ||
|
|
e95cb8eaac | ||
|
|
db1159b651 | ||
|
|
a9a3a07c3b | ||
|
|
06c8339862 | ||
|
|
2394f7833f | ||
|
|
36e19928eb | ||
|
|
abc27e0dc4 | ||
|
|
42d6b9e0cc | ||
|
|
c866b77586 | ||
|
|
5356b81b7f | ||
|
|
30fe163100 | ||
|
|
afb5bbc1b8 | ||
|
|
12a8d0e46f | ||
|
|
09c7d8d458 | ||
|
|
149cc1eb13 | ||
|
|
a5ce987bdb | ||
|
|
2edc732c33 | ||
|
|
fec01d9e69 | ||
|
|
9ca5ef339a | ||
|
|
a8003f2b7c | ||
|
|
25deb4ba95 | ||
|
|
3d3db1d74f | ||
|
|
cabb1602e8 | ||
|
|
25e7661de2 | ||
|
|
cbfab81c35 | ||
|
|
925315ab5c | ||
|
|
5213e79f5c | ||
|
|
7fe6d0ad2b |
@@ -9,6 +9,7 @@
|
||||
# Param 2: email
|
||||
#
|
||||
config_user() {
|
||||
echo "Configuring git for $1 <$2>"
|
||||
local gcn=$(git config --global user.name)
|
||||
if [ -z "${gcn}" ]; then
|
||||
echo "Setting up git user / remote"
|
||||
@@ -24,6 +25,7 @@ config_user() {
|
||||
# Param 2: remote url
|
||||
#
|
||||
config_remote() {
|
||||
echo "Adding git remote and fetching $2 as $1"
|
||||
local gr=$(git remote -v | grep $1)
|
||||
if [ -z "${gr}" ]; then
|
||||
git remote add $1 $2
|
||||
|
||||
11
.github/check_and_update.py
vendored
11
.github/check_and_update.py
vendored
@@ -29,9 +29,14 @@ def calculate_sha256(file_path):
|
||||
def manual_safety_check_hf(repo_id):
|
||||
scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
|
||||
scan = scanResponse.json()
|
||||
if scan['hasUnsafeFile']:
|
||||
return scan
|
||||
return None
|
||||
# Check if 'hasUnsafeFile' exists in the response
|
||||
if 'hasUnsafeFile' in scan:
|
||||
if scan['hasUnsafeFile']:
|
||||
return scan
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
return None
|
||||
|
||||
download_type, repo_id_or_url = parse_uri(uri)
|
||||
|
||||
|
||||
7
.github/ci/modelslist.go
vendored
7
.github/ci/modelslist.go
vendored
@@ -6,6 +6,7 @@ import (
|
||||
"io/ioutil"
|
||||
"os"
|
||||
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
@@ -279,6 +280,12 @@ func main() {
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure that all arbitrary text content is sanitized before display
|
||||
for i, m := range models {
|
||||
models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
|
||||
models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
|
||||
}
|
||||
|
||||
// render the template
|
||||
data := struct {
|
||||
Models []*GalleryModel
|
||||
|
||||
4
.github/workflows/deploy-explorer.yaml
vendored
4
.github/workflows/deploy-explorer.yaml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
run: |
|
||||
CGO_ENABLED=0 make build-api
|
||||
- name: rm
|
||||
uses: appleboy/ssh-action@v1.0.3
|
||||
uses: appleboy/ssh-action@v1.1.0
|
||||
with:
|
||||
host: ${{ secrets.EXPLORER_SSH_HOST }}
|
||||
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
rm: true
|
||||
target: ./local-ai
|
||||
- name: restarting
|
||||
uses: appleboy/ssh-action@v1.0.3
|
||||
uses: appleboy/ssh-action@v1.1.0
|
||||
with:
|
||||
host: ${{ secrets.EXPLORER_SSH_HOST }}
|
||||
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
|
||||
|
||||
117
.github/workflows/image.yml
vendored
117
.github/workflows/image.yml
vendored
@@ -13,6 +13,78 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
hipblas-jobs:
|
||||
uses: ./.github/workflows/image_build.yml
|
||||
with:
|
||||
tag-latest: ${{ matrix.tag-latest }}
|
||||
tag-suffix: ${{ matrix.tag-suffix }}
|
||||
ffmpeg: ${{ matrix.ffmpeg }}
|
||||
image-type: ${{ matrix.image-type }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
cuda-major-version: ${{ matrix.cuda-major-version }}
|
||||
cuda-minor-version: ${{ matrix.cuda-minor-version }}
|
||||
platforms: ${{ matrix.platforms }}
|
||||
runs-on: ${{ matrix.runs-on }}
|
||||
base-image: ${{ matrix.base-image }}
|
||||
grpc-base-image: ${{ matrix.grpc-base-image }}
|
||||
aio: ${{ matrix.aio }}
|
||||
makeflags: ${{ matrix.makeflags }}
|
||||
latest-image: ${{ matrix.latest-image }}
|
||||
latest-image-aio: ${{ matrix.latest-image-aio }}
|
||||
secrets:
|
||||
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
|
||||
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
|
||||
strategy:
|
||||
# Pushing with all jobs in parallel
|
||||
# eats the bandwidth of all the nodes
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
include:
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-hipblas-ffmpeg'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
aio: "-aio-gpu-hipblas"
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
latest-image: 'latest-gpu-hipblas'
|
||||
latest-image-aio: 'latest-aio-gpu-hipblas'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas'
|
||||
ffmpeg: 'false'
|
||||
image-type: 'extras'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas-ffmpeg-core'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas-core'
|
||||
ffmpeg: 'false'
|
||||
image-type: 'core'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
self-hosted-jobs:
|
||||
uses: ./.github/workflows/image_build.yml
|
||||
with:
|
||||
@@ -39,7 +111,7 @@ jobs:
|
||||
strategy:
|
||||
# Pushing with all jobs in parallel
|
||||
# eats the bandwidth of all the nodes
|
||||
max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
|
||||
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
|
||||
matrix:
|
||||
include:
|
||||
# Extra images
|
||||
@@ -122,29 +194,6 @@ jobs:
|
||||
base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-hipblas-ffmpeg'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'extras'
|
||||
aio: "-aio-gpu-hipblas"
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
latest-image: 'latest-gpu-hipblas'
|
||||
latest-image-aio: 'latest-aio-gpu-hipblas'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas'
|
||||
ffmpeg: 'false'
|
||||
image-type: 'extras'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'sycl_f16'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
@@ -212,26 +261,6 @@ jobs:
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas-ffmpeg-core'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
- build-type: 'hipblas'
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-hipblas-core'
|
||||
ffmpeg: 'false'
|
||||
image-type: 'core'
|
||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||
grpc-base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=3 --output-sync=target"
|
||||
|
||||
core-image-build:
|
||||
uses: ./.github/workflows/image_build.yml
|
||||
|
||||
2
.github/workflows/secscan.yaml
vendored
2
.github/workflows/secscan.yaml
vendored
@@ -18,7 +18,7 @@ jobs:
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
- name: Run Gosec Security Scanner
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
uses: securego/gosec@master
|
||||
uses: securego/gosec@v2.21.4
|
||||
with:
|
||||
# we let the report trigger content trigger a failure using the GitHub Security features.
|
||||
args: '-no-fail -fmt sarif -out results.sarif ./...'
|
||||
|
||||
11
.github/workflows/test.yml
vendored
11
.github/workflows/test.yml
vendored
@@ -178,13 +178,22 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
# Install protoc
|
||||
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
|
||||
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||
rm protoc.zip
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
PATH="$PATH:$HOME/go/bin" make protogen-go
|
||||
- name: Build images
|
||||
run: |
|
||||
docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
|
||||
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
|
||||
- name: Test
|
||||
run: |
|
||||
LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
|
||||
PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
|
||||
make run-e2e-aio
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
|
||||
- [Documentation](#documentation)
|
||||
- [Community and Communication](#community-and-communication)
|
||||
|
||||
|
||||
|
||||
## Getting Started
|
||||
|
||||
### Prerequisites
|
||||
@@ -54,7 +52,7 @@ If you find a bug, have a feature request, or encounter any issues, please check
|
||||
|
||||
## Coding Guidelines
|
||||
|
||||
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
|
||||
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.
|
||||
|
||||
## Testing
|
||||
|
||||
@@ -84,5 +82,3 @@ We are welcome the contribution of the documents, please open new PR or create a
|
||||
- You can reach out via the Github issue tracker.
|
||||
- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
|
||||
- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
|
||||
|
||||
---
|
||||
|
||||
54
Dockerfile
54
Dockerfile
@@ -9,11 +9,13 @@ FROM ${BASE_IMAGE} AS requirements-core
|
||||
USER root
|
||||
|
||||
ARG GO_VERSION=1.22.6
|
||||
ARG CMAKE_VERSION=3.26.4
|
||||
ARG CMAKE_FROM_SOURCE=false
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
||||
|
||||
|
||||
RUN apt-get update && \
|
||||
@@ -21,13 +23,25 @@ RUN apt-get update && \
|
||||
build-essential \
|
||||
ccache \
|
||||
ca-certificates \
|
||||
cmake \
|
||||
curl \
|
||||
curl libssl-dev \
|
||||
git \
|
||||
unzip upx-ucl && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install CMake (the version in 22.04 is too old)
|
||||
RUN <<EOT bash
|
||||
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
|
||||
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
|
||||
else
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
EOT
|
||||
|
||||
# Install Go
|
||||
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
|
||||
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
|
||||
@@ -188,6 +202,8 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
|
||||
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
|
||||
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
|
||||
ARG GRPC_VERSION=v1.65.0
|
||||
ARG CMAKE_FROM_SOURCE=false
|
||||
ARG CMAKE_VERSION=3.26.4
|
||||
|
||||
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
|
||||
|
||||
@@ -196,12 +212,24 @@ WORKDIR /build
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
build-essential \
|
||||
cmake \
|
||||
build-essential curl libssl-dev \
|
||||
git && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install CMake (the version in 22.04 is too old)
|
||||
RUN <<EOT bash
|
||||
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
|
||||
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
|
||||
else
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
EOT
|
||||
|
||||
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
|
||||
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
|
||||
# and running make install in the target container
|
||||
@@ -297,10 +325,10 @@ COPY .git .
|
||||
RUN make prepare
|
||||
|
||||
## Build the binary
|
||||
## If it's CUDA, we want to skip some of the llama-compat backends to save space
|
||||
## We only leave the most CPU-optimized variant and the fallback for the cublas build
|
||||
## (both will use CUDA for the actual computation)
|
||||
RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
|
||||
## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
|
||||
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
|
||||
## (both will use CUDA or hipblas for the actual computation)
|
||||
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
|
||||
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
|
||||
else \
|
||||
make build; \
|
||||
@@ -338,9 +366,8 @@ RUN if [ "${FFMPEG}" = "true" ]; then \
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ssh less && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
ssh less wget
|
||||
# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
|
||||
|
||||
RUN go install github.com/go-delve/delve/cmd/dlv@latest
|
||||
|
||||
@@ -418,9 +445,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/transformers-musicgen \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/exllama \
|
||||
; fi
|
||||
|
||||
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
|
||||
36
Makefile
36
Makefile
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
|
||||
# llama.cpp versions
|
||||
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
|
||||
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
|
||||
CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1
|
||||
CPPLLAMA_VERSION?=96776405a17034dcfd53d3ddf5d142d34bdbb657
|
||||
|
||||
# go-rwkv version
|
||||
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
|
||||
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
|
||||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
|
||||
WHISPER_CPP_VERSION?=fdbfb460ed546452a5d53611bba66d10d842e719
|
||||
|
||||
# bert.cpp version
|
||||
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
|
||||
@@ -359,6 +359,9 @@ clean-tests:
|
||||
rm -rf test-dir
|
||||
rm -rf core/http/backend-assets
|
||||
|
||||
clean-dc: clean
|
||||
cp -r /build/backend-assets /workspace/backend-assets
|
||||
|
||||
## Build:
|
||||
build: prepare backend-assets grpcs ## Build the project
|
||||
$(info ${GREEN}I local-ai build info:${RESET})
|
||||
@@ -465,15 +468,15 @@ run-e2e-image:
|
||||
ls -liah $(abspath ./tests/e2e-fixtures)
|
||||
docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
|
||||
|
||||
run-e2e-aio:
|
||||
run-e2e-aio: protogen-go
|
||||
@echo 'Running e2e AIO tests'
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
|
||||
|
||||
test-e2e:
|
||||
@echo 'Running e2e tests'
|
||||
BUILD_TYPE=$(BUILD_TYPE) \
|
||||
LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
|
||||
|
||||
teardown-e2e:
|
||||
rm -rf $(TEST_DIR) || true
|
||||
@@ -481,24 +484,24 @@ teardown-e2e:
|
||||
|
||||
test-llama: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
|
||||
test-llama-gguf: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
|
||||
test-tts: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
|
||||
test-stablediffusion: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
|
||||
test-stores: backend-assets/grpc/local-store
|
||||
mkdir -p tests/integration/backend-assets/grpc
|
||||
cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration
|
||||
|
||||
test-container:
|
||||
docker build --target requirements -t local-ai-test-container .
|
||||
@@ -534,10 +537,10 @@ protogen-go-clean:
|
||||
$(RM) bin/*
|
||||
|
||||
.PHONY: protogen-python
|
||||
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
|
||||
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
|
||||
|
||||
.PHONY: protogen-python-clean
|
||||
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
|
||||
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
|
||||
|
||||
.PHONY: autogptq-protogen
|
||||
autogptq-protogen:
|
||||
@@ -571,14 +574,6 @@ diffusers-protogen:
|
||||
diffusers-protogen-clean:
|
||||
$(MAKE) -C backend/python/diffusers protogen-clean
|
||||
|
||||
.PHONY: exllama-protogen
|
||||
exllama-protogen:
|
||||
$(MAKE) -C backend/python/exllama protogen
|
||||
|
||||
.PHONY: exllama-protogen-clean
|
||||
exllama-protogen-clean:
|
||||
$(MAKE) -C backend/python/exllama protogen-clean
|
||||
|
||||
.PHONY: exllama2-protogen
|
||||
exllama2-protogen:
|
||||
$(MAKE) -C backend/python/exllama2 protogen
|
||||
@@ -675,7 +670,6 @@ prepare-extra-conda-environments: protogen-python
|
||||
$(MAKE) -C backend/python/parler-tts
|
||||
$(MAKE) -C backend/python/vall-e-x
|
||||
$(MAKE) -C backend/python/openvoice
|
||||
$(MAKE) -C backend/python/exllama
|
||||
$(MAKE) -C backend/python/exllama2
|
||||
|
||||
prepare-test-extra: protogen-python
|
||||
|
||||
10
README.md
10
README.md
@@ -68,9 +68,7 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
|
||||
|
||||
[💻 Getting started](https://localai.io/basics/getting_started/index.html)
|
||||
|
||||
## 🔥🔥 Hot topics / Roadmap
|
||||
|
||||
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
|
||||
## 📰 Latest project news
|
||||
|
||||
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
|
||||
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
|
||||
@@ -83,8 +81,12 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
|
||||
- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
|
||||
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
|
||||
|
||||
Hot topics (looking for contributors):
|
||||
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
|
||||
|
||||
## 🔥🔥 Hot topics (looking for help):
|
||||
|
||||
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
|
||||
- Realtime API https://github.com/mudler/LocalAI/issues/3714
|
||||
- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
|
||||
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
|
||||
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
|
||||
|
||||
@@ -2,7 +2,7 @@ backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
name: gpt-4-vision-preview
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
|
||||
@@ -2,7 +2,7 @@ backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
name: gpt-4-vision-preview
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
|
||||
@@ -2,7 +2,7 @@ backend: llama-cpp
|
||||
context_size: 4096
|
||||
mmap: false
|
||||
f16: false
|
||||
name: gpt-4-vision-preview
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
|
||||
@@ -26,6 +26,19 @@ service Backend {
|
||||
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
|
||||
|
||||
rpc Rerank(RerankRequest) returns (RerankResult) {}
|
||||
|
||||
rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
|
||||
}
|
||||
|
||||
// Define the empty request
|
||||
message MetricsRequest {}
|
||||
|
||||
message MetricsResponse {
|
||||
int32 slot_id = 1;
|
||||
string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
|
||||
float tokens_per_second = 3;
|
||||
int32 tokens_generated = 4;
|
||||
int32 prompt_tokens_processed = 5;
|
||||
}
|
||||
|
||||
message RerankRequest {
|
||||
@@ -134,6 +147,9 @@ message PredictOptions {
|
||||
repeated string Images = 42;
|
||||
bool UseTokenizerTemplate = 43;
|
||||
repeated Message Messages = 44;
|
||||
repeated string Videos = 45;
|
||||
repeated string Audios = 46;
|
||||
string CorrelationId = 47;
|
||||
}
|
||||
|
||||
// The response message containing the result
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <getopt.h>
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "log.h"
|
||||
#include "stb_image.h"
|
||||
#include "common.h"
|
||||
#include "json.hpp"
|
||||
@@ -112,7 +113,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
||||
std::string ret;
|
||||
for (; begin != end; ++begin)
|
||||
{
|
||||
ret += llama_token_to_piece(ctx, *begin);
|
||||
ret += common_token_to_piece(ctx, *begin);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@@ -120,7 +121,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
||||
// format incomplete utf-8 multibyte character for output
|
||||
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
||||
{
|
||||
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
|
||||
std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
|
||||
// if the size is 1 and first bit is 1, meaning it's a partial character
|
||||
// (size > 1 meaning it's already a known token)
|
||||
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
||||
@@ -202,8 +203,8 @@ struct llama_client_slot
|
||||
std::string stopping_word;
|
||||
|
||||
// sampling
|
||||
struct gpt_sampler_params sparams;
|
||||
gpt_sampler *ctx_sampling = nullptr;
|
||||
struct common_sampler_params sparams;
|
||||
common_sampler *ctx_sampling = nullptr;
|
||||
|
||||
int32_t ga_i = 0; // group-attention state
|
||||
int32_t ga_n = 1; // group-attention factor
|
||||
@@ -256,7 +257,7 @@ struct llama_client_slot
|
||||
images.clear();
|
||||
}
|
||||
|
||||
bool has_budget(gpt_params &global_params) {
|
||||
bool has_budget(common_params &global_params) {
|
||||
if (params.n_predict == -1 && global_params.n_predict == -1)
|
||||
{
|
||||
return true; // limitless
|
||||
@@ -397,7 +398,7 @@ struct llama_server_context
|
||||
|
||||
clip_ctx *clp_ctx = nullptr;
|
||||
|
||||
gpt_params params;
|
||||
common_params params;
|
||||
|
||||
llama_batch batch;
|
||||
|
||||
@@ -440,7 +441,7 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
bool load_model(const gpt_params ¶ms_)
|
||||
bool load_model(const common_params ¶ms_)
|
||||
{
|
||||
params = params_;
|
||||
if (!params.mmproj.empty()) {
|
||||
@@ -448,7 +449,7 @@ struct llama_server_context
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
|
||||
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -457,12 +458,12 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||
model = llama_init.model;
|
||||
ctx = llama_init.context;
|
||||
common_init_result common_init = common_init_from_params(params);
|
||||
model = common_init.model;
|
||||
ctx = common_init.context;
|
||||
if (model == nullptr)
|
||||
{
|
||||
LOG_ERROR("unable to load model", {{"model", params.model}});
|
||||
LOG_ERR("unable to load model: %s", params.model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -470,7 +471,7 @@ struct llama_server_context
|
||||
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
|
||||
const int n_embd_llm = llama_n_embd(model);
|
||||
if (n_embd_clip != n_embd_llm) {
|
||||
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
|
||||
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
return false;
|
||||
@@ -489,11 +490,21 @@ struct llama_server_context
|
||||
std::vector<char> buf(1);
|
||||
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
|
||||
if (res < 0) {
|
||||
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
||||
LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
|
||||
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
|
||||
}
|
||||
}
|
||||
|
||||
llama_client_slot* get_active_slot() {
|
||||
for (llama_client_slot& slot : slots) {
|
||||
// Check if the slot is currently processing
|
||||
if (slot.is_processing()) {
|
||||
return &slot; // Return the active slot
|
||||
}
|
||||
}
|
||||
return nullptr; // No active slot found
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
// create slots
|
||||
all_slots_are_idle = true;
|
||||
@@ -567,12 +578,12 @@ struct llama_server_context
|
||||
std::vector<llama_token> p;
|
||||
if (first)
|
||||
{
|
||||
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||
}
|
||||
@@ -589,7 +600,7 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
@@ -618,7 +629,7 @@ struct llama_server_context
|
||||
|
||||
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
|
||||
slot_params default_params;
|
||||
gpt_sampler_params default_sparams;
|
||||
common_sampler_params default_sparams;
|
||||
|
||||
slot->params.stream = json_value(data, "stream", false);
|
||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
@@ -758,7 +769,7 @@ struct llama_server_context
|
||||
}
|
||||
else if (el[0].is_string())
|
||||
{
|
||||
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
|
||||
for (auto tok : toks)
|
||||
{
|
||||
slot->sparams.logit_bias.push_back({tok, bias});
|
||||
@@ -790,7 +801,7 @@ struct llama_server_context
|
||||
sampler_names.emplace_back(name);
|
||||
}
|
||||
}
|
||||
slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
|
||||
slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -812,10 +823,11 @@ struct llama_server_context
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_ERROR("failed to load image", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
|
||||
__func__,
|
||||
slot->id,
|
||||
img_sl.id
|
||||
);
|
||||
return false;
|
||||
}
|
||||
LOG_VERBOSE("image loaded", {
|
||||
@@ -853,12 +865,12 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
|
||||
LOG("ERROR: Image with id: %i, not found.\n", img_id);
|
||||
slot->images.clear();
|
||||
return false;
|
||||
}
|
||||
} catch (const std::invalid_argument& e) {
|
||||
LOG_TEE("Invalid image number id in prompt\n");
|
||||
LOG("Invalid image number id in prompt\n");
|
||||
slot->images.clear();
|
||||
return false;
|
||||
}
|
||||
@@ -873,9 +885,9 @@ struct llama_server_context
|
||||
|
||||
if (slot->ctx_sampling != nullptr)
|
||||
{
|
||||
gpt_sampler_free(slot->ctx_sampling);
|
||||
common_sampler_free(slot->ctx_sampling);
|
||||
}
|
||||
slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
|
||||
slot->ctx_sampling = common_sampler_init(model, slot->sparams);
|
||||
//llama_set_rng_seed(ctx, slot->params.seed);
|
||||
slot->command = LOAD_PROMPT;
|
||||
|
||||
@@ -886,7 +898,7 @@ struct llama_server_context
|
||||
{"task_id", slot->task_id},
|
||||
});
|
||||
|
||||
// LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
|
||||
// LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -902,13 +914,13 @@ struct llama_server_context
|
||||
system_tokens.clear();
|
||||
|
||||
if (!system_prompt.empty()) {
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
|
||||
system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);
|
||||
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (int i = 0; i < (int)system_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
common_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
|
||||
@@ -926,7 +938,7 @@ struct llama_server_context
|
||||
};
|
||||
if (llama_decode(ctx, batch_view) != 0)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -938,7 +950,7 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("system prompt updated\n");
|
||||
LOG("system prompt updated\n");
|
||||
system_need_update = false;
|
||||
}
|
||||
|
||||
@@ -997,7 +1009,7 @@ struct llama_server_context
|
||||
|
||||
bool process_token(completion_token_output &result, llama_client_slot &slot) {
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
const std::string token_str = llama_token_to_piece(ctx, result.tok);
|
||||
const std::string token_str = common_token_to_piece(ctx, result.tok);
|
||||
slot.sampled = result.tok;
|
||||
|
||||
// search stop word and delete it
|
||||
@@ -1120,7 +1132,7 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||
LOG_TEE("Error processing the given image");
|
||||
LOG("Error processing the given image");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1132,7 +1144,7 @@ struct llama_server_context
|
||||
|
||||
void send_error(task_server& task, const std::string &error)
|
||||
{
|
||||
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
|
||||
LOG("task %i - error: %s\n", task.id, error.c_str());
|
||||
task_result res;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
@@ -1148,7 +1160,7 @@ struct llama_server_context
|
||||
samplers.reserve(slot.sparams.samplers.size());
|
||||
for (const auto & sampler : slot.sparams.samplers)
|
||||
{
|
||||
samplers.emplace_back(gpt_sampler_type_to_str(sampler));
|
||||
samplers.emplace_back(common_sampler_type_to_str(sampler));
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -1204,7 +1216,7 @@ struct llama_server_context
|
||||
if (slot.sparams.n_probs > 0)
|
||||
{
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
|
||||
const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
|
||||
size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
|
||||
size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
|
||||
if (probs_pos < probs_stop_pos)
|
||||
@@ -1256,7 +1268,7 @@ struct llama_server_context
|
||||
std::vector<completion_token_output> probs = {};
|
||||
if (!slot.params.stream && slot.stopped_word)
|
||||
{
|
||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
|
||||
const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
|
||||
probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
|
||||
}
|
||||
else
|
||||
@@ -1371,7 +1383,7 @@ struct llama_server_context
|
||||
};
|
||||
if (llama_decode(ctx, batch_view))
|
||||
{
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
LOG("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -1389,14 +1401,14 @@ struct llama_server_context
|
||||
llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch_img))
|
||||
{
|
||||
LOG_TEE("%s : failed to eval image\n", __func__);
|
||||
LOG("%s : failed to eval image\n", __func__);
|
||||
return false;
|
||||
}
|
||||
slot.n_past += n_eval;
|
||||
}
|
||||
image_idx++;
|
||||
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
// append prefix of next image
|
||||
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
|
||||
@@ -1406,7 +1418,7 @@ struct llama_server_context
|
||||
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
|
||||
for (int i = 0; i < (int) append_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
}
|
||||
@@ -1538,7 +1550,7 @@ struct llama_server_context
|
||||
update_system_prompt();
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
common_batch_clear(batch);
|
||||
|
||||
if (all_slots_are_idle)
|
||||
{
|
||||
@@ -1572,7 +1584,7 @@ struct llama_server_context
|
||||
slot.n_past = 0;
|
||||
slot.truncated = false;
|
||||
slot.has_next_token = true;
|
||||
LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
|
||||
continue;
|
||||
// END LOCALAI changes
|
||||
@@ -1616,7 +1628,7 @@ struct llama_server_context
|
||||
|
||||
// TODO: we always have to take into account the "system_tokens"
|
||||
// this is not great and needs to be improved somehow
|
||||
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
|
||||
@@ -1710,7 +1722,7 @@ struct llama_server_context
|
||||
|
||||
if (!slot.params.cache_prompt)
|
||||
{
|
||||
gpt_sampler_reset(slot.ctx_sampling);
|
||||
common_sampler_reset(slot.ctx_sampling);
|
||||
|
||||
slot.n_past = 0;
|
||||
slot.n_past_se = 0;
|
||||
@@ -1722,7 +1734,7 @@ struct llama_server_context
|
||||
// push the prompt into the sampling context (do not apply grammar)
|
||||
for (auto &token : prompt_tokens)
|
||||
{
|
||||
gpt_sampler_accept(slot.ctx_sampling, token, false);
|
||||
common_sampler_accept(slot.ctx_sampling, token, false);
|
||||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
@@ -1814,16 +1826,17 @@ struct llama_server_context
|
||||
ga_i += ga_w/ga_n;
|
||||
}
|
||||
}
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
|
||||
common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
|
||||
slot_npast++;
|
||||
}
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
{
|
||||
LOG_ERROR("failed processing images", {
|
||||
"slot_id", slot.id,
|
||||
"task_id", slot.task_id,
|
||||
});
|
||||
LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d",
|
||||
__func__,
|
||||
slot.id,
|
||||
slot.task_id
|
||||
);
|
||||
// FIXME @phymbert: to be properly tested
|
||||
// early returning without changing the slot state will block the slot for ever
|
||||
// no one at the moment is checking the return value
|
||||
@@ -1863,10 +1876,10 @@ struct llama_server_context
|
||||
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
|
||||
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
|
||||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
LOG("\n");
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
|
||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
|
||||
@@ -1876,7 +1889,7 @@ struct llama_server_context
|
||||
|
||||
slot.ga_i += slot.ga_w / slot.ga_n;
|
||||
|
||||
LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
|
||||
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
|
||||
}
|
||||
slot.n_past_se += n_tokens;
|
||||
}
|
||||
@@ -1901,11 +1914,11 @@ struct llama_server_context
|
||||
if (n_batch == 1 || ret < 0)
|
||||
{
|
||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||
LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
|
||||
LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
|
||||
|
||||
// retry with half the batch size to try to find a free slot in the KV cache
|
||||
n_batch /= 2;
|
||||
@@ -1930,9 +1943,9 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
completion_token_output result;
|
||||
const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
|
||||
const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
|
||||
|
||||
gpt_sampler_accept(slot.ctx_sampling, id, true);
|
||||
common_sampler_accept(slot.ctx_sampling, id, true);
|
||||
|
||||
slot.n_decoded += 1;
|
||||
if (slot.n_decoded == 1)
|
||||
@@ -1943,7 +1956,7 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
result.tok = id;
|
||||
const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);
|
||||
|
||||
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
@@ -1996,7 +2009,7 @@ static json format_partial_response(
|
||||
struct token_translator
|
||||
{
|
||||
llama_context * ctx;
|
||||
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
|
||||
std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); }
|
||||
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
|
||||
};
|
||||
|
||||
@@ -2103,6 +2116,9 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
data["ignore_eos"] = predict->ignoreeos();
|
||||
data["embeddings"] = predict->embeddings();
|
||||
|
||||
// Add the correlationid to json data
|
||||
data["correlation_id"] = predict->correlationid();
|
||||
|
||||
// for each image in the request, add the image data
|
||||
//
|
||||
for (int i = 0; i < predict->images_size(); i++) {
|
||||
@@ -2187,7 +2203,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
// }
|
||||
|
||||
static void params_parse(const backend::ModelOptions* request,
|
||||
gpt_params & params) {
|
||||
common_params & params) {
|
||||
|
||||
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
|
||||
|
||||
@@ -2295,7 +2311,7 @@ public:
|
||||
|
||||
grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
|
||||
// Implement LoadModel RPC
|
||||
gpt_params params;
|
||||
common_params params;
|
||||
params_parse(request, params);
|
||||
|
||||
llama_backend_init();
|
||||
@@ -2341,6 +2357,11 @@ public:
|
||||
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
|
||||
reply.set_prompt_tokens(tokens_evaluated);
|
||||
|
||||
// Log Request Correlation Id
|
||||
LOG_VERBOSE("correlation:", {
|
||||
{ "id", data["correlation_id"] }
|
||||
});
|
||||
|
||||
// Send the reply
|
||||
writer->Write(reply);
|
||||
|
||||
@@ -2364,6 +2385,12 @@ public:
|
||||
std::string completion_text;
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
if (!result.error && result.stop) {
|
||||
|
||||
// Log Request Correlation Id
|
||||
LOG_VERBOSE("correlation:", {
|
||||
{ "id", data["correlation_id"] }
|
||||
});
|
||||
|
||||
completion_text = result.result_json.value("content", "");
|
||||
int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
|
||||
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
|
||||
@@ -2403,6 +2430,31 @@ public:
|
||||
|
||||
return grpc::Status::OK;
|
||||
}
|
||||
|
||||
grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
|
||||
llama_client_slot* active_slot = llama.get_active_slot();
|
||||
|
||||
if (active_slot != nullptr) {
|
||||
// Calculate the tokens per second using existing logic
|
||||
double tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;
|
||||
|
||||
// Populate the response with metrics
|
||||
response->set_slot_id(active_slot->id);
|
||||
response->set_prompt_json_for_slot(active_slot->prompt.dump());
|
||||
response->set_tokens_per_second(tokens_per_second);
|
||||
response->set_tokens_generated(active_slot->n_decoded);
|
||||
response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);
|
||||
} else {
|
||||
// Handle case when no active slot exists
|
||||
response->set_slot_id(0);
|
||||
response->set_prompt_json_for_slot("");
|
||||
response->set_tokens_per_second(0);
|
||||
response->set_tokens_generated(0);
|
||||
response->set_prompt_tokens_processed(0);
|
||||
}
|
||||
|
||||
return grpc::Status::OK;
|
||||
}
|
||||
};
|
||||
|
||||
void RunServer(const std::string& server_address) {
|
||||
|
||||
@@ -2,4 +2,4 @@
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,6 +1,6 @@
|
||||
accelerate
|
||||
auto-gptq==0.7.1
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
transformers
|
||||
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
bark==0.1.5
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -1,2 +1,2 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
TTS==0.22.0
|
||||
grpcio==1.66.1
|
||||
coqui-tts
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
|
||||
This method sets up the gRPC service by starting the server
|
||||
"""
|
||||
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
|
||||
time.sleep(10)
|
||||
time.sleep(30)
|
||||
|
||||
def tearDown(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -3,7 +3,7 @@ intel-extension-for-pytorch
|
||||
torch
|
||||
torchvision
|
||||
optimum[openvino]
|
||||
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
diffusers
|
||||
opencv-python
|
||||
transformers
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
setuptools
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
pillow
|
||||
protobuf
|
||||
certifi
|
||||
|
||||
1
backend/python/exllama/.gitignore
vendored
1
backend/python/exllama/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
source
|
||||
@@ -1,25 +0,0 @@
|
||||
export CONDA_ENV_PATH = "exllama.yml"
|
||||
|
||||
.PHONY: exllama
|
||||
exllama: protogen
|
||||
bash install.sh ${CONDA_ENV_PATH}
|
||||
|
||||
.PHONY: run
|
||||
run: protogen
|
||||
@echo "Running exllama..."
|
||||
bash run.sh
|
||||
@echo "exllama run."
|
||||
|
||||
.PHONY: protogen
|
||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
.PHONY: protogen-clean
|
||||
protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
backend_pb2_grpc.py backend_pb2.py:
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
$(RM) -r venv source __pycache__
|
||||
@@ -1,5 +0,0 @@
|
||||
# Creating a separate environment for the exllama project
|
||||
|
||||
```
|
||||
make exllama
|
||||
```
|
||||
@@ -1,159 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import grpc
|
||||
from concurrent import futures
|
||||
import time
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os, glob
|
||||
|
||||
from pathlib import Path
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import version as torch_version
|
||||
|
||||
from source.tokenizer import ExLlamaTokenizer
|
||||
from source.generator import ExLlamaGenerator
|
||||
from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
|
||||
# Implement the BackendServicer class with the service methods
|
||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def generate(self,prompt, max_new_tokens):
|
||||
self.generator.end_beam_search()
|
||||
|
||||
# Tokenizing the input
|
||||
ids = self.generator.tokenizer.encode(prompt)
|
||||
|
||||
self.generator.gen_begin_reuse(ids)
|
||||
initial_len = self.generator.sequence[0].shape[0]
|
||||
has_leading_space = False
|
||||
decoded_text = ''
|
||||
for i in range(max_new_tokens):
|
||||
token = self.generator.gen_single_token()
|
||||
if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
|
||||
has_leading_space = True
|
||||
|
||||
decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
|
||||
if has_leading_space:
|
||||
decoded_text = ' ' + decoded_text
|
||||
|
||||
if token.item() == self.generator.tokenizer.eos_token_id:
|
||||
break
|
||||
return decoded_text
|
||||
def Health(self, request, context):
|
||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
||||
def LoadModel(self, request, context):
|
||||
try:
|
||||
# https://github.com/turboderp/exllama/blob/master/example_cfg.py
|
||||
model_directory = request.ModelFile
|
||||
|
||||
# Locate files we need within that directory
|
||||
tokenizer_path = os.path.join(model_directory, "tokenizer.model")
|
||||
model_config_path = os.path.join(model_directory, "config.json")
|
||||
st_pattern = os.path.join(model_directory, "*.safetensors")
|
||||
model_path = glob.glob(st_pattern)[0]
|
||||
|
||||
# Create config, model, tokenizer and generator
|
||||
|
||||
config = ExLlamaConfig(model_config_path) # create config from config.json
|
||||
config.model_path = model_path # supply path to model weights file
|
||||
if (request.ContextSize):
|
||||
config.max_seq_len = request.ContextSize # override max sequence length
|
||||
config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2.
|
||||
# https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
|
||||
|
||||
# Set Rope scaling.
|
||||
if (request.RopeFreqScale):
|
||||
# Alpha value for Rope scaling.
|
||||
# Higher value increases context but adds perplexity.
|
||||
# alpha_value and compress_pos_emb are mutually exclusive.
|
||||
# https://github.com/turboderp/exllama/issues/115
|
||||
config.alpha_value = request.RopeFreqScale
|
||||
config.calculate_rotary_embedding_base()
|
||||
|
||||
model = ExLlama(config) # create ExLlama instance and load the weights
|
||||
tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file
|
||||
|
||||
cache = ExLlamaCache(model, batch_size = 2) # create cache for inference
|
||||
generator = ExLlamaGenerator(model, tokenizer, cache) # create generator
|
||||
|
||||
self.generator= generator
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
self.cache = cache
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def Predict(self, request, context):
|
||||
penalty = 1.15
|
||||
if request.Penalty != 0.0:
|
||||
penalty = request.Penalty
|
||||
self.generator.settings.token_repetition_penalty_max = penalty
|
||||
self.generator.settings.temperature = request.Temperature
|
||||
self.generator.settings.top_k = request.TopK
|
||||
self.generator.settings.top_p = request.TopP
|
||||
|
||||
tokens = 512
|
||||
if request.Tokens != 0:
|
||||
tokens = request.Tokens
|
||||
|
||||
if self.cache.batch_size == 1:
|
||||
del self.cache
|
||||
self.cache = ExLlamaCache(self.model, batch_size=2)
|
||||
self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
|
||||
|
||||
t = self.generate(request.Prompt, tokens)
|
||||
|
||||
# Remove prompt from response if present
|
||||
if request.Prompt in t:
|
||||
t = t.replace(request.Prompt, "")
|
||||
|
||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
||||
|
||||
def PredictStream(self, request, context):
|
||||
# Implement PredictStream RPC
|
||||
#for reply in some_data_generator():
|
||||
# yield reply
|
||||
# Not implemented yet
|
||||
return self.Predict(request, context)
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
||||
|
||||
# Define the signal handler function
|
||||
def signal_handler(sig, frame):
|
||||
print("Received termination signal. Shutting down...")
|
||||
server.stop(0)
|
||||
sys.exit(0)
|
||||
|
||||
# Set the signal handlers for SIGINT and SIGTERM
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
try:
|
||||
while True:
|
||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
||||
except KeyboardInterrupt:
|
||||
server.stop(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
||||
parser.add_argument(
|
||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
serve(args.addr)
|
||||
@@ -1,13 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
LIMIT_TARGETS="cublas"
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
installRequirements
|
||||
|
||||
git clone https://github.com/turboderp/exllama $MY_DIR/source
|
||||
uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt
|
||||
|
||||
cp -v ./*py $MY_DIR/source/
|
||||
@@ -1,3 +0,0 @@
|
||||
transformers
|
||||
accelerate
|
||||
torch
|
||||
@@ -1,4 +0,0 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,3 +0,0 @@
|
||||
torch
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +0,0 @@
|
||||
grpcio==1.66.1
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
||||
@@ -1,7 +0,0 @@
|
||||
#!/bin/bash
|
||||
LIMIT_TARGETS="cublas"
|
||||
BACKEND_FILE="${MY_DIR}/source/backend.py"
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
startBackend $@
|
||||
@@ -1,6 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
runUnittests
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
wheel
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -2,7 +2,7 @@
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
optimum[openvino]
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
librosa==0.9.1
|
||||
faster-whisper==1.0.3
|
||||
@@ -18,6 +18,6 @@ python-dotenv
|
||||
pypinyin==0.50.0
|
||||
cn2an==0.5.22
|
||||
jieba==0.42.1
|
||||
gradio==4.38.1
|
||||
gradio==4.44.1
|
||||
langid==1.1.6
|
||||
git+https://github.com/myshell-ai/MeloTTS.git
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
librosa
|
||||
faster-whisper
|
||||
|
||||
@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
|
||||
This method sets up the gRPC service by starting the server
|
||||
"""
|
||||
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
|
||||
time.sleep(10)
|
||||
time.sleep(30)
|
||||
|
||||
def tearDown(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -15,5 +15,12 @@ installRequirements
|
||||
|
||||
# https://github.com/descriptinc/audiotools/issues/101
|
||||
# incompatible protobuf versions.
|
||||
PYDIR=$(ls ${MY_DIR}/venv/lib)
|
||||
curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py
|
||||
PYDIR=python3.10
|
||||
pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
|
||||
|
||||
if [ ! -d ${pyenv} ]; then
|
||||
echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
|
||||
|
||||
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
llvmlite==0.43.0
|
||||
@@ -5,4 +5,4 @@ accelerate
|
||||
torch
|
||||
rerankers[transformers]
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -55,7 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
"""
|
||||
model_name = request.Model
|
||||
try:
|
||||
self.model = SentenceTransformer(model_name)
|
||||
self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
|
||||
|
||||
@@ -2,5 +2,5 @@ torch
|
||||
accelerate
|
||||
transformers
|
||||
bitsandbytes
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -1,5 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch
|
||||
accelerate
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -1,4 +1,4 @@
|
||||
torch
|
||||
accelerate
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -1,5 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch
|
||||
accelerate
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -4,5 +4,5 @@ torch
|
||||
optimum[openvino]
|
||||
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
|
||||
accelerate
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -1,3 +1,5 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
certifi
|
||||
datasets
|
||||
einops
|
||||
@@ -4,4 +4,4 @@ transformers
|
||||
accelerate
|
||||
torch
|
||||
optimum[openvino]
|
||||
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
scipy==1.14.0
|
||||
certifi
|
||||
@@ -72,7 +72,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
Returns:
|
||||
A Result object that contains the result of the LoadModel operation.
|
||||
"""
|
||||
|
||||
model_name = request.Model
|
||||
|
||||
# Check to see if the Model exists in the filesystem already.
|
||||
if os.path.exists(request.ModelFile):
|
||||
model_name = request.ModelFile
|
||||
|
||||
compute = torch.float16
|
||||
if request.F16Memory == True:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -4,4 +4,4 @@ accelerate
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -5,6 +5,8 @@ import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
from typing import List
|
||||
from PIL import Image
|
||||
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
@@ -15,6 +17,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.assets.video import VideoAsset
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
@@ -105,6 +109,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
try:
|
||||
self.llm = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
except Exception as err:
|
||||
print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
|
||||
try:
|
||||
@@ -117,7 +122,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
)
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
|
||||
print("Model loaded successfully", file=sys.stderr)
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
async def Predict(self, request, context):
|
||||
@@ -196,15 +201,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if request.Seed != 0:
|
||||
sampling_params.seed = request.Seed
|
||||
|
||||
# Extract image paths and process images
|
||||
prompt = request.Prompt
|
||||
|
||||
# If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
|
||||
|
||||
image_paths = request.Images
|
||||
image_data = [self.load_image(img_path) for img_path in image_paths]
|
||||
|
||||
videos_path = request.Videos
|
||||
video_data = [self.load_video(video_path) for video_path in videos_path]
|
||||
|
||||
# If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
|
||||
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
|
||||
prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
|
||||
|
||||
# Generate text
|
||||
# Generate text using the LLM engine
|
||||
request_id = random_uuid()
|
||||
outputs = self.llm.generate(prompt, sampling_params, request_id)
|
||||
print(f"Generating text with request_id: {request_id}", file=sys.stderr)
|
||||
outputs = self.llm.generate(
|
||||
{
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"image": image_data if image_data else None,
|
||||
"video": video_data if video_data else None,
|
||||
} if image_data or video_data else None,
|
||||
},
|
||||
sampling_params=sampling_params,
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
# Stream the results
|
||||
generated_text = ""
|
||||
@@ -227,9 +250,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if streaming:
|
||||
return
|
||||
|
||||
# Remove the image files from /tmp folder
|
||||
for img_path in image_paths:
|
||||
try:
|
||||
os.remove(img_path)
|
||||
except Exception as e:
|
||||
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
|
||||
|
||||
# Sending the final generated text
|
||||
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
|
||||
|
||||
def load_image(self, image_path: str):
|
||||
"""
|
||||
Load an image from the given file path.
|
||||
|
||||
Args:
|
||||
image_path (str): The path to the image file.
|
||||
|
||||
Returns:
|
||||
Image: The loaded image.
|
||||
"""
|
||||
try:
|
||||
return Image.open(image_path)
|
||||
except Exception as e:
|
||||
print(f"Error loading image {image_path}: {e}", file=sys.stderr)
|
||||
return self.load_video(image_path)
|
||||
|
||||
def load_video(self, video_path: str):
|
||||
"""
|
||||
Load a video from the given file path.
|
||||
|
||||
Args:
|
||||
video_path (str): The path to the image file.
|
||||
|
||||
Returns:
|
||||
Video: The loaded video.
|
||||
"""
|
||||
try:
|
||||
video = VideoAsset(name=video_path).np_ndarrays
|
||||
return video
|
||||
except Exception as e:
|
||||
print(f"Error loading video {image_path}: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
async def serve(address):
|
||||
# Start asyncio gRPC server
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
|
||||
@@ -13,4 +13,20 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
|
||||
installRequirements
|
||||
# We don't embed this into the images as it is a large dependency and not always needed.
|
||||
# Besides, the speed inference are not actually usable in the current state for production use-cases.
|
||||
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
|
||||
ensureVenv
|
||||
# https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
|
||||
if [ ! -d vllm ]; then
|
||||
git clone https://github.com/vllm-project/vllm
|
||||
fi
|
||||
pushd vllm
|
||||
uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
|
||||
uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py install
|
||||
popd
|
||||
rm -rf vllm
|
||||
else
|
||||
installRequirements
|
||||
fi
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
accelerate
|
||||
torch
|
||||
transformers
|
||||
transformers
|
||||
bitsandbytes
|
||||
@@ -1,3 +1,4 @@
|
||||
accelerate
|
||||
torch
|
||||
transformers
|
||||
transformers
|
||||
bitsandbytes
|
||||
@@ -1,4 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
accelerate
|
||||
torch
|
||||
transformers
|
||||
transformers
|
||||
bitsandbytes
|
||||
@@ -4,4 +4,5 @@ accelerate
|
||||
torch
|
||||
transformers
|
||||
optimum[openvino]
|
||||
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
bitsandbytes
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.66.1
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
||||
@@ -10,20 +10,11 @@ import (
|
||||
)
|
||||
|
||||
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
|
||||
modelFile := backendConfig.Model
|
||||
|
||||
grpcOpts := gRPCModelOpts(backendConfig)
|
||||
|
||||
var inferenceModel interface{}
|
||||
var err error
|
||||
|
||||
opts := modelOpts(backendConfig, appConfig, []model.Option{
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
model.WithThreads(uint32(*backendConfig.Threads)),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(appConfig.Context),
|
||||
})
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{})
|
||||
|
||||
if backendConfig.Backend == "" {
|
||||
inferenceModel, err = loader.GreedyLoader(opts...)
|
||||
|
||||
@@ -8,19 +8,8 @@ import (
|
||||
)
|
||||
|
||||
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
|
||||
threads := backendConfig.Threads
|
||||
if *threads == 0 && appConfig.Threads != 0 {
|
||||
threads = &appConfig.Threads
|
||||
}
|
||||
gRPCOpts := gRPCModelOpts(backendConfig)
|
||||
opts := modelOpts(backendConfig, appConfig, []model.Option{
|
||||
model.WithBackendString(backendConfig.Backend),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithThreads(uint32(*threads)),
|
||||
model.WithContext(appConfig.Context),
|
||||
model.WithModel(backendConfig.Model),
|
||||
model.WithLoadGRPCLoadModelOpts(gRPCOpts),
|
||||
})
|
||||
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{})
|
||||
|
||||
inferenceModel, err := loader.BackendLoader(
|
||||
opts...,
|
||||
|
||||
@@ -31,24 +31,13 @@ type TokenUsage struct {
|
||||
Completion int
|
||||
}
|
||||
|
||||
func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||||
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||||
modelFile := c.Model
|
||||
threads := c.Threads
|
||||
if *threads == 0 && o.Threads != 0 {
|
||||
threads = &o.Threads
|
||||
}
|
||||
grpcOpts := gRPCModelOpts(c)
|
||||
|
||||
var inferenceModel grpc.Backend
|
||||
var err error
|
||||
|
||||
opts := modelOpts(c, o, []model.Option{
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
|
||||
model.WithAssetDir(o.AssetsDestination),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(o.Context),
|
||||
})
|
||||
opts := ModelOptions(c, o, []model.Option{})
|
||||
|
||||
if c.Backend != "" {
|
||||
opts = append(opts, model.WithBackendString(c.Backend))
|
||||
@@ -101,6 +90,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
||||
opts.Messages = protoMessages
|
||||
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
|
||||
opts.Images = images
|
||||
opts.Videos = videos
|
||||
opts.Audios = audios
|
||||
|
||||
tokenUsage := TokenUsage{}
|
||||
|
||||
|
||||
@@ -11,32 +11,65 @@ import (
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
|
||||
func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
|
||||
name := c.Name
|
||||
if name == "" {
|
||||
name = c.Model
|
||||
}
|
||||
|
||||
defOpts := []model.Option{
|
||||
model.WithBackendString(c.Backend),
|
||||
model.WithModel(c.Model),
|
||||
model.WithAssetDir(so.AssetsDestination),
|
||||
model.WithContext(so.Context),
|
||||
model.WithModelID(name),
|
||||
}
|
||||
|
||||
threads := 1
|
||||
|
||||
if c.Threads != nil {
|
||||
threads = *c.Threads
|
||||
}
|
||||
|
||||
if so.Threads != 0 {
|
||||
threads = so.Threads
|
||||
}
|
||||
|
||||
c.Threads = &threads
|
||||
|
||||
grpcOpts := grpcModelOpts(c)
|
||||
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
|
||||
|
||||
if so.SingleBackend {
|
||||
opts = append(opts, model.WithSingleActiveBackend())
|
||||
defOpts = append(defOpts, model.WithSingleActiveBackend())
|
||||
}
|
||||
|
||||
if so.ParallelBackendRequests {
|
||||
opts = append(opts, model.EnableParallelRequests)
|
||||
defOpts = append(defOpts, model.EnableParallelRequests)
|
||||
}
|
||||
|
||||
if c.GRPC.Attempts != 0 {
|
||||
opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
|
||||
defOpts = append(defOpts, model.WithGRPCAttempts(c.GRPC.Attempts))
|
||||
}
|
||||
|
||||
if c.GRPC.AttemptsSleepTime != 0 {
|
||||
opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
|
||||
defOpts = append(defOpts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
|
||||
}
|
||||
|
||||
for k, v := range so.ExternalGRPCBackends {
|
||||
opts = append(opts, model.WithExternalBackend(k, v))
|
||||
defOpts = append(defOpts, model.WithExternalBackend(k, v))
|
||||
}
|
||||
|
||||
return opts
|
||||
return append(defOpts, opts...)
|
||||
}
|
||||
|
||||
func getSeed(c config.BackendConfig) int32 {
|
||||
seed := int32(*c.Seed)
|
||||
var seed int32 = config.RAND_SEED
|
||||
|
||||
if c.Seed != nil {
|
||||
seed = int32(*c.Seed)
|
||||
}
|
||||
|
||||
if seed == config.RAND_SEED {
|
||||
seed = rand.Int31()
|
||||
}
|
||||
@@ -44,11 +77,47 @@ func getSeed(c config.BackendConfig) int32 {
|
||||
return seed
|
||||
}
|
||||
|
||||
func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
b := 512
|
||||
if c.Batch != 0 {
|
||||
b = c.Batch
|
||||
}
|
||||
|
||||
f16 := false
|
||||
if c.F16 != nil {
|
||||
f16 = *c.F16
|
||||
}
|
||||
|
||||
embeddings := false
|
||||
if c.Embeddings != nil {
|
||||
embeddings = *c.Embeddings
|
||||
}
|
||||
|
||||
lowVRAM := false
|
||||
if c.LowVRAM != nil {
|
||||
lowVRAM = *c.LowVRAM
|
||||
}
|
||||
|
||||
mmap := false
|
||||
if c.MMap != nil {
|
||||
mmap = *c.MMap
|
||||
}
|
||||
|
||||
ctxSize := 1024
|
||||
if c.ContextSize != nil {
|
||||
ctxSize = *c.ContextSize
|
||||
}
|
||||
|
||||
mmlock := false
|
||||
if c.MMlock != nil {
|
||||
mmlock = *c.MMlock
|
||||
}
|
||||
|
||||
nGPULayers := 9999999
|
||||
if c.NGPULayers != nil {
|
||||
nGPULayers = *c.NGPULayers
|
||||
}
|
||||
|
||||
return &pb.ModelOptions{
|
||||
CUDA: c.CUDA || c.Diffusers.CUDA,
|
||||
SchedulerType: c.Diffusers.SchedulerType,
|
||||
@@ -56,14 +125,14 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
CFGScale: c.Diffusers.CFGScale,
|
||||
LoraAdapter: c.LoraAdapter,
|
||||
LoraScale: c.LoraScale,
|
||||
F16Memory: *c.F16,
|
||||
F16Memory: f16,
|
||||
LoraBase: c.LoraBase,
|
||||
IMG2IMG: c.Diffusers.IMG2IMG,
|
||||
CLIPModel: c.Diffusers.ClipModel,
|
||||
CLIPSubfolder: c.Diffusers.ClipSubFolder,
|
||||
CLIPSkip: int32(c.Diffusers.ClipSkip),
|
||||
ControlNet: c.Diffusers.ControlNet,
|
||||
ContextSize: int32(*c.ContextSize),
|
||||
ContextSize: int32(ctxSize),
|
||||
Seed: getSeed(c),
|
||||
NBatch: int32(b),
|
||||
NoMulMatQ: c.NoMulMatQ,
|
||||
@@ -85,16 +154,16 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
YarnBetaSlow: c.YarnBetaSlow,
|
||||
NGQA: c.NGQA,
|
||||
RMSNormEps: c.RMSNormEps,
|
||||
MLock: *c.MMlock,
|
||||
MLock: mmlock,
|
||||
RopeFreqBase: c.RopeFreqBase,
|
||||
RopeScaling: c.RopeScaling,
|
||||
Type: c.ModelType,
|
||||
RopeFreqScale: c.RopeFreqScale,
|
||||
NUMA: c.NUMA,
|
||||
Embeddings: *c.Embeddings,
|
||||
LowVRAM: *c.LowVRAM,
|
||||
NGPULayers: int32(*c.NGPULayers),
|
||||
MMap: *c.MMap,
|
||||
Embeddings: embeddings,
|
||||
LowVRAM: lowVRAM,
|
||||
NGPULayers: int32(nGPULayers),
|
||||
MMap: mmap,
|
||||
MainGPU: c.MainGPU,
|
||||
Threads: int32(*c.Threads),
|
||||
TensorSplit: c.TensorSplit,
|
||||
|
||||
@@ -9,21 +9,9 @@ import (
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
|
||||
bb := backend
|
||||
if bb == "" {
|
||||
return nil, fmt.Errorf("backend is required")
|
||||
}
|
||||
func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
|
||||
|
||||
grpcOpts := gRPCModelOpts(backendConfig)
|
||||
|
||||
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
|
||||
model.WithBackendString(bb),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(appConfig.Context),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
})
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
|
||||
rerankModel, err := loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
||||
@@ -13,7 +13,6 @@ import (
|
||||
)
|
||||
|
||||
func SoundGeneration(
|
||||
backend string,
|
||||
modelFile string,
|
||||
text string,
|
||||
duration *float32,
|
||||
@@ -25,18 +24,8 @@ func SoundGeneration(
|
||||
appConfig *config.ApplicationConfig,
|
||||
backendConfig config.BackendConfig,
|
||||
) (string, *proto.Result, error) {
|
||||
if backend == "" {
|
||||
return "", nil, fmt.Errorf("backend is a required parameter")
|
||||
}
|
||||
|
||||
grpcOpts := gRPCModelOpts(backendConfig)
|
||||
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
|
||||
model.WithBackendString(backend),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(appConfig.Context),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
})
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
|
||||
|
||||
soundGenModel, err := loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
|
||||
33
core/backend/token_metrics.go
Normal file
33
core/backend/token_metrics.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func TokenMetrics(
|
||||
modelFile string,
|
||||
loader *model.ModelLoader,
|
||||
appConfig *config.ApplicationConfig,
|
||||
backendConfig config.BackendConfig) (*proto.MetricsResponse, error) {
|
||||
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{
|
||||
model.WithModel(modelFile),
|
||||
})
|
||||
model, err := loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if model == nil {
|
||||
return nil, fmt.Errorf("could not loadmodel model")
|
||||
}
|
||||
|
||||
res, err := model.GetTokenMetrics(context.Background(), &proto.MetricsRequest{})
|
||||
|
||||
return res, err
|
||||
}
|
||||
44
core/backend/tokenize.go
Normal file
44
core/backend/tokenize.go
Normal file
@@ -0,0 +1,44 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/grpc"
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
|
||||
|
||||
modelFile := backendConfig.Model
|
||||
|
||||
var inferenceModel grpc.Backend
|
||||
var err error
|
||||
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{
|
||||
model.WithModel(modelFile),
|
||||
})
|
||||
|
||||
if backendConfig.Backend == "" {
|
||||
inferenceModel, err = loader.GreedyLoader(opts...)
|
||||
} else {
|
||||
opts = append(opts, model.WithBackendString(backendConfig.Backend))
|
||||
inferenceModel, err = loader.BackendLoader(opts...)
|
||||
}
|
||||
if err != nil {
|
||||
return schema.TokenizeResponse{}, err
|
||||
}
|
||||
|
||||
predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
|
||||
predictOptions.Prompt = s
|
||||
|
||||
// tokenize the string
|
||||
resp, err := inferenceModel.TokenizeString(appConfig.Context, predictOptions)
|
||||
if err != nil {
|
||||
return schema.TokenizeResponse{}, err
|
||||
}
|
||||
|
||||
return schema.TokenizeResponse{
|
||||
Tokens: resp.Tokens,
|
||||
}, nil
|
||||
|
||||
}
|
||||
@@ -14,13 +14,11 @@ import (
|
||||
|
||||
func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
|
||||
|
||||
opts := modelOpts(backendConfig, appConfig, []model.Option{
|
||||
model.WithBackendString(model.WhisperBackend),
|
||||
model.WithModel(backendConfig.Model),
|
||||
model.WithContext(appConfig.Context),
|
||||
model.WithThreads(uint32(*backendConfig.Threads)),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
})
|
||||
if backendConfig.Backend == "" {
|
||||
backendConfig.Backend = model.WhisperBackend
|
||||
}
|
||||
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{})
|
||||
|
||||
transcriptionModel, err := ml.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
|
||||
@@ -28,14 +28,9 @@ func ModelTTS(
|
||||
bb = model.PiperBackend
|
||||
}
|
||||
|
||||
grpcOpts := gRPCModelOpts(backendConfig)
|
||||
|
||||
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
|
||||
opts := ModelOptions(config.BackendConfig{}, appConfig, []model.Option{
|
||||
model.WithBackendString(bb),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(appConfig.Context),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
})
|
||||
ttsModel, err := loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
|
||||
@@ -41,31 +41,35 @@ type RunCMD struct {
|
||||
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
|
||||
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
|
||||
|
||||
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
|
||||
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
|
||||
CORSAllowOrigins string `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
|
||||
LibraryPath string `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
|
||||
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
|
||||
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
|
||||
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
|
||||
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
|
||||
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
|
||||
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
|
||||
Peer2Peer bool `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
|
||||
Peer2PeerDHTInterval int `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
|
||||
Peer2PeerOTPInterval int `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
|
||||
Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
|
||||
Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
|
||||
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
|
||||
SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
|
||||
PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
|
||||
ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
|
||||
EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
|
||||
WatchdogIdleTimeout string `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
|
||||
EnableWatchdogBusy bool `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
|
||||
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
|
||||
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
|
||||
DisableGalleryEndpoint bool `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
|
||||
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
|
||||
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
|
||||
CORSAllowOrigins string `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
|
||||
LibraryPath string `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
|
||||
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
|
||||
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
|
||||
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
|
||||
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
|
||||
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
|
||||
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
|
||||
UseSubtleKeyComparison bool `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
|
||||
DisableApiKeyRequirementForHttpGet bool `env:"LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET" default:"false" help:"If true, a valid API key is not required to issue GET requests to portions of the web ui. This should only be enabled in secure testing environments" group:"hardening"`
|
||||
HttpGetExemptedEndpoints []string `env:"LOCALAI_HTTP_GET_EXEMPTED_ENDPOINTS" default:"^/$,^/browse/?$,^/talk/?$,^/p2p/?$,^/chat/?$,^/text2image/?$,^/tts/?$,^/static/.*$,^/swagger.*$" help:"If LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET is overriden to true, this is the list of endpoints to exempt. Only adjust this in case of a security incident or as a result of a personal security posture review" group:"hardening"`
|
||||
Peer2Peer bool `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
|
||||
Peer2PeerDHTInterval int `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
|
||||
Peer2PeerOTPInterval int `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
|
||||
Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
|
||||
Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
|
||||
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
|
||||
SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
|
||||
PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
|
||||
ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
|
||||
EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
|
||||
WatchdogIdleTimeout string `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
|
||||
EnableWatchdogBusy bool `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
|
||||
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
|
||||
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
|
||||
DisableGalleryEndpoint bool `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
|
||||
LoadToMemory []string `env:"LOCALAI_LOAD_TO_MEMORY,LOAD_TO_MEMORY" help:"A list of models to load into memory at startup" group:"models"`
|
||||
}
|
||||
|
||||
func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
@@ -97,7 +101,11 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
|
||||
config.WithOpaqueErrors(r.OpaqueErrors),
|
||||
config.WithEnforcedPredownloadScans(!r.DisablePredownloadScan),
|
||||
config.WithSubtleKeyComparison(r.UseSubtleKeyComparison),
|
||||
config.WithDisableApiKeyRequirementForHttpGet(r.DisableApiKeyRequirementForHttpGet),
|
||||
config.WithHttpGetExemptedEndpoints(r.HttpGetExemptedEndpoints),
|
||||
config.WithP2PNetworkID(r.Peer2PeerNetworkID),
|
||||
config.WithLoadToMemory(r.LoadToMemory),
|
||||
}
|
||||
|
||||
token := ""
|
||||
|
||||
@@ -85,13 +85,14 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
|
||||
|
||||
options := config.BackendConfig{}
|
||||
options.SetDefaults()
|
||||
options.Backend = t.Backend
|
||||
|
||||
var inputFile *string
|
||||
if t.InputFile != "" {
|
||||
inputFile = &t.InputFile
|
||||
}
|
||||
|
||||
filePath, _, err := backend.SoundGeneration(t.Backend, t.Model, text,
|
||||
filePath, _, err := backend.SoundGeneration(t.Model, text,
|
||||
parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
|
||||
inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)
|
||||
|
||||
|
||||
@@ -15,8 +15,9 @@ import (
|
||||
)
|
||||
|
||||
type UtilCMD struct {
|
||||
GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
|
||||
HFScan HFScanCMD `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
|
||||
GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
|
||||
HFScan HFScanCMD `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
|
||||
UsecaseHeuristic UsecaseHeuristicCMD `cmd:"" name:"usecase-heuristic" help:"Checks a specific model config and prints what usecase LocalAI will offer for it."`
|
||||
}
|
||||
|
||||
type GGUFInfoCMD struct {
|
||||
@@ -30,6 +31,11 @@ type HFScanCMD struct {
|
||||
ToScan []string `arg:""`
|
||||
}
|
||||
|
||||
type UsecaseHeuristicCMD struct {
|
||||
ConfigName string `name:"The config file to check"`
|
||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||
}
|
||||
|
||||
func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
|
||||
if u.Args == nil || len(u.Args) == 0 {
|
||||
return fmt.Errorf("no GGUF file provided")
|
||||
@@ -99,3 +105,31 @@ func (hfscmd *HFScanCMD) Run(ctx *cliContext.Context) error {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func (uhcmd *UsecaseHeuristicCMD) Run(ctx *cliContext.Context) error {
|
||||
if len(uhcmd.ConfigName) == 0 {
|
||||
log.Error().Msg("ConfigName is a required parameter")
|
||||
return fmt.Errorf("config name is a required parameter")
|
||||
}
|
||||
if len(uhcmd.ModelsPath) == 0 {
|
||||
log.Error().Msg("ModelsPath is a required parameter")
|
||||
return fmt.Errorf("model path is a required parameter")
|
||||
}
|
||||
bcl := config.NewBackendConfigLoader(uhcmd.ModelsPath)
|
||||
err := bcl.LoadBackendConfig(uhcmd.ConfigName)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Str("ConfigName", uhcmd.ConfigName).Msg("error while loading backend")
|
||||
return err
|
||||
}
|
||||
bc, exists := bcl.GetBackendConfig(uhcmd.ConfigName)
|
||||
if !exists {
|
||||
log.Error().Str("ConfigName", uhcmd.ConfigName).Msg("ConfigName not found")
|
||||
}
|
||||
for name, uc := range config.GetAllBackendConfigUsecases() {
|
||||
if bc.HasUsecases(uc) {
|
||||
log.Info().Str("Usecase", name)
|
||||
}
|
||||
}
|
||||
log.Info().Msg("---")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"embed"
|
||||
"encoding/json"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
||||
@@ -16,7 +17,6 @@ type ApplicationConfig struct {
|
||||
ModelPath string
|
||||
LibPath string
|
||||
UploadLimitMB, Threads, ContextSize int
|
||||
DisableWebUI bool
|
||||
F16 bool
|
||||
Debug bool
|
||||
ImageDir string
|
||||
@@ -31,11 +31,18 @@ type ApplicationConfig struct {
|
||||
PreloadModelsFromPath string
|
||||
CORSAllowOrigins string
|
||||
ApiKeys []string
|
||||
EnforcePredownloadScans bool
|
||||
OpaqueErrors bool
|
||||
P2PToken string
|
||||
P2PNetworkID string
|
||||
|
||||
DisableWebUI bool
|
||||
EnforcePredownloadScans bool
|
||||
OpaqueErrors bool
|
||||
UseSubtleKeyComparison bool
|
||||
DisableApiKeyRequirementForHttpGet bool
|
||||
HttpGetExemptedEndpoints []*regexp.Regexp
|
||||
DisableGalleryEndpoint bool
|
||||
LoadToMemory []string
|
||||
|
||||
ModelLibraryURL string
|
||||
|
||||
Galleries []Gallery
|
||||
@@ -57,8 +64,6 @@ type ApplicationConfig struct {
|
||||
ModelsURL []string
|
||||
|
||||
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
|
||||
|
||||
DisableGalleryEndpoint bool
|
||||
}
|
||||
|
||||
type AppOption func(*ApplicationConfig)
|
||||
@@ -327,6 +332,38 @@ func WithOpaqueErrors(opaque bool) AppOption {
|
||||
}
|
||||
}
|
||||
|
||||
func WithLoadToMemory(models []string) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.LoadToMemory = models
|
||||
}
|
||||
}
|
||||
|
||||
func WithSubtleKeyComparison(subtle bool) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.UseSubtleKeyComparison = subtle
|
||||
}
|
||||
}
|
||||
|
||||
func WithDisableApiKeyRequirementForHttpGet(required bool) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.DisableApiKeyRequirementForHttpGet = required
|
||||
}
|
||||
}
|
||||
|
||||
func WithHttpGetExemptedEndpoints(endpoints []string) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.HttpGetExemptedEndpoints = []*regexp.Regexp{}
|
||||
for _, epr := range endpoints {
|
||||
r, err := regexp.Compile(epr)
|
||||
if err == nil && r != nil {
|
||||
o.HttpGetExemptedEndpoints = append(o.HttpGetExemptedEndpoints, r)
|
||||
} else {
|
||||
log.Warn().Err(err).Str("regex", epr).Msg("Error while compiling HTTP Get Exemption regex, skipping this entry.")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ToConfigLoaderOptions returns a slice of ConfigLoader Option.
|
||||
// Some options defined at the application level are going to be passed as defaults for
|
||||
// all the configuration for the models.
|
||||
|
||||
@@ -3,11 +3,13 @@ package config
|
||||
import (
|
||||
"os"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
"github.com/mudler/LocalAI/pkg/functions"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -27,13 +29,15 @@ type BackendConfig struct {
|
||||
schema.PredictionOptions `yaml:"parameters"`
|
||||
Name string `yaml:"name"`
|
||||
|
||||
F16 *bool `yaml:"f16"`
|
||||
Threads *int `yaml:"threads"`
|
||||
Debug *bool `yaml:"debug"`
|
||||
Roles map[string]string `yaml:"roles"`
|
||||
Embeddings *bool `yaml:"embeddings"`
|
||||
Backend string `yaml:"backend"`
|
||||
TemplateConfig TemplateConfig `yaml:"template"`
|
||||
F16 *bool `yaml:"f16"`
|
||||
Threads *int `yaml:"threads"`
|
||||
Debug *bool `yaml:"debug"`
|
||||
Roles map[string]string `yaml:"roles"`
|
||||
Embeddings *bool `yaml:"embeddings"`
|
||||
Backend string `yaml:"backend"`
|
||||
TemplateConfig TemplateConfig `yaml:"template"`
|
||||
KnownUsecaseStrings []string `yaml:"known_usecases"`
|
||||
KnownUsecases *BackendConfigUsecases `yaml:"-"`
|
||||
|
||||
PromptStrings, InputStrings []string `yaml:"-"`
|
||||
InputToken [][]int `yaml:"-"`
|
||||
@@ -192,6 +196,21 @@ type TemplateConfig struct {
|
||||
// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
|
||||
// It defaults to \n
|
||||
JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
|
||||
|
||||
Video string `yaml:"video"`
|
||||
Image string `yaml:"image"`
|
||||
Audio string `yaml:"audio"`
|
||||
}
|
||||
|
||||
func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
|
||||
type BCAlias BackendConfig
|
||||
var aux BCAlias
|
||||
if err := value.Decode(&aux); err != nil {
|
||||
return err
|
||||
}
|
||||
*c = BackendConfig(aux)
|
||||
c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *BackendConfig) SetFunctionCallString(s string) {
|
||||
@@ -410,3 +429,121 @@ func (c *BackendConfig) Validate() bool {
|
||||
func (c *BackendConfig) HasTemplate() bool {
|
||||
return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != ""
|
||||
}
|
||||
|
||||
type BackendConfigUsecases int
|
||||
|
||||
const (
|
||||
FLAG_ANY BackendConfigUsecases = 0b000000000
|
||||
FLAG_CHAT BackendConfigUsecases = 0b000000001
|
||||
FLAG_COMPLETION BackendConfigUsecases = 0b000000010
|
||||
FLAG_EDIT BackendConfigUsecases = 0b000000100
|
||||
FLAG_EMBEDDINGS BackendConfigUsecases = 0b000001000
|
||||
FLAG_RERANK BackendConfigUsecases = 0b000010000
|
||||
FLAG_IMAGE BackendConfigUsecases = 0b000100000
|
||||
FLAG_TRANSCRIPT BackendConfigUsecases = 0b001000000
|
||||
FLAG_TTS BackendConfigUsecases = 0b010000000
|
||||
FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000
|
||||
|
||||
// Common Subsets
|
||||
FLAG_LLM BackendConfigUsecases = FLAG_CHAT & FLAG_COMPLETION & FLAG_EDIT
|
||||
)
|
||||
|
||||
func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
|
||||
return map[string]BackendConfigUsecases{
|
||||
"FLAG_ANY": FLAG_ANY,
|
||||
"FLAG_CHAT": FLAG_CHAT,
|
||||
"FLAG_COMPLETION": FLAG_COMPLETION,
|
||||
"FLAG_EDIT": FLAG_EDIT,
|
||||
"FLAG_EMBEDDINGS": FLAG_EMBEDDINGS,
|
||||
"FLAG_RERANK": FLAG_RERANK,
|
||||
"FLAG_IMAGE": FLAG_IMAGE,
|
||||
"FLAG_TRANSCRIPT": FLAG_TRANSCRIPT,
|
||||
"FLAG_TTS": FLAG_TTS,
|
||||
"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
|
||||
"FLAG_LLM": FLAG_LLM,
|
||||
}
|
||||
}
|
||||
|
||||
func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
|
||||
if len(input) == 0 {
|
||||
return nil
|
||||
}
|
||||
result := FLAG_ANY
|
||||
flags := GetAllBackendConfigUsecases()
|
||||
for _, str := range input {
|
||||
flag, exists := flags["FLAG_"+strings.ToUpper(str)]
|
||||
if exists {
|
||||
result |= flag
|
||||
}
|
||||
}
|
||||
return &result
|
||||
}
|
||||
|
||||
// HasUsecases examines a BackendConfig and determines which endpoints have a chance of success.
|
||||
func (c *BackendConfig) HasUsecases(u BackendConfigUsecases) bool {
|
||||
if (c.KnownUsecases != nil) && ((u & *c.KnownUsecases) == u) {
|
||||
return true
|
||||
}
|
||||
return c.GuessUsecases(u)
|
||||
}
|
||||
|
||||
// GuessUsecases is a **heuristic based** function, as the backend in question may not be loaded yet, and the config may not record what it's useful at.
|
||||
// In its current state, this function should ideally check for properties of the config like templates, rather than the direct backend name checks for the lower half.
|
||||
// This avoids the maintenance burden of updating this list for each new backend - but unfortunately, that's the best option for some services currently.
|
||||
func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
|
||||
if (u & FLAG_CHAT) == FLAG_CHAT {
|
||||
if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_COMPLETION) == FLAG_COMPLETION {
|
||||
if c.TemplateConfig.Completion == "" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_EDIT) == FLAG_EDIT {
|
||||
if c.TemplateConfig.Edit == "" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_EMBEDDINGS) == FLAG_EMBEDDINGS {
|
||||
if c.Embeddings == nil || !*c.Embeddings {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_IMAGE) == FLAG_IMAGE {
|
||||
imageBackends := []string{"diffusers", "tinydream", "stablediffusion"}
|
||||
if !slices.Contains(imageBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
|
||||
if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
}
|
||||
if (u & FLAG_RERANK) == FLAG_RERANK {
|
||||
if c.Backend != "rerankers" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_TRANSCRIPT) == FLAG_TRANSCRIPT {
|
||||
if c.Backend != "whisper" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_TTS) == FLAG_TTS {
|
||||
ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
|
||||
if !slices.Contains(ttsBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if (u & FLAG_SOUND_GENERATION) == FLAG_SOUND_GENERATION {
|
||||
if c.Backend != "transformers-musicgen" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
35
core/config/backend_config_filter.go
Normal file
35
core/config/backend_config_filter.go
Normal file
@@ -0,0 +1,35 @@
|
||||
package config
|
||||
|
||||
import "regexp"
|
||||
|
||||
type BackendConfigFilterFn func(string, *BackendConfig) bool
|
||||
|
||||
func NoFilterFn(_ string, _ *BackendConfig) bool { return true }
|
||||
|
||||
func BuildNameFilterFn(filter string) (BackendConfigFilterFn, error) {
|
||||
if filter == "" {
|
||||
return NoFilterFn, nil
|
||||
}
|
||||
rxp, err := regexp.Compile(filter)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return func(name string, config *BackendConfig) bool {
|
||||
if config != nil {
|
||||
return rxp.MatchString(config.Name)
|
||||
}
|
||||
return rxp.MatchString(name)
|
||||
}, nil
|
||||
}
|
||||
|
||||
func BuildUsecaseFilterFn(usecases BackendConfigUsecases) BackendConfigFilterFn {
|
||||
if usecases == FLAG_ANY {
|
||||
return NoFilterFn
|
||||
}
|
||||
return func(name string, config *BackendConfig) bool {
|
||||
if config == nil {
|
||||
return false // TODO: Potentially make this a param, for now, no known usecase to include
|
||||
}
|
||||
return config.HasUsecases(usecases)
|
||||
}
|
||||
}
|
||||
@@ -201,6 +201,26 @@ func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
|
||||
return res
|
||||
}
|
||||
|
||||
func (bcl *BackendConfigLoader) GetBackendConfigsByFilter(filter BackendConfigFilterFn) []BackendConfig {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
var res []BackendConfig
|
||||
|
||||
if filter == nil {
|
||||
filter = NoFilterFn
|
||||
}
|
||||
|
||||
for n, v := range bcl.configs {
|
||||
if filter(n, &v) {
|
||||
res = append(res, v)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: I don't think this one needs to Sort on name... but we'll see what breaks.
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func (bcl *BackendConfigLoader) RemoveBackendConfig(m string) {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
|
||||
@@ -19,12 +19,17 @@ var _ = Describe("Test cases for config related functions", func() {
|
||||
`backend: "../foo-bar"
|
||||
name: "foo"
|
||||
parameters:
|
||||
model: "foo-bar"`)
|
||||
model: "foo-bar"
|
||||
known_usecases:
|
||||
- chat
|
||||
- COMPLETION
|
||||
`)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
config, err := readBackendConfigFromFile(tmp.Name())
|
||||
Expect(err).To(BeNil())
|
||||
Expect(config).ToNot(BeNil())
|
||||
Expect(config.Validate()).To(BeFalse())
|
||||
Expect(config.KnownUsecases).ToNot(BeNil())
|
||||
})
|
||||
It("Test Validate", func() {
|
||||
tmp, err := os.CreateTemp("", "config.yaml")
|
||||
@@ -61,4 +66,99 @@ parameters:
|
||||
Expect(config.Validate()).To(BeTrue())
|
||||
})
|
||||
})
|
||||
It("Properly handles backend usecase matching", func() {
|
||||
|
||||
a := BackendConfig{
|
||||
Name: "a",
|
||||
}
|
||||
Expect(a.HasUsecases(FLAG_ANY)).To(BeTrue()) // FLAG_ANY just means the config _exists_ essentially.
|
||||
|
||||
b := BackendConfig{
|
||||
Name: "b",
|
||||
Backend: "stablediffusion",
|
||||
}
|
||||
Expect(b.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(b.HasUsecases(FLAG_IMAGE)).To(BeTrue())
|
||||
Expect(b.HasUsecases(FLAG_CHAT)).To(BeFalse())
|
||||
|
||||
c := BackendConfig{
|
||||
Name: "c",
|
||||
Backend: "llama-cpp",
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "chat",
|
||||
},
|
||||
}
|
||||
Expect(c.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(c.HasUsecases(FLAG_IMAGE)).To(BeFalse())
|
||||
Expect(c.HasUsecases(FLAG_COMPLETION)).To(BeFalse())
|
||||
Expect(c.HasUsecases(FLAG_CHAT)).To(BeTrue())
|
||||
|
||||
d := BackendConfig{
|
||||
Name: "d",
|
||||
Backend: "llama-cpp",
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "chat",
|
||||
Completion: "completion",
|
||||
},
|
||||
}
|
||||
Expect(d.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(d.HasUsecases(FLAG_IMAGE)).To(BeFalse())
|
||||
Expect(d.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
|
||||
Expect(d.HasUsecases(FLAG_CHAT)).To(BeTrue())
|
||||
|
||||
trueValue := true
|
||||
e := BackendConfig{
|
||||
Name: "e",
|
||||
Backend: "llama-cpp",
|
||||
TemplateConfig: TemplateConfig{
|
||||
Completion: "completion",
|
||||
},
|
||||
Embeddings: &trueValue,
|
||||
}
|
||||
|
||||
Expect(e.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(e.HasUsecases(FLAG_IMAGE)).To(BeFalse())
|
||||
Expect(e.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
|
||||
Expect(e.HasUsecases(FLAG_CHAT)).To(BeFalse())
|
||||
Expect(e.HasUsecases(FLAG_EMBEDDINGS)).To(BeTrue())
|
||||
|
||||
f := BackendConfig{
|
||||
Name: "f",
|
||||
Backend: "piper",
|
||||
}
|
||||
Expect(f.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(f.HasUsecases(FLAG_TTS)).To(BeTrue())
|
||||
Expect(f.HasUsecases(FLAG_CHAT)).To(BeFalse())
|
||||
|
||||
g := BackendConfig{
|
||||
Name: "g",
|
||||
Backend: "whisper",
|
||||
}
|
||||
Expect(g.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(g.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
|
||||
Expect(g.HasUsecases(FLAG_TTS)).To(BeFalse())
|
||||
|
||||
h := BackendConfig{
|
||||
Name: "h",
|
||||
Backend: "transformers-musicgen",
|
||||
}
|
||||
Expect(h.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(h.HasUsecases(FLAG_TRANSCRIPT)).To(BeFalse())
|
||||
Expect(h.HasUsecases(FLAG_TTS)).To(BeTrue())
|
||||
Expect(h.HasUsecases(FLAG_SOUND_GENERATION)).To(BeTrue())
|
||||
|
||||
knownUsecases := FLAG_CHAT | FLAG_COMPLETION
|
||||
i := BackendConfig{
|
||||
Name: "i",
|
||||
Backend: "whisper",
|
||||
// Earlier test checks parsing, this just needs to set final values
|
||||
KnownUsecases: &knownUsecases,
|
||||
}
|
||||
Expect(i.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(i.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
|
||||
Expect(i.HasUsecases(FLAG_TTS)).To(BeFalse())
|
||||
Expect(i.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
|
||||
Expect(i.HasUsecases(FLAG_CHAT)).To(BeTrue())
|
||||
|
||||
})
|
||||
})
|
||||
|
||||
@@ -132,7 +132,7 @@ func AvailableGalleryModels(galleries []config.Gallery, basePath string) ([]*Gal
|
||||
func findGalleryURLFromReferenceURL(url string, basePath string) (string, error) {
|
||||
var refFile string
|
||||
uri := downloader.URI(url)
|
||||
err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
|
||||
err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
|
||||
refFile = string(d)
|
||||
if len(refFile) == 0 {
|
||||
return fmt.Errorf("invalid reference file at url %s: %s", url, d)
|
||||
@@ -156,7 +156,7 @@ func getGalleryModels(gallery config.Gallery, basePath string) ([]*GalleryModel,
|
||||
}
|
||||
uri := downloader.URI(gallery.URL)
|
||||
|
||||
err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
|
||||
err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
|
||||
return yaml.Unmarshal(d, &models)
|
||||
})
|
||||
if err != nil {
|
||||
|
||||
@@ -69,7 +69,7 @@ type PromptTemplate struct {
|
||||
func GetGalleryConfigFromURL(url string, basePath string) (Config, error) {
|
||||
var config Config
|
||||
uri := downloader.URI(url)
|
||||
err := uri.DownloadAndUnmarshal(basePath, func(url string, d []byte) error {
|
||||
err := uri.DownloadWithCallback(basePath, func(url string, d []byte) error {
|
||||
return yaml.Unmarshal(d, &config)
|
||||
})
|
||||
if err != nil {
|
||||
|
||||
@@ -3,13 +3,15 @@ package http
|
||||
import (
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/dave-gray101/v2keyauth"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
|
||||
"github.com/mudler/LocalAI/core/http/endpoints/localai"
|
||||
"github.com/mudler/LocalAI/core/http/endpoints/openai"
|
||||
"github.com/mudler/LocalAI/core/http/middleware"
|
||||
"github.com/mudler/LocalAI/core/http/routes"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
@@ -29,24 +31,6 @@ import (
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func readAuthHeader(c *fiber.Ctx) string {
|
||||
authHeader := c.Get("Authorization")
|
||||
|
||||
// elevenlabs
|
||||
xApiKey := c.Get("xi-api-key")
|
||||
if xApiKey != "" {
|
||||
authHeader = "Bearer " + xApiKey
|
||||
}
|
||||
|
||||
// anthropic
|
||||
xApiKey = c.Get("x-api-key")
|
||||
if xApiKey != "" {
|
||||
authHeader = "Bearer " + xApiKey
|
||||
}
|
||||
|
||||
return authHeader
|
||||
}
|
||||
|
||||
// Embed a directory
|
||||
//
|
||||
//go:embed static/*
|
||||
@@ -137,37 +121,17 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
})
|
||||
}
|
||||
|
||||
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
|
||||
auth := func(c *fiber.Ctx) error {
|
||||
if len(appConfig.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
// Health Checks should always be exempt from auth, so register these first
|
||||
routes.HealthRoutes(app)
|
||||
|
||||
if len(appConfig.ApiKeys) == 0 {
|
||||
return c.Next()
|
||||
}
|
||||
|
||||
authHeader := readAuthHeader(c)
|
||||
if authHeader == "" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
|
||||
}
|
||||
|
||||
// If it's a bearer token
|
||||
authHeaderParts := strings.Split(authHeader, " ")
|
||||
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
|
||||
}
|
||||
|
||||
apiKey := authHeaderParts[1]
|
||||
for _, key := range appConfig.ApiKeys {
|
||||
if apiKey == key {
|
||||
return c.Next()
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
|
||||
kaConfig, err := middleware.GetKeyAuthConfig(appConfig)
|
||||
if err != nil || kaConfig == nil {
|
||||
return nil, fmt.Errorf("failed to create key auth config: %w", err)
|
||||
}
|
||||
|
||||
// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
|
||||
app.Use(v2keyauth.New(*kaConfig))
|
||||
|
||||
if appConfig.CORS {
|
||||
var c func(ctx *fiber.Ctx) error
|
||||
if appConfig.CORSAllowOrigins == "" {
|
||||
@@ -192,13 +156,13 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
|
||||
galleryService := services.NewGalleryService(appConfig)
|
||||
galleryService.Start(appConfig.Context, cl)
|
||||
|
||||
routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig, auth)
|
||||
routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService, auth)
|
||||
routes.RegisterOpenAIRoutes(app, cl, ml, appConfig, auth)
|
||||
routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig)
|
||||
routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService)
|
||||
routes.RegisterOpenAIRoutes(app, cl, ml, appConfig)
|
||||
if !appConfig.DisableWebUI {
|
||||
routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService, auth)
|
||||
routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService)
|
||||
}
|
||||
routes.RegisterJINARoutes(app, cl, ml, appConfig, auth)
|
||||
routes.RegisterJINARoutes(app, cl, ml, appConfig)
|
||||
|
||||
httpFS := http.FS(embedDirStatic)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/mudler/LocalAI/core/http"
|
||||
@@ -31,6 +32,9 @@ import (
|
||||
"github.com/sashabaranov/go-openai/jsonschema"
|
||||
)
|
||||
|
||||
const apiKey = "joshua"
|
||||
const bearerKey = "Bearer " + apiKey
|
||||
|
||||
const testPrompt = `### System:
|
||||
You are an AI assistant that follows instruction extremely well. Help as much as you can.
|
||||
|
||||
@@ -50,11 +54,19 @@ type modelApplyRequest struct {
|
||||
|
||||
func getModelStatus(url string) (response map[string]interface{}) {
|
||||
// Create the HTTP request
|
||||
resp, err := http.Get(url)
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", bearerKey)
|
||||
if err != nil {
|
||||
fmt.Println("Error creating request:", err)
|
||||
return
|
||||
}
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
fmt.Println("Error sending request:", err)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
@@ -72,14 +84,15 @@ func getModelStatus(url string) (response map[string]interface{}) {
|
||||
return
|
||||
}
|
||||
|
||||
func getModels(url string) (response []gallery.GalleryModel) {
|
||||
func getModels(url string) ([]gallery.GalleryModel, error) {
|
||||
response := []gallery.GalleryModel{}
|
||||
uri := downloader.URI(url)
|
||||
// TODO: No tests currently seem to exercise file:// urls. Fix?
|
||||
uri.DownloadAndUnmarshal("", func(url string, i []byte) error {
|
||||
err := uri.DownloadWithAuthorizationAndCallback("", bearerKey, func(url string, i []byte) error {
|
||||
// Unmarshal YAML data into a struct
|
||||
return json.Unmarshal(i, &response)
|
||||
})
|
||||
return
|
||||
return response, err
|
||||
}
|
||||
|
||||
func postModelApplyRequest(url string, request modelApplyRequest) (response map[string]interface{}) {
|
||||
@@ -101,6 +114,7 @@ func postModelApplyRequest(url string, request modelApplyRequest) (response map[
|
||||
return
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", bearerKey)
|
||||
|
||||
// Make the request
|
||||
client := &http.Client{}
|
||||
@@ -140,6 +154,7 @@ func postRequestJSON[B any](url string, bodyJson *B) error {
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", bearerKey)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
@@ -175,6 +190,7 @@ func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", bearerKey)
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
@@ -195,6 +211,35 @@ func postRequestResponseJSON[B1 any, B2 any](url string, reqJson *B1, respJson *
|
||||
return json.Unmarshal(body, respJson)
|
||||
}
|
||||
|
||||
func postInvalidRequest(url string) (error, int) {
|
||||
|
||||
req, err := http.NewRequest("POST", url, bytes.NewBufferString("invalid request"))
|
||||
if err != nil {
|
||||
return err, -1
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return err, -1
|
||||
}
|
||||
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return err, -1
|
||||
}
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 400 {
|
||||
return fmt.Errorf("unexpected status code: %d, body: %s", resp.StatusCode, string(body)), resp.StatusCode
|
||||
}
|
||||
|
||||
return nil, resp.StatusCode
|
||||
}
|
||||
|
||||
//go:embed backend-assets/*
|
||||
var backendAssets embed.FS
|
||||
|
||||
@@ -260,6 +305,7 @@ var _ = Describe("API test", func() {
|
||||
config.WithContext(c),
|
||||
config.WithGalleries(galleries),
|
||||
config.WithModelPath(modelDir),
|
||||
config.WithApiKeys([]string{apiKey}),
|
||||
config.WithBackendAssets(backendAssets),
|
||||
config.WithBackendAssetsOutput(backendAssetsDir))...)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
@@ -269,7 +315,7 @@ var _ = Describe("API test", func() {
|
||||
|
||||
go app.Listen("127.0.0.1:9090")
|
||||
|
||||
defaultConfig := openai.DefaultConfig("")
|
||||
defaultConfig := openai.DefaultConfig(apiKey)
|
||||
defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
|
||||
|
||||
client2 = openaigo.NewClient("")
|
||||
@@ -295,10 +341,19 @@ var _ = Describe("API test", func() {
|
||||
Expect(err).To(HaveOccurred())
|
||||
})
|
||||
|
||||
Context("Auth Tests", func() {
|
||||
It("Should fail if the api key is missing", func() {
|
||||
err, sc := postInvalidRequest("http://127.0.0.1:9090/models/available")
|
||||
Expect(err).ToNot(BeNil())
|
||||
Expect(sc).To(Equal(403))
|
||||
})
|
||||
})
|
||||
|
||||
Context("Applying models", func() {
|
||||
|
||||
It("applies models from a gallery", func() {
|
||||
models := getModels("http://127.0.0.1:9090/models/available")
|
||||
models, err := getModels("http://127.0.0.1:9090/models/available")
|
||||
Expect(err).To(BeNil())
|
||||
Expect(len(models)).To(Equal(2), fmt.Sprint(models))
|
||||
Expect(models[0].Installed).To(BeFalse(), fmt.Sprint(models))
|
||||
Expect(models[1].Installed).To(BeFalse(), fmt.Sprint(models))
|
||||
@@ -331,7 +386,8 @@ var _ = Describe("API test", func() {
|
||||
Expect(content["backend"]).To(Equal("bert-embeddings"))
|
||||
Expect(content["foo"]).To(Equal("bar"))
|
||||
|
||||
models = getModels("http://127.0.0.1:9090/models/available")
|
||||
models, err = getModels("http://127.0.0.1:9090/models/available")
|
||||
Expect(err).To(BeNil())
|
||||
Expect(len(models)).To(Equal(2), fmt.Sprint(models))
|
||||
Expect(models[0].Name).To(Or(Equal("bert"), Equal("bert2")))
|
||||
Expect(models[1].Name).To(Or(Equal("bert"), Equal("bert2")))
|
||||
@@ -895,7 +951,7 @@ var _ = Describe("API test", func() {
|
||||
openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices) > 0).To(BeTrue())
|
||||
Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
|
||||
Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
|
||||
|
||||
stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
@@ -914,7 +970,7 @@ var _ = Describe("API test", func() {
|
||||
tokens++
|
||||
}
|
||||
Expect(text).ToNot(BeEmpty())
|
||||
Expect(text).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
|
||||
Expect(strings.ToLower(text)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
|
||||
|
||||
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
|
||||
})
|
||||
|
||||
@@ -19,14 +19,16 @@ func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *mo
|
||||
if ctx.Params("model") != "" {
|
||||
modelInput = ctx.Params("model")
|
||||
}
|
||||
|
||||
if ctx.Query("model") != "" {
|
||||
modelInput = ctx.Query("model")
|
||||
}
|
||||
// Set model from bearer token, if available
|
||||
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
|
||||
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // Reduced duplicate characters of Bearer
|
||||
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
|
||||
|
||||
// If no model was specified, take the first available
|
||||
if modelInput == "" && !bearerExists && firstModel {
|
||||
models, _ := services.ListModels(cl, loader, "", true)
|
||||
models, _ := services.ListModels(cl, loader, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
if len(models) > 0 {
|
||||
modelInput = models[0]
|
||||
log.Debug().Msgf("No model specified, using: %s", modelInput)
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
|
||||
"github.com/chasefleming/elem-go"
|
||||
"github.com/chasefleming/elem-go/attrs"
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"github.com/mudler/LocalAI/core/gallery"
|
||||
"github.com/mudler/LocalAI/core/p2p"
|
||||
"github.com/mudler/LocalAI/core/services"
|
||||
@@ -41,7 +42,7 @@ func DoneProgress(galleryID, text string, showDelete bool) string {
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
|
||||
),
|
||||
elem.If(showDelete, deleteButton(galleryID, modelName), reInstallButton(galleryID)),
|
||||
).Render()
|
||||
@@ -57,7 +58,7 @@ func ErrorProgress(err, galleryName string) string {
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text("Error "+err),
|
||||
elem.Text("Error "+bluemonday.StrictPolicy().Sanitize(err)),
|
||||
),
|
||||
installButton(galleryName),
|
||||
).Render()
|
||||
@@ -170,7 +171,7 @@ func P2PNodeBoxes(nodes []p2p.NodeData) string {
|
||||
attrs.Props{
|
||||
"class": "text-gray-200 font-semibold ml-2 mr-1",
|
||||
},
|
||||
elem.Text(n.ID),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(n.ID)),
|
||||
),
|
||||
elem.Text("Status: "),
|
||||
elem.If(
|
||||
@@ -227,7 +228,7 @@ func StartProgressBar(uid, progress, text string) string {
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)), //Perhaps overly defensive
|
||||
elem.Div(attrs.Props{
|
||||
"hx-get": "/browse/job/progress/" + uid,
|
||||
"hx-trigger": "every 600ms",
|
||||
@@ -249,9 +250,7 @@ func cardSpan(text, icon string) elem.Node {
|
||||
"class": icon + " pr-2",
|
||||
}),
|
||||
|
||||
elem.Text(text),
|
||||
|
||||
//elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -285,11 +284,9 @@ func searchableElement(text, icon string) elem.Node {
|
||||
elem.I(attrs.Props{
|
||||
"class": icon + " pr-2",
|
||||
}),
|
||||
elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
|
||||
),
|
||||
),
|
||||
|
||||
//elem.Text(text),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -303,7 +300,7 @@ func link(text, url string) elem.Node {
|
||||
elem.I(attrs.Props{
|
||||
"class": "fas fa-link pr-2",
|
||||
}),
|
||||
elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
|
||||
)
|
||||
}
|
||||
func installButton(galleryName string) elem.Node {
|
||||
@@ -387,13 +384,13 @@ func ListModels(models []*gallery.GalleryModel, processTracker ProcessTracker, g
|
||||
attrs.Props{
|
||||
"class": "mb-2 text-xl font-bold leading-tight",
|
||||
},
|
||||
elem.Text(m.Name),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Name)),
|
||||
),
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "mb-4 text-sm [&:not(:hover)]:truncate text-base",
|
||||
},
|
||||
elem.Text(m.Description),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Description)),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -55,7 +55,7 @@ func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
|
||||
}
|
||||
|
||||
// TODO: Support uploading files?
|
||||
filePath, _, err := backend.SoundGeneration(cfg.Backend, modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
|
||||
filePath, _, err := backend.SoundGeneration(modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -45,13 +45,13 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||
|
||||
if input.Backend != "" {
|
||||
@@ -64,7 +64,7 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
||||
Documents: req.Documents,
|
||||
}
|
||||
|
||||
results, err := backend.Rerank(cfg.Backend, modelFile, request, ml, appConfig, *cfg)
|
||||
results, err := backend.Rerank(modelFile, request, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
60
core/http/endpoints/localai/get_token_metrics.go
Normal file
60
core/http/endpoints/localai/get_token_metrics.go
Normal file
@@ -0,0 +1,60 @@
|
||||
package localai
|
||||
|
||||
import (
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
fiberContext "github.com/mudler/LocalAI/core/http/ctx"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
// TokenMetricsEndpoint is an endpoint to get TokensProcessed Per Second for Active SlotID
|
||||
//
|
||||
// @Summary Get TokenMetrics for Active Slot.
|
||||
// @Accept json
|
||||
// @Produce audio/x-wav
|
||||
// @Success 200 {string} binary "generated audio/wav file"
|
||||
// @Router /v1/tokenMetrics [get]
|
||||
// @Router /tokenMetrics [get]
|
||||
func TokenMetricsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
input := new(schema.TokenMetricsRequest)
|
||||
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
}
|
||||
|
||||
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
config.LoadOptionDebug(appConfig.Debug),
|
||||
config.LoadOptionThreads(appConfig.Threads),
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
log.Err(err)
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
log.Debug().Msgf("Token Metrics for model: %s", modelFile)
|
||||
|
||||
response, err := backend.TokenMetrics(modelFile, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.JSON(response)
|
||||
}
|
||||
}
|
||||
@@ -17,12 +17,14 @@ func SystemInformations(ml *model.ModelLoader, appConfig *config.ApplicationConf
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
loadedModels := ml.ListModels()
|
||||
for b := range appConfig.ExternalGRPCBackends {
|
||||
availableBackends = append(availableBackends, b)
|
||||
}
|
||||
return c.JSON(
|
||||
schema.SystemInformationResponse{
|
||||
Backends: availableBackends,
|
||||
Models: loadedModels,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
58
core/http/endpoints/localai/tokenize.go
Normal file
58
core/http/endpoints/localai/tokenize.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package localai
|
||||
|
||||
import (
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
fiberContext "github.com/mudler/LocalAI/core/http/ctx"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// TokenizeEndpoint exposes a REST API to tokenize the content
|
||||
// @Summary Tokenize the input.
|
||||
// @Success 200 {object} schema.TokenizeResponse "Response"
|
||||
// @Router /v1/tokenize [post]
|
||||
func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
input := new(schema.TokenizeRequest)
|
||||
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
}
|
||||
|
||||
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
config.LoadOptionDebug(appConfig.Debug),
|
||||
config.LoadOptionThreads(appConfig.Threads),
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
log.Err(err)
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||
|
||||
tokenResponse, err := backend.ModelTokenize(input.Content, ml, *cfg, appConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
c.JSON(tokenResponse)
|
||||
return nil
|
||||
|
||||
}
|
||||
}
|
||||
@@ -13,15 +13,10 @@ import (
|
||||
func WelcomeEndpoint(appConfig *config.ApplicationConfig,
|
||||
cl *config.BackendConfigLoader, ml *model.ModelLoader, modelStatus func() (map[string]string, map[string]string)) func(*fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
models, _ := services.ListModels(cl, ml, "", true)
|
||||
backendConfigs := cl.GetAllBackendConfigs()
|
||||
|
||||
galleryConfigs := map[string]*gallery.Config{}
|
||||
modelsWithBackendConfig := map[string]interface{}{}
|
||||
|
||||
for _, m := range backendConfigs {
|
||||
modelsWithBackendConfig[m.Name] = nil
|
||||
|
||||
cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
|
||||
if err != nil {
|
||||
continue
|
||||
@@ -29,17 +24,11 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
|
||||
galleryConfigs[m.Name] = cfg
|
||||
}
|
||||
|
||||
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
|
||||
|
||||
// Get model statuses to display in the UI the operation in progress
|
||||
processingModels, taskTypes := modelStatus()
|
||||
|
||||
modelsWithoutConfig := []string{}
|
||||
|
||||
for _, m := range models {
|
||||
if _, ok := modelsWithBackendConfig[m]; !ok {
|
||||
modelsWithoutConfig = append(modelsWithoutConfig, m)
|
||||
}
|
||||
}
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI API - " + internal.PrintableVersion(),
|
||||
"Version": internal.PrintableVersion(),
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/core/services"
|
||||
@@ -83,7 +84,7 @@ func CreateAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
|
||||
|
||||
if !modelExists(cl, ml, request.Model) {
|
||||
log.Warn().Msgf("Model: %s was not found in list of models.", request.Model)
|
||||
return c.Status(fiber.StatusBadRequest).SendString("Model " + request.Model + " not found")
|
||||
return c.Status(fiber.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Model %q not found", request.Model)))
|
||||
}
|
||||
|
||||
if request.Tools == nil {
|
||||
@@ -147,7 +148,7 @@ func ListAssistantsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoade
|
||||
// Convert string limit to integer
|
||||
limit, err := strconv.Atoi(limitQuery)
|
||||
if err != nil {
|
||||
return c.Status(http.StatusBadRequest).SendString(fmt.Sprintf("Invalid limit query value: %s", limitQuery))
|
||||
return c.Status(http.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Invalid limit query value: %s", limitQuery)))
|
||||
}
|
||||
|
||||
// Sort assistants
|
||||
@@ -225,7 +226,7 @@ func filterAssistantsAfterID(assistants []Assistant, id string) []Assistant {
|
||||
|
||||
func modelExists(cl *config.BackendConfigLoader, ml *model.ModelLoader, modelName string) (found bool) {
|
||||
found = false
|
||||
models, err := services.ListModels(cl, ml, "", true)
|
||||
models, err := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@@ -288,7 +289,7 @@ func GetAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader,
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -337,11 +338,11 @@ func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find file_id: %s", request.FileID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find file_id: %s", request.FileID)))
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find %q", assistantID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find %q", assistantID)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -442,7 +443,7 @@ func ModifyAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
|
||||
return c.Status(fiber.StatusOK).JSON(newAssistant)
|
||||
}
|
||||
}
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -513,9 +514,9 @@ func GetAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoa
|
||||
if assistantFile.ID == fileId {
|
||||
return c.Status(fiber.StatusOK).JSON(assistantFile)
|
||||
}
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId)))
|
||||
}
|
||||
}
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -161,6 +161,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
|
||||
textContentToReturn = ""
|
||||
id = uuid.New().String()
|
||||
created = int(time.Now().Unix())
|
||||
// Set CorrelationID
|
||||
correlationID := c.Get("X-Correlation-ID")
|
||||
if len(strings.TrimSpace(correlationID)) == 0 {
|
||||
correlationID = id
|
||||
}
|
||||
c.Set("X-Correlation-ID", correlationID)
|
||||
|
||||
modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
|
||||
if err != nil {
|
||||
@@ -444,6 +450,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
|
||||
c.Set("Cache-Control", "no-cache")
|
||||
c.Set("Connection", "keep-alive")
|
||||
c.Set("Transfer-Encoding", "chunked")
|
||||
c.Set("X-Correlation-ID", id)
|
||||
|
||||
responses := make(chan schema.OpenAIResponse)
|
||||
|
||||
@@ -640,8 +647,16 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
|
||||
for _, m := range input.Messages {
|
||||
images = append(images, m.StringImages...)
|
||||
}
|
||||
videos := []string{}
|
||||
for _, m := range input.Messages {
|
||||
videos = append(videos, m.StringVideos...)
|
||||
}
|
||||
audios := []string{}
|
||||
for _, m := range input.Messages {
|
||||
audios = append(audios, m.StringAudios...)
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
|
||||
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("model inference failed")
|
||||
return "", err
|
||||
|
||||
@@ -57,6 +57,8 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
||||
}
|
||||
|
||||
return func(c *fiber.Ctx) error {
|
||||
// Add Correlation
|
||||
c.Set("X-Correlation-ID", id)
|
||||
modelFile, input, err := readRequest(c, cl, ml, appConfig, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user