Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 11:13:31 -05:00)

Compare commits: 255 commits, cleanup_de...fix/closed
@@ -9,6 +9,7 @@
# Param 2: email
#
config_user() {
echo "Configuring git for $1 <$2>"
local gcn=$(git config --global user.name)
if [ -z "${gcn}" ]; then
echo "Setting up git user / remote"
@@ -24,6 +25,7 @@ config_user() {
# Param 2: remote url
#
config_remote() {
echo "Adding git remote and fetching $2 as $1"
local gr=$(git remote -v | grep $1)
if [ -z "${gr}" ]; then
git remote add $1 $2
@@ -32,18 +34,22 @@ config_remote() {
}

# Setup special .ssh files
#
# Prints out lines of text to make things pretty
# Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
setup_ssh() {
echo "starting ~/.ssh directory setup..."
mkdir -p "${HOME}.ssh"
chmod 0700 "${HOME}/.ssh"
echo "-----"
local files=("$@")
for file in "${files[@]}"; then
for file in "${files[@]}" ; do
local cfile="/devcontainer-customization/${file}"
local hfile="~/.ssh/${file}"
local hfile="${HOME}/.ssh/${file}"
if [ ! -f "${hfile}" ]; then
echo "copying ${file}"
echo "copying \"${file}\""
cp "${cfile}" "${hfile}"
chmod 600 "${hfile}"
fi
done
ls ~/.ssh
echo "~/.ssh directory setup complete!"
}
.github/check_and_update.py: 11 changes (vendored)

@@ -29,9 +29,14 @@ def calculate_sha256(file_path):
def manual_safety_check_hf(repo_id):
scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
scan = scanResponse.json()
if scan['hasUnsafeFile']:
return scan
return None
# Check if 'hasUnsafeFile' exists in the response
if 'hasUnsafeFile' in scan:
if scan['hasUnsafeFile']:
return scan
else:
return None
else:
return None

download_type, repo_id_or_url = parse_uri(uri)
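The hardened check above amounts to guarding against a missing `hasUnsafeFile` key in the Hugging Face scan response. A minimal sketch of the equivalent logic, assuming the same endpoint and response shape shown in the diff (`dict.get` collapses the nested branches):

```python
import requests

def manual_safety_check_hf(repo_id):
    # Query the Hugging Face scan endpoint used by the checker script
    scan = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan").json()
    # 'hasUnsafeFile' may be absent from the payload, so avoid indexing it directly
    if scan.get('hasUnsafeFile'):
        return scan
    return None
```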
.github/workflows/bump_deps.yaml: 2 changes (vendored)

@@ -56,7 +56,7 @@ jobs:
rm -rfv ${{ matrix.variable }}_message.txt
rm -rfv ${{ matrix.variable }}_commit.txt
- name: Create Pull Request
uses: peter-evans/create-pull-request@v6
uses: peter-evans/create-pull-request@v7
with:
token: ${{ secrets.UPDATE_BOT_TOKEN }}
push-to-fork: ci-forks/LocalAI
.github/workflows/bump_docs.yaml: 2 changes (vendored)

@@ -17,7 +17,7 @@ jobs:
run: |
bash .github/bump_docs.sh ${{ matrix.repository }}
- name: Create Pull Request
uses: peter-evans/create-pull-request@v6
uses: peter-evans/create-pull-request@v7
with:
token: ${{ secrets.UPDATE_BOT_TOKEN }}
push-to-fork: ci-forks/LocalAI
.github/workflows/checksum_checker.yaml: 2 changes (vendored)

@@ -36,7 +36,7 @@ jobs:
sudo chmod 777 /hf_cache
bash .github/checksum_checker.sh gallery/index.yaml
- name: Create Pull Request
uses: peter-evans/create-pull-request@v6
uses: peter-evans/create-pull-request@v7
with:
token: ${{ secrets.UPDATE_BOT_TOKEN }}
push-to-fork: ci-forks/LocalAI
.github/workflows/image.yml: 117 changes (vendored)

@@ -13,6 +13,78 @@ concurrency:
cancel-in-progress: true

jobs:
hipblas-jobs:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: 2
matrix:
include:
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
self-hosted-jobs:
uses: ./.github/workflows/image_build.yml
with:
@@ -39,7 +111,7 @@ jobs:
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
matrix:
include:
# Extra images
@@ -122,29 +194,6 @@ jobs:
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
@@ -212,26 +261,6 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"

core-image-build:
uses: ./.github/workflows/image_build.yml
.github/workflows/release.yaml: 6 changes (vendored)

@@ -294,7 +294,7 @@ jobs:
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
export PATH=$PATH:$GOPATH/bin

export SKIP_GRPC_BACKEND=backend-assets/grpc/whisper
make dist
- uses: actions/upload-artifact@v4
with:
@@ -327,7 +327,7 @@ jobs:
cache: false
- name: Dependencies
run: |
brew install protobuf grpc
brew install protobuf grpc libomp llvm
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Build
@@ -336,7 +336,7 @@ jobs:
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
export PATH=$PATH:$GOPATH/bin

export CC=/opt/homebrew/opt/llvm/bin/clang
make dist
- uses: actions/upload-artifact@v4
with:
.github/workflows/secscan.yaml: 2 changes (vendored)

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@master
uses: securego/gosec@v2.21.4
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'
.github/workflows/test.yml: 14 changes (vendored)

@@ -178,13 +178,22 @@ jobs:
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
# Install protoc
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images
run: |
docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
- name: Test
run: |
LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
make run-e2e-aio
- name: Setup tmate session if tests fail
if: ${{ failure() }}
@@ -214,12 +223,13 @@ jobs:
run: go version
- name: Dependencies
run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test
run: |
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
export CC=/opt/homebrew/opt/llvm/bin/clang
# Used to run the newer GNUMake version from brew that supports --output-sync
export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
.github/workflows/update_swagger.yaml: 2 changes (vendored)

@@ -25,7 +25,7 @@ jobs:
run: |
make protogen-go swagger
- name: Create Pull Request
uses: peter-evans/create-pull-request@v6
uses: peter-evans/create-pull-request@v7
with:
token: ${{ secrets.UPDATE_BOT_TOKEN }}
push-to-fork: ci-forks/LocalAI
@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
- [Documentation](#documentation)
- [Community and Communication](#community-and-communication)

## Getting Started

### Prerequisites
@@ -54,7 +52,7 @@ If you find a bug, have a feature request, or encounter any issues, please check

## Coding Guidelines

- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.

## Testing

@@ -84,5 +82,3 @@ We are welcome the contribution of the documents, please open new PR or create a
- You can reach out via the Github issue tracker.
- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)

---
Dockerfile: 44 changes

@@ -13,7 +13,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"

RUN apt-get update && \
@@ -263,14 +263,20 @@ EOT
# In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
FROM builder-base AS builder-sd

COPY . .
COPY .git .
# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
COPY Makefile .
COPY go.mod .
COPY go.sum .
COPY backend/backend.proto ./backend/backend.proto
COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
COPY pkg/grpc ./pkg/grpc
COPY pkg/stablediffusion ./pkg/stablediffusion
RUN git init
RUN make sources/go-stable-diffusion
RUN touch prepare-sources

RUN make prepare

# stablediffusion does not tolerate a newer version of abseil, build it first
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
# Actually build the backend
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion

###################################
###################################
@@ -285,8 +291,20 @@ COPY --from=grpc /opt/grpc /usr/local
# Rebuild with defaults backends
WORKDIR /build

COPY . .
COPY .git .

RUN make prepare

## Build the binary
RUN make build
## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
## (both will use CUDA or hipblas for the actual computation)
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \
make build; \
fi

RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
@@ -320,9 +338,8 @@ RUN if [ "${FFMPEG}" = "true" ]; then \

RUN apt-get update && \
apt-get install -y --no-install-recommends \
ssh less && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
ssh less wget
# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.

RUN go install github.com/go-delve/delve/cmd/dlv@latest

@@ -400,9 +417,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/transformers-musicgen \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama \
; fi

RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
Makefile: 26 changes

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=2f3c1466ff46a2413b0e363a5005c46538186ee6
CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077

# go-rwkv version
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
WHISPER_CPP_VERSION?=d65786ea540a5aef21f67cacfa6f134097727780
WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b

# bert.cpp version
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -338,7 +338,7 @@ rebuild: ## Rebuilds the project
$(MAKE) -C sources/go-tiny-dream clean
$(MAKE) build

prepare: prepare-sources gen-assets $(OPTIONAL_TARGETS)
prepare: prepare-sources $(OPTIONAL_TARGETS)

clean: ## Remove build related file
$(GOCMD) clean -cache
@@ -359,6 +359,9 @@ clean-tests:
rm -rf test-dir
rm -rf core/http/backend-assets

clean-dc: clean
cp -r /build/backend-assets /workspace/backend-assets

## Build:
build: prepare backend-assets grpcs ## Build the project
$(info ${GREEN}I local-ai build info:${RESET})
@@ -465,7 +468,7 @@ run-e2e-image:
ls -liah $(abspath ./tests/e2e-fixtures)
docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests

run-e2e-aio:
run-e2e-aio: protogen-go
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio

@@ -534,10 +537,10 @@ protogen-go-clean:
$(RM) bin/*

.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen

.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean

.PHONY: autogptq-protogen
autogptq-protogen:
@@ -571,14 +574,6 @@ diffusers-protogen:
diffusers-protogen-clean:
$(MAKE) -C backend/python/diffusers protogen-clean

.PHONY: exllama-protogen
exllama-protogen:
$(MAKE) -C backend/python/exllama protogen

.PHONY: exllama-protogen-clean
exllama-protogen-clean:
$(MAKE) -C backend/python/exllama protogen-clean

.PHONY: exllama2-protogen
exllama2-protogen:
$(MAKE) -C backend/python/exllama2 protogen
@@ -675,7 +670,6 @@ prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/parler-tts
$(MAKE) -C backend/python/vall-e-x
$(MAKE) -C backend/python/openvoice
$(MAKE) -C backend/python/exllama
$(MAKE) -C backend/python/exllama2

prepare-test-extra: protogen-python
@@ -846,7 +840,7 @@ endif

backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/whisper
endif
README.md: 13 changes

@@ -40,7 +40,7 @@
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/)
[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)

@@ -68,10 +68,9 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

[💻 Getting started](https://localai.io/basics/getting_started/index.html)

## 🔥🔥 Hot topics / Roadmap

[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
## 📰 Latest project news

- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
- June 2024: 🆕 You can browse now the model gallery without LocalAI! Check out https://models.localai.io
- June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
@@ -82,8 +81,12 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121

Hot topics (looking for contributors):
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

## 🔥🔥 Hot topics (looking for help):

- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096
f16: true
mmap: true
name: gpt-4-vision-preview
name: gpt-4o

roles:
user: "USER:"

@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096
f16: true
mmap: true
name: gpt-4-vision-preview
name: gpt-4o

roles:
user: "USER:"

@@ -1,6 +1,6 @@
name: stablediffusion
parameters:
model: runwayml/stable-diffusion-v1-5
model: Lykon/dreamshaper-8
backend: diffusers
step: 25
f16: true

@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096
mmap: false
f16: false
name: gpt-4-vision-preview
name: gpt-4o

roles:
user: "USER:"
@@ -16,6 +16,7 @@ service Backend {
rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc TTS(TTSRequest) returns (Result) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
rpc Status(HealthMessage) returns (StatusResponse) {}

@@ -25,6 +26,19 @@ service Backend {
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}

rpc Rerank(RerankRequest) returns (RerankResult) {}

rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
}

// Define the empty request
message MetricsRequest {}

message MetricsResponse {
int32 slot_id = 1;
string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
float tokens_per_second = 3;
int32 tokens_generated = 4;
int32 prompt_tokens_processed = 5;
}
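The new `GetMetrics` RPC and `MetricsResponse` message expose per-slot generation statistics. A rough sketch of a client call, assuming the proto has been compiled with `grpcio-tools` (the `backend_pb2`/`backend_pb2_grpc` module names and the server address are placeholders, not part of the diff):

```python
import grpc
import backend_pb2          # assumed name of the module generated from backend.proto
import backend_pb2_grpc

def print_metrics(address="localhost:50051"):
    with grpc.insecure_channel(address) as channel:
        stub = backend_pb2_grpc.BackendStub(channel)
        # GetMetrics takes the empty MetricsRequest and reports on the active slot
        m = stub.GetMetrics(backend_pb2.MetricsRequest())
        print(m.slot_id, m.tokens_per_second, m.tokens_generated, m.prompt_tokens_processed)
```

When no slot is currently processing, the llama.cpp backend fills the response with zeros (see the `GetMetrics` handler later in this change set).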

message RerankRequest {
@@ -133,6 +147,9 @@ message PredictOptions {
repeated string Images = 42;
bool UseTokenizerTemplate = 43;
repeated Message Messages = 44;
repeated string Videos = 45;
repeated string Audios = 46;
string CorrelationId = 47;
}

// The response message containing the result
@@ -270,6 +287,17 @@ message TTSRequest {
optional string language = 5;
}

message SoundGenerationRequest {
string text = 1;
string model = 2;
string dst = 3;
optional float duration = 4;
optional float temperature = 5;
optional bool sample = 6;
optional string src = 7;
optional int32 src_divisor = 8;
}

message TokenizationResponse {
int32 length = 1;
repeated int32 tokens = 2;
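`PredictOptions` also gains `Videos`, `Audios`, and `CorrelationId` fields, and `SoundGenerationRequest` backs the new sound-generation RPC. A hedged sketch of building these messages; the field values are illustrative placeholders and the module name is assumed as above:

```python
import uuid
import backend_pb2  # assumed generated module name

opts = backend_pb2.PredictOptions(
    UseTokenizerTemplate=True,
    Images=[],                        # base64-encoded images, as before
    Videos=[],                        # new in this change set
    Audios=[],                        # new in this change set
    CorrelationId=str(uuid.uuid4()),  # logged per request by the llama.cpp backend
)

sound = backend_pb2.SoundGenerationRequest(
    text="a short percussion loop",   # placeholder prompt
    model="musicgen",                 # placeholder model name
    dst="/tmp/out.wav",
    duration=5.0,
)
```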
|
||||
|
||||
@@ -13,15 +13,15 @@
|
||||
#include <getopt.h>
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "log.h"
|
||||
#include "stb_image.h"
|
||||
#include "common.h"
|
||||
#include "json.hpp"
|
||||
#include "llama.h"
|
||||
#include "grammar-parser.h"
|
||||
#include "backend.pb.h"
|
||||
#include "backend.grpc.pb.h"
|
||||
#include "utils.hpp"
|
||||
|
||||
#include "sampling.h"
|
||||
// include std::regex
|
||||
#include <cstddef>
|
||||
#include <thread>
|
||||
@@ -203,8 +203,8 @@ struct llama_client_slot
|
||||
std::string stopping_word;
|
||||
|
||||
// sampling
|
||||
struct llama_sampling_params sparams;
|
||||
llama_sampling_context *ctx_sampling = nullptr;
|
||||
struct gpt_sampler_params sparams;
|
||||
gpt_sampler *ctx_sampling = nullptr;
|
||||
|
||||
int32_t ga_i = 0; // group-attention state
|
||||
int32_t ga_n = 1; // group-attention factor
|
||||
@@ -449,7 +449,7 @@ struct llama_server_context
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
|
||||
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -463,7 +463,7 @@ struct llama_server_context
|
||||
ctx = llama_init.context;
|
||||
if (model == nullptr)
|
||||
{
|
||||
LOG_ERROR("unable to load model", {{"model", params.model}});
|
||||
LOG_ERR("unable to load model: %s", params.model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -471,7 +471,7 @@ struct llama_server_context
|
||||
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
|
||||
const int n_embd_llm = llama_n_embd(model);
|
||||
if (n_embd_clip != n_embd_llm) {
|
||||
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
|
||||
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
return false;
|
||||
@@ -490,11 +490,21 @@ struct llama_server_context
|
||||
std::vector<char> buf(1);
|
||||
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
|
||||
if (res < 0) {
|
||||
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
||||
LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
|
||||
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
|
||||
}
|
||||
}
|
||||
|
||||
llama_client_slot* get_active_slot() {
|
||||
for (llama_client_slot& slot : slots) {
|
||||
// Check if the slot is currently processing
|
||||
if (slot.is_processing()) {
|
||||
return &slot; // Return the active slot
|
||||
}
|
||||
}
|
||||
return nullptr; // No active slot found
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
// create slots
|
||||
all_slots_are_idle = true;
|
||||
@@ -619,7 +629,7 @@ struct llama_server_context
|
||||
|
||||
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
|
||||
slot_params default_params;
|
||||
llama_sampling_params default_sparams;
|
||||
gpt_sampler_params default_sparams;
|
||||
|
||||
slot->params.stream = json_value(data, "stream", false);
|
||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
@@ -628,7 +638,7 @@ struct llama_server_context
|
||||
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
||||
slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
|
||||
@@ -641,7 +651,7 @@ struct llama_server_context
|
||||
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
||||
slot->params.seed = json_value(data, "seed", default_params.seed);
|
||||
slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
|
||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||
@@ -665,6 +675,7 @@ struct llama_server_context
|
||||
slot->params.input_prefix = "";
|
||||
}
|
||||
|
||||
|
||||
if (data.count("input_suffix") != 0)
|
||||
{
|
||||
slot->params.input_suffix = data["input_suffix"];
|
||||
@@ -683,6 +694,10 @@ struct llama_server_context
|
||||
slot->prompt = "";
|
||||
}
|
||||
|
||||
if (json_value(data, "ignore_eos", false)) {
|
||||
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
|
||||
}
|
||||
/*
|
||||
slot->sparams.penalty_prompt_tokens.clear();
|
||||
slot->sparams.use_penalty_prompt_tokens = false;
|
||||
const auto &penalty_prompt = data.find("penalty_prompt");
|
||||
@@ -718,14 +733,10 @@ struct llama_server_context
|
||||
slot->sparams.use_penalty_prompt_tokens = true;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
slot->sparams.logit_bias.clear();
|
||||
|
||||
if (json_value(data, "ignore_eos", false))
|
||||
{
|
||||
slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
}
|
||||
|
||||
const auto &logit_bias = data.find("logit_bias");
|
||||
if (logit_bias != data.end() && logit_bias->is_array())
|
||||
{
|
||||
@@ -753,7 +764,7 @@ struct llama_server_context
|
||||
llama_token tok = el[0].get<llama_token>();
|
||||
if (tok >= 0 && tok < n_vocab)
|
||||
{
|
||||
slot->sparams.logit_bias[tok] = bias;
|
||||
slot->sparams.logit_bias.push_back({tok, bias});
|
||||
}
|
||||
}
|
||||
else if (el[0].is_string())
|
||||
@@ -761,13 +772,13 @@ struct llama_server_context
|
||||
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||
for (auto tok : toks)
|
||||
{
|
||||
slot->sparams.logit_bias[tok] = bias;
|
||||
slot->sparams.logit_bias.push_back({tok, bias});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
slot->params.antiprompt.clear();
|
||||
|
||||
const auto &stop = data.find("stop");
|
||||
@@ -781,24 +792,22 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const auto &samplers_sequence = data.find("samplers");
|
||||
if (samplers_sequence != data.end() && samplers_sequence->is_array())
|
||||
{
|
||||
|
||||
const auto & samplers = data.find("samplers");
|
||||
if (samplers != data.end() && samplers->is_array()) {
|
||||
std::vector<std::string> sampler_names;
|
||||
for (const auto &sampler_name : *samplers_sequence)
|
||||
{
|
||||
if (sampler_name.is_string())
|
||||
{
|
||||
sampler_names.emplace_back(sampler_name);
|
||||
for (const auto & name : *samplers) {
|
||||
if (name.is_string()) {
|
||||
sampler_names.emplace_back(name);
|
||||
}
|
||||
}
|
||||
}
|
||||
slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
|
||||
slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
|
||||
slot->sparams.samplers = default_sparams.samplers;
|
||||
}
|
||||
|
||||
|
||||
if (multimodal)
|
||||
{
|
||||
@@ -814,10 +823,11 @@ struct llama_server_context
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_ERROR("failed to load image", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
|
||||
__func__,
|
||||
slot->id,
|
||||
img_sl.id
|
||||
);
|
||||
return false;
|
||||
}
|
||||
LOG_VERBOSE("image loaded", {
|
||||
@@ -855,12 +865,12 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
|
||||
LOG("ERROR: Image with id: %i, not found.\n", img_id);
|
||||
slot->images.clear();
|
||||
return false;
|
||||
}
|
||||
} catch (const std::invalid_argument& e) {
|
||||
LOG_TEE("Invalid image number id in prompt\n");
|
||||
LOG("Invalid image number id in prompt\n");
|
||||
slot->images.clear();
|
||||
return false;
|
||||
}
|
||||
@@ -875,10 +885,10 @@ struct llama_server_context
|
||||
|
||||
if (slot->ctx_sampling != nullptr)
|
||||
{
|
||||
llama_sampling_free(slot->ctx_sampling);
|
||||
gpt_sampler_free(slot->ctx_sampling);
|
||||
}
|
||||
slot->ctx_sampling = llama_sampling_init(slot->sparams);
|
||||
llama_set_rng_seed(ctx, slot->params.seed);
|
||||
slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
|
||||
//llama_set_rng_seed(ctx, slot->params.seed);
|
||||
slot->command = LOAD_PROMPT;
|
||||
|
||||
all_slots_are_idle = false;
|
||||
@@ -888,7 +898,7 @@ struct llama_server_context
|
||||
{"task_id", slot->task_id},
|
||||
});
|
||||
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
|
||||
// LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -928,7 +938,7 @@ struct llama_server_context
|
||||
};
|
||||
if (llama_decode(ctx, batch_view) != 0)
|
||||
{
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -940,7 +950,7 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("system prompt updated\n");
|
||||
LOG("system prompt updated\n");
|
||||
system_need_update = false;
|
||||
}
|
||||
|
||||
@@ -1006,11 +1016,13 @@ struct llama_server_context
|
||||
slot.generated_text += token_str;
|
||||
slot.has_next_token = true;
|
||||
|
||||
/*
|
||||
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
|
||||
{
|
||||
// we can change penalty_prompt_tokens because it is always created from scratch each request
|
||||
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
|
||||
}
|
||||
*/
|
||||
|
||||
// check if there is incomplete UTF-8 character at the end
|
||||
bool incomplete = false;
|
||||
@@ -1119,8 +1131,8 @@ struct llama_server_context
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||
LOG_TEE("Error processing the given image");
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||
LOG("Error processing the given image");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1132,7 +1144,7 @@ struct llama_server_context
|
||||
|
||||
void send_error(task_server& task, const std::string &error)
|
||||
{
|
||||
LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
|
||||
LOG("task %i - error: %s\n", task.id, error.c_str());
|
||||
task_result res;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
@@ -1144,13 +1156,11 @@ struct llama_server_context
|
||||
|
||||
json get_formated_generation(llama_client_slot &slot)
|
||||
{
|
||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
||||
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
|
||||
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||
std::vector<std::string> samplers_sequence;
|
||||
for (const auto &sampler_type : slot.sparams.samplers_sequence)
|
||||
std::vector<std::string> samplers;
|
||||
samplers.reserve(slot.sparams.samplers.size());
|
||||
for (const auto & sampler : slot.sparams.samplers)
|
||||
{
|
||||
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
|
||||
samplers.emplace_back(gpt_sampler_type_to_str(sampler));
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -1165,13 +1175,11 @@ struct llama_server_context
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
{"tfs_z", slot.sparams.tfs_z},
|
||||
{"typical_p", slot.sparams.typical_p},
|
||||
{"typical_p", slot.sparams.typ_p},
|
||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||
{"repeat_penalty", slot.sparams.penalty_repeat},
|
||||
{"presence_penalty", slot.sparams.penalty_present},
|
||||
{"frequency_penalty", slot.sparams.penalty_freq},
|
||||
{"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
|
||||
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
|
||||
{"mirostat", slot.sparams.mirostat},
|
||||
{"mirostat_tau", slot.sparams.mirostat_tau},
|
||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||
@@ -1179,13 +1187,13 @@ struct llama_server_context
|
||||
{"stop", slot.params.antiprompt},
|
||||
{"n_predict", slot.params.n_predict},
|
||||
{"n_keep", params.n_keep},
|
||||
{"ignore_eos", ignore_eos},
|
||||
{"ignore_eos", slot.sparams.ignore_eos},
|
||||
{"stream", slot.params.stream},
|
||||
{"logit_bias", slot.sparams.logit_bias},
|
||||
// {"logit_bias", slot.sparams.logit_bias},
|
||||
{"n_probs", slot.sparams.n_probs},
|
||||
{"min_keep", slot.sparams.min_keep},
|
||||
{"grammar", slot.sparams.grammar},
|
||||
{"samplers", samplers_sequence}
|
||||
{"samplers", samplers}
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1375,7 +1383,7 @@ struct llama_server_context
|
||||
};
|
||||
if (llama_decode(ctx, batch_view))
|
||||
{
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
LOG("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -1393,7 +1401,7 @@ struct llama_server_context
|
||||
llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch_img))
|
||||
{
|
||||
LOG_TEE("%s : failed to eval image\n", __func__);
|
||||
LOG("%s : failed to eval image\n", __func__);
|
||||
return false;
|
||||
}
|
||||
slot.n_past += n_eval;
|
||||
@@ -1576,7 +1584,7 @@ struct llama_server_context
|
||||
slot.n_past = 0;
|
||||
slot.truncated = false;
|
||||
slot.has_next_token = true;
|
||||
LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
|
||||
continue;
|
||||
// END LOCALAI changes
|
||||
@@ -1714,7 +1722,7 @@ struct llama_server_context
|
||||
|
||||
if (!slot.params.cache_prompt)
|
||||
{
|
||||
llama_sampling_reset(slot.ctx_sampling);
|
||||
gpt_sampler_reset(slot.ctx_sampling);
|
||||
|
||||
slot.n_past = 0;
|
||||
slot.n_past_se = 0;
|
||||
@@ -1726,7 +1734,7 @@ struct llama_server_context
|
||||
// push the prompt into the sampling context (do not apply grammar)
|
||||
for (auto &token : prompt_tokens)
|
||||
{
|
||||
llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
|
||||
gpt_sampler_accept(slot.ctx_sampling, token, false);
|
||||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
@@ -1824,10 +1832,11 @@ struct llama_server_context
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
{
|
||||
LOG_ERROR("failed processing images", {
|
||||
"slot_id", slot.id,
|
||||
"task_id", slot.task_id,
|
||||
});
|
||||
LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d",
|
||||
__func__,
|
||||
slot.id,
|
||||
slot.task_id
|
||||
);
|
||||
// FIXME @phymbert: to be properly tested
|
||||
// early returning without changing the slot state will block the slot for ever
|
||||
// no one at the moment is checking the return value
|
||||
@@ -1867,10 +1876,10 @@ struct llama_server_context
|
||||
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
|
||||
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
|
||||
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
LOG("\n");
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
|
||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
||||
|
||||
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
|
||||
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1880,7 +1889,7 @@ struct llama_server_context

slot.ga_i += slot.ga_w / slot.ga_n;

LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
}
slot.n_past_se += n_tokens;
}
@@ -1905,11 +1914,11 @@ struct llama_server_context
if (n_batch == 1 || ret < 0)
{
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return false;
}

LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
@@ -1934,9 +1943,9 @@ struct llama_server_context
}

completion_token_output result;
const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
gpt_sampler_accept(slot.ctx_sampling, id, true);

slot.n_decoded += 1;
if (slot.n_decoded == 1)
@@ -1946,19 +1955,14 @@ struct llama_server_context
metrics.on_prompt_eval(slot);
}

llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
result.tok = id;
const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);

const int32_t n_probs = slot.sparams.n_probs;
if (slot.sparams.temp <= 0 && n_probs > 0)
{
// for llama_sample_token_greedy we need to sort candidates
llama_sample_softmax(ctx, &cur_p);
}

for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
{
result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
result.probs.push_back({
cur_p->data[i].id,
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
});
}

if (!process_token(result, slot))
@@ -2112,6 +2116,9 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
data["ignore_eos"] = predict->ignoreeos();
data["embeddings"] = predict->embeddings();

// Add the correlationid to json data
data["correlation_id"] = predict->correlationid();

// for each image in the request, add the image data
//
for (int i = 0; i < predict->images_size(); i++) {
@@ -2210,7 +2217,7 @@ static void params_parse(const backend::ModelOptions* request,
params.model_alias = request->modelfile();
params.n_ctx = request->contextsize();
//params.memory_f16 = request->f16memory();
params.n_threads = request->threads();
params.cpuparams.n_threads = request->threads();
params.n_gpu_layers = request->ngpulayers();
params.n_batch = request->nbatch();
// Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
@@ -2350,6 +2357,11 @@ public:
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
reply.set_prompt_tokens(tokens_evaluated);

// Log Request Correlation Id
LOG_VERBOSE("correlation:", {
{ "id", data["correlation_id"] }
});

// Send the reply
writer->Write(reply);

@@ -2373,6 +2385,12 @@ public:
std::string completion_text;
task_result result = llama.queue_results.recv(task_id);
if (!result.error && result.stop) {

// Log Request Correlation Id
LOG_VERBOSE("correlation:", {
{ "id", data["correlation_id"] }
});

completion_text = result.result_json.value("content", "");
int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
@@ -2412,6 +2430,31 @@ public:

return grpc::Status::OK;
}

grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
llama_client_slot* active_slot = llama.get_active_slot();

if (active_slot != nullptr) {
// Calculate the tokens per second using existing logic
double tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;

// Populate the response with metrics
response->set_slot_id(active_slot->id);
response->set_prompt_json_for_slot(active_slot->prompt.dump());
response->set_tokens_per_second(tokens_per_second);
response->set_tokens_generated(active_slot->n_decoded);
response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);
} else {
// Handle case when no active slot exists
response->set_slot_id(0);
response->set_prompt_json_for_slot("");
response->set_tokens_per_second(0);
response->set_tokens_generated(0);
response->set_prompt_tokens_processed(0);
}

return grpc::Status::OK;
}
};

void RunServer(const std::string& server_address) {
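For reference, the tokens-per-second figure reported by the new GetMetrics endpoint above reduces to a simple rate. A minimal Go restatement, assuming (as in the upstream llama.cpp server) that t_token_generation holds the total generation time in milliseconds:

```go
package metrics

// tokensPerSecond mirrors the metric computed in GetMetrics above:
// 1e3 / t_token_generation * n_decoded. The millisecond unit for
// tokenGenerationMs is an assumption taken from the upstream server code.
func tokensPerSecond(nDecoded int, tokenGenerationMs float64) float64 {
	if tokenGenerationMs == 0 {
		return 0 // avoid division by zero for an idle slot
	}
	return float64(nDecoded) / (tokenGenerationMs / 1000.0)
}
```

So, for example, a slot that decoded 120 tokens in 3000 ms reports 40 tokens/s.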
backend/cpp/llama/patches/01-llava.patch (new file, 13 lines)
@@ -0,0 +1,13 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 342042ff..224db9b5 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
@@ -1,5 +1,12 @@
#!/bin/bash

## Patches
## Apply patches from the `patches` directory
for patch in $(ls patches); do
echo "Applying patch $patch"
patch -d llama.cpp/ -p1 < patches/$patch
done

cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv json.hpp llama.cpp/examples/grpc-server/
@@ -480,31 +480,4 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
//
|
||||
// random string / id
|
||||
//
|
||||
|
||||
static std::string random_string()
|
||||
{
|
||||
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
|
||||
std::string result(32, ' ');
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
result[i] = str[generator() % str.size()];
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string gen_chatcmplid()
|
||||
{
|
||||
std::stringstream chatcmplid;
|
||||
chatcmplid << "chatcmpl-" << random_string();
|
||||
return chatcmplid.str();
|
||||
}
|
||||
@@ -1,104 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
"github.com/go-audio/wav"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
)
|
||||
|
||||
func ffmpegCommand(args []string) (string, error) {
|
||||
cmd := exec.Command("ffmpeg", args...) // Constrain this to ffmpeg to permit security scanner to see that the command is safe.
|
||||
cmd.Env = os.Environ()
|
||||
out, err := cmd.CombinedOutput()
|
||||
return string(out), err
|
||||
}
|
||||
|
||||
// AudioToWav converts audio to wav for transcribe.
|
||||
// TODO: use https://github.com/mccoyst/ogg?
|
||||
func audioToWav(src, dst string) error {
|
||||
commandArgs := []string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
|
||||
out, err := ffmpegCommand(commandArgs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error: %w out: %s", err, out)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func Transcript(model whisper.Model, audiopath, language string, translate bool, threads uint) (schema.TranscriptionResult, error) {
|
||||
res := schema.TranscriptionResult{}
|
||||
|
||||
dir, err := os.MkdirTemp("", "whisper")
|
||||
if err != nil {
|
||||
return res, err
|
||||
}
|
||||
defer os.RemoveAll(dir)
|
||||
|
||||
convertedPath := filepath.Join(dir, "converted.wav")
|
||||
|
||||
if err := audioToWav(audiopath, convertedPath); err != nil {
|
||||
return res, err
|
||||
}
|
||||
|
||||
// Open samples
|
||||
fh, err := os.Open(convertedPath)
|
||||
if err != nil {
|
||||
return res, err
|
||||
}
|
||||
defer fh.Close()
|
||||
|
||||
// Read samples
|
||||
d := wav.NewDecoder(fh)
|
||||
buf, err := d.FullPCMBuffer()
|
||||
if err != nil {
|
||||
return res, err
|
||||
}
|
||||
|
||||
data := buf.AsFloat32Buffer().Data
|
||||
|
||||
// Process samples
|
||||
context, err := model.NewContext()
|
||||
if err != nil {
|
||||
return res, err
|
||||
|
||||
}
|
||||
|
||||
context.SetThreads(threads)
|
||||
|
||||
if language != "" {
|
||||
context.SetLanguage(language)
|
||||
} else {
|
||||
context.SetLanguage("auto")
|
||||
}
|
||||
|
||||
if translate {
|
||||
context.SetTranslate(true)
|
||||
}
|
||||
|
||||
if err := context.Process(data, nil, nil); err != nil {
|
||||
return res, err
|
||||
}
|
||||
|
||||
for {
|
||||
s, err := context.NextSegment()
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
var tokens []int
|
||||
for _, t := range s.Tokens {
|
||||
tokens = append(tokens, t.Id)
|
||||
}
|
||||
|
||||
segment := schema.Segment{Id: s.Num, Text: s.Text, Start: s.Start, End: s.End, Tokens: tokens}
|
||||
res.Segments = append(res.Segments, segment)
|
||||
|
||||
res.Text += s.Text
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
@@ -1,26 +0,0 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to satisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
type Whisper struct {
|
||||
base.SingleThread
|
||||
whisper whisper.Model
|
||||
}
|
||||
|
||||
func (sd *Whisper) Load(opts *pb.ModelOptions) error {
|
||||
// Note: the Model here is a path to a directory containing the model files
|
||||
w, err := whisper.New(opts.ModelFile)
|
||||
sd.whisper = w
|
||||
return err
|
||||
}
|
||||
|
||||
func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
|
||||
return Transcript(sd.whisper, opts.Dst, opts.Language, opts.Translate, uint(opts.Threads))
|
||||
}
|
||||
backend/go/transcribe/whisper/whisper.go (new file, 105 lines)
@@ -0,0 +1,105 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to satisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
"github.com/go-audio/wav"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
)
|
||||
|
||||
type Whisper struct {
|
||||
base.SingleThread
|
||||
whisper whisper.Model
|
||||
}
|
||||
|
||||
func (sd *Whisper) Load(opts *pb.ModelOptions) error {
|
||||
// Note: the Model here is a path to a directory containing the model files
|
||||
w, err := whisper.New(opts.ModelFile)
|
||||
sd.whisper = w
|
||||
return err
|
||||
}
|
||||
|
||||
func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
|
||||
|
||||
dir, err := os.MkdirTemp("", "whisper")
|
||||
if err != nil {
|
||||
return pb.TranscriptResult{}, err
|
||||
}
|
||||
defer os.RemoveAll(dir)
|
||||
|
||||
convertedPath := filepath.Join(dir, "converted.wav")
|
||||
|
||||
if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil {
|
||||
return pb.TranscriptResult{}, err
|
||||
}
|
||||
|
||||
// Open samples
|
||||
fh, err := os.Open(convertedPath)
|
||||
if err != nil {
|
||||
return pb.TranscriptResult{}, err
|
||||
}
|
||||
defer fh.Close()
|
||||
|
||||
// Read samples
|
||||
d := wav.NewDecoder(fh)
|
||||
buf, err := d.FullPCMBuffer()
|
||||
if err != nil {
|
||||
return pb.TranscriptResult{}, err
|
||||
}
|
||||
|
||||
data := buf.AsFloat32Buffer().Data
|
||||
|
||||
// Process samples
|
||||
context, err := sd.whisper.NewContext()
|
||||
if err != nil {
|
||||
return pb.TranscriptResult{}, err
|
||||
|
||||
}
|
||||
|
||||
context.SetThreads(uint(opts.Threads))
|
||||
|
||||
if opts.Language != "" {
|
||||
context.SetLanguage(opts.Language)
|
||||
} else {
|
||||
context.SetLanguage("auto")
|
||||
}
|
||||
|
||||
if opts.Translate {
|
||||
context.SetTranslate(true)
|
||||
}
|
||||
|
||||
if err := context.Process(data, nil, nil); err != nil {
|
||||
return pb.TranscriptResult{}, err
|
||||
}
|
||||
|
||||
segments := []*pb.TranscriptSegment{}
|
||||
text := ""
|
||||
for {
|
||||
s, err := context.NextSegment()
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
var tokens []int32
|
||||
for _, t := range s.Tokens {
|
||||
tokens = append(tokens, int32(t.Id))
|
||||
}
|
||||
|
||||
segment := &pb.TranscriptSegment{Id: int32(s.Num), Text: s.Text, Start: int64(s.Start), End: int64(s.End), Tokens: tokens}
|
||||
segments = append(segments, segment)
|
||||
|
||||
text += s.Text
|
||||
}
|
||||
|
||||
return pb.TranscriptResult{
|
||||
Segments: segments,
|
||||
Text: text,
|
||||
}, nil
|
||||
|
||||
}
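To illustrate how the rewritten wrapper above is meant to be driven, here is a hedged sketch that loads a model and requests a transcription directly. In LocalAI these calls normally arrive through the gRPC service rather than direct Go calls, and the model and audio paths below are made-up example values; the types and fields are the ones used in the diff.

```go
package main

import (
	"fmt"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)

func exampleTranscribe() error {
	w := &Whisper{}
	// ModelFile points at the whisper.cpp model weights (example path).
	if err := w.Load(&pb.ModelOptions{ModelFile: "/models/ggml-base.en.bin"}); err != nil {
		return err
	}
	// Dst carries the input audio path, matching how the wrapper reads opts.Dst.
	res, err := w.AudioTranscription(&pb.TranscriptRequest{
		Dst:      "/tmp/audio.ogg",
		Language: "en",
		Threads:  4,
	})
	if err != nil {
		return err
	}
	fmt.Println(res.Text, len(res.Segments))
	return nil
}
```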
|
||||
@@ -2,4 +2,4 @@
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,6 +1,6 @@
|
||||
accelerate
|
||||
auto-gptq==0.7.1
|
||||
grpcio==1.65.4
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
transformers
|
||||
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
bark==0.1.5
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -1,2 +1,2 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
TTS==0.22.0
|
||||
grpcio==1.65.5
|
||||
coqui-tts
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -168,7 +168,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if request.CFGScale != 0:
|
||||
self.cfg_scale = request.CFGScale
|
||||
|
||||
clipmodel = "runwayml/stable-diffusion-v1-5"
|
||||
clipmodel = "Lykon/dreamshaper-8"
|
||||
if request.CLIPModel != "":
|
||||
clipmodel = request.CLIPModel
|
||||
clipsubfolder = "text_encoder"
|
||||
|
||||
@@ -3,7 +3,7 @@ intel-extension-for-pytorch
|
||||
torch
|
||||
torchvision
|
||||
optimum[openvino]
|
||||
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
diffusers
|
||||
opencv-python
|
||||
transformers
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
setuptools
|
||||
grpcio==1.65.4
|
||||
grpcio==1.66.2
|
||||
pillow
|
||||
protobuf
|
||||
certifi
|
||||
|
||||
@@ -53,7 +53,7 @@ class TestBackendServicer(unittest.TestCase):
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
|
||||
self.assertTrue(response.success)
|
||||
self.assertEqual(response.message, "Model loaded successfully")
|
||||
except Exception as err:
|
||||
@@ -71,7 +71,7 @@ class TestBackendServicer(unittest.TestCase):
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
|
||||
print(response.message)
|
||||
self.assertTrue(response.success)
|
||||
image_req = backend_pb2.GenerateImageRequest(positive_prompt="cat", width=16,height=16, dst="test.jpg")
|
||||
@@ -81,4 +81,4 @@ class TestBackendServicer(unittest.TestCase):
|
||||
print(err)
|
||||
self.fail("Image gen service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
self.tearDown()
|
||||
|
||||
backend/python/exllama/.gitignore (vendored, 1 line)
@@ -1 +0,0 @@
|
||||
source
|
||||
@@ -1,25 +0,0 @@
|
||||
export CONDA_ENV_PATH = "exllama.yml"
|
||||
|
||||
.PHONY: exllama
|
||||
exllama: protogen
|
||||
bash install.sh ${CONDA_ENV_PATH}
|
||||
|
||||
.PHONY: run
|
||||
run: protogen
|
||||
@echo "Running exllama..."
|
||||
bash run.sh
|
||||
@echo "exllama run."
|
||||
|
||||
.PHONY: protogen
|
||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
.PHONY: protogen-clean
|
||||
protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
backend_pb2_grpc.py backend_pb2.py:
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
$(RM) -r venv source __pycache__
|
||||
@@ -1,5 +0,0 @@
|
||||
# Creating a separate environment for the exllama project
|
||||
|
||||
```
|
||||
make exllama
|
||||
```
|
||||
@@ -1,159 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import grpc
|
||||
from concurrent import futures
|
||||
import time
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os, glob
|
||||
|
||||
from pathlib import Path
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import version as torch_version
|
||||
|
||||
from source.tokenizer import ExLlamaTokenizer
|
||||
from source.generator import ExLlamaGenerator
|
||||
from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
|
||||
# Implement the BackendServicer class with the service methods
|
||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def generate(self,prompt, max_new_tokens):
|
||||
self.generator.end_beam_search()
|
||||
|
||||
# Tokenizing the input
|
||||
ids = self.generator.tokenizer.encode(prompt)
|
||||
|
||||
self.generator.gen_begin_reuse(ids)
|
||||
initial_len = self.generator.sequence[0].shape[0]
|
||||
has_leading_space = False
|
||||
decoded_text = ''
|
||||
for i in range(max_new_tokens):
|
||||
token = self.generator.gen_single_token()
|
||||
if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
|
||||
has_leading_space = True
|
||||
|
||||
decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
|
||||
if has_leading_space:
|
||||
decoded_text = ' ' + decoded_text
|
||||
|
||||
if token.item() == self.generator.tokenizer.eos_token_id:
|
||||
break
|
||||
return decoded_text
|
||||
def Health(self, request, context):
|
||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
||||
def LoadModel(self, request, context):
|
||||
try:
|
||||
# https://github.com/turboderp/exllama/blob/master/example_cfg.py
|
||||
model_directory = request.ModelFile
|
||||
|
||||
# Locate files we need within that directory
|
||||
tokenizer_path = os.path.join(model_directory, "tokenizer.model")
|
||||
model_config_path = os.path.join(model_directory, "config.json")
|
||||
st_pattern = os.path.join(model_directory, "*.safetensors")
|
||||
model_path = glob.glob(st_pattern)[0]
|
||||
|
||||
# Create config, model, tokenizer and generator
|
||||
|
||||
config = ExLlamaConfig(model_config_path) # create config from config.json
|
||||
config.model_path = model_path # supply path to model weights file
|
||||
if (request.ContextSize):
|
||||
config.max_seq_len = request.ContextSize # override max sequence length
|
||||
config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2.
|
||||
# https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
|
||||
|
||||
# Set Rope scaling.
|
||||
if (request.RopeFreqScale):
|
||||
# Alpha value for Rope scaling.
|
||||
# Higher value increases context but adds perplexity.
|
||||
# alpha_value and compress_pos_emb are mutually exclusive.
|
||||
# https://github.com/turboderp/exllama/issues/115
|
||||
config.alpha_value = request.RopeFreqScale
|
||||
config.calculate_rotary_embedding_base()
|
||||
|
||||
model = ExLlama(config) # create ExLlama instance and load the weights
|
||||
tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file
|
||||
|
||||
cache = ExLlamaCache(model, batch_size = 2) # create cache for inference
|
||||
generator = ExLlamaGenerator(model, tokenizer, cache) # create generator
|
||||
|
||||
self.generator= generator
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
self.cache = cache
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def Predict(self, request, context):
|
||||
penalty = 1.15
|
||||
if request.Penalty != 0.0:
|
||||
penalty = request.Penalty
|
||||
self.generator.settings.token_repetition_penalty_max = penalty
|
||||
self.generator.settings.temperature = request.Temperature
|
||||
self.generator.settings.top_k = request.TopK
|
||||
self.generator.settings.top_p = request.TopP
|
||||
|
||||
tokens = 512
|
||||
if request.Tokens != 0:
|
||||
tokens = request.Tokens
|
||||
|
||||
if self.cache.batch_size == 1:
|
||||
del self.cache
|
||||
self.cache = ExLlamaCache(self.model, batch_size=2)
|
||||
self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
|
||||
|
||||
t = self.generate(request.Prompt, tokens)
|
||||
|
||||
# Remove prompt from response if present
|
||||
if request.Prompt in t:
|
||||
t = t.replace(request.Prompt, "")
|
||||
|
||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
||||
|
||||
def PredictStream(self, request, context):
|
||||
# Implement PredictStream RPC
|
||||
#for reply in some_data_generator():
|
||||
# yield reply
|
||||
# Not implemented yet
|
||||
return self.Predict(request, context)
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
||||
|
||||
# Define the signal handler function
|
||||
def signal_handler(sig, frame):
|
||||
print("Received termination signal. Shutting down...")
|
||||
server.stop(0)
|
||||
sys.exit(0)
|
||||
|
||||
# Set the signal handlers for SIGINT and SIGTERM
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
try:
|
||||
while True:
|
||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
||||
except KeyboardInterrupt:
|
||||
server.stop(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
||||
parser.add_argument(
|
||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
serve(args.addr)
|
||||
@@ -1,13 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
LIMIT_TARGETS="cublas"
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
installRequirements
|
||||
|
||||
git clone https://github.com/turboderp/exllama $MY_DIR/source
|
||||
uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt
|
||||
|
||||
cp -v ./*py $MY_DIR/source/
|
||||
@@ -1,3 +0,0 @@
|
||||
transformers
|
||||
accelerate
|
||||
torch
|
||||
@@ -1,4 +0,0 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,3 +0,0 @@
|
||||
torch
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +0,0 @@
|
||||
grpcio==1.65.5
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
||||
@@ -1,7 +0,0 @@
|
||||
#!/bin/bash
|
||||
LIMIT_TARGETS="cublas"
|
||||
BACKEND_FILE="${MY_DIR}/source/backend.py"
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
startBackend $@
|
||||
@@ -1,6 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
runUnittests
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.65.4
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
wheel
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -2,7 +2,7 @@
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
optimum[openvino]
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
librosa==0.9.1
|
||||
faster-whisper==1.0.3
|
||||
@@ -18,6 +18,6 @@ python-dotenv
|
||||
pypinyin==0.50.0
|
||||
cn2an==0.5.22
|
||||
jieba==0.42.1
|
||||
gradio==4.38.1
|
||||
gradio==4.44.1
|
||||
langid==1.1.6
|
||||
git+https://github.com/myshell-ai/MeloTTS.git
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
librosa
|
||||
faster-whisper
|
||||
|
||||
@@ -15,5 +15,12 @@ installRequirements
|
||||
|
||||
# https://github.com/descriptinc/audiotools/issues/101
|
||||
# incompatible protobuf versions.
|
||||
PYDIR=$(ls ${MY_DIR}/venv/lib)
|
||||
curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py
|
||||
PYDIR=python3.10
|
||||
pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
|
||||
|
||||
if [ ! -d ${pyenv} ]; then
|
||||
echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch
|
||||
torchaudio
|
||||
torch==2.3.0+rocm6.0
|
||||
torchaudio==2.3.0+rocm6.0
|
||||
transformers
|
||||
accelerate
|
||||
accelerate
|
||||
|
||||
@@ -3,6 +3,6 @@ intel-extension-for-pytorch
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
llvmlite==0.43.0
|
||||
@@ -5,4 +5,4 @@ accelerate
|
||||
torch
|
||||
rerankers[transformers]
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.65.4
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -55,7 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
"""
|
||||
model_name = request.Model
|
||||
try:
|
||||
self.model = SentenceTransformer(model_name)
|
||||
self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
|
||||
|
||||
@@ -2,5 +2,5 @@ torch
|
||||
accelerate
|
||||
transformers
|
||||
bitsandbytes
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -1,5 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch
|
||||
accelerate
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -1,4 +1,4 @@
|
||||
torch
|
||||
accelerate
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -1,5 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch
|
||||
accelerate
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -4,5 +4,5 @@ torch
|
||||
optimum[openvino]
|
||||
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
|
||||
accelerate
|
||||
sentence-transformers==3.0.1
|
||||
sentence-transformers==3.1.1
|
||||
transformers
|
||||
@@ -1,3 +1,5 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
certifi
|
||||
datasets
|
||||
einops
|
||||
@@ -15,7 +15,7 @@ import backend_pb2_grpc
|
||||
|
||||
import grpc
|
||||
|
||||
from scipy.io.wavfile import write as write_wav
|
||||
from scipy.io import wavfile
|
||||
from transformers import AutoProcessor, MusicgenForConditionalGeneration
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
@@ -63,6 +63,61 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def SoundGeneration(self, request, context):
|
||||
model_name = request.model
|
||||
if model_name == "":
|
||||
return backend_pb2.Result(success=False, message="request.model is required")
|
||||
try:
|
||||
self.processor = AutoProcessor.from_pretrained(model_name)
|
||||
self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
|
||||
inputs = None
|
||||
if request.text == "":
|
||||
inputs = self.model.get_unconditional_inputs(num_samples=1)
|
||||
elif request.HasField('src'):
|
||||
# TODO SECURITY CODE GOES HERE LOL
|
||||
# WHO KNOWS IF THIS WORKS???
|
||||
sample_rate, wsamples = wavfile.read(request.src)
|
||||
|
||||
if request.HasField('src_divisor'):
|
||||
wsamples = wsamples[: len(wsamples) // request.src_divisor]
|
||||
|
||||
inputs = self.processor(
|
||||
audio=wsamples,
|
||||
sampling_rate=sample_rate,
|
||||
text=[request.text],
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
else:
|
||||
inputs = self.processor(
|
||||
text=[request.text],
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
tokens = 256
|
||||
if request.HasField('duration'):
|
||||
tokens = int(request.duration * 51.2) # 256 tokens = 5 seconds, therefore 51.2 tokens is one second
|
||||
guidance = 3.0
|
||||
if request.HasField('temperature'):
|
||||
guidance = request.temperature
|
||||
dosample = True
|
||||
if request.HasField('sample'):
|
||||
dosample = request.sample
|
||||
audio_values = self.model.generate(**inputs, do_sample=dosample, guidance_scale=guidance, max_new_tokens=tokens)
|
||||
print("[transformers-musicgen] SoundGeneration generated!", file=sys.stderr)
|
||||
sampling_rate = self.model.config.audio_encoder.sampling_rate
|
||||
wavfile.write(request.dst, rate=sampling_rate, data=audio_values[0, 0].numpy())
|
||||
print("[transformers-musicgen] SoundGeneration saved to", request.dst, file=sys.stderr)
|
||||
print("[transformers-musicgen] SoundGeneration for", file=sys.stderr)
|
||||
print("[transformers-musicgen] SoundGeneration requested tokens", tokens, file=sys.stderr)
|
||||
print(request, file=sys.stderr)
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
return backend_pb2.Result(success=True)
|
||||
|
||||
|
||||
# The TTS endpoint is older, and provides fewer features, but exists for compatibility reasons
|
||||
def TTS(self, request, context):
|
||||
model_name = request.model
|
||||
if model_name == "":
|
||||
@@ -75,8 +130,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
tokens = 256
|
||||
# TODO get tokens from request?
|
||||
tokens = 512 # No good place to set the "length" in TTS, so use 10s as a sane default
|
||||
audio_values = self.model.generate(**inputs, max_new_tokens=tokens)
|
||||
print("[transformers-musicgen] TTS generated!", file=sys.stderr)
|
||||
sampling_rate = self.model.config.audio_encoder.sampling_rate
|
||||
|
||||
@@ -4,4 +4,4 @@ transformers
|
||||
accelerate
|
||||
torch
|
||||
optimum[openvino]
|
||||
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
scipy==1.14.0
|
||||
certifi
|
||||
@@ -63,7 +63,7 @@ class TestBackendServicer(unittest.TestCase):
|
||||
|
||||
def test_tts(self):
|
||||
"""
|
||||
This method tests if the embeddings are generated successfully
|
||||
This method tests if TTS is generated successfully
|
||||
"""
|
||||
try:
|
||||
self.setUp()
|
||||
@@ -77,5 +77,24 @@ class TestBackendServicer(unittest.TestCase):
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("TTS service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
|
||||
def test_sound_generation(self):
|
||||
"""
|
||||
This method tests if SoundGeneration is generated successfully
|
||||
"""
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small"))
|
||||
self.assertTrue(response.success)
|
||||
sg_request = backend_pb2.SoundGenerationRequest(text="80s TV news production music hit for tonight's biggest story")
|
||||
sg_response = stub.SoundGeneration(sg_request)
|
||||
self.assertIsNotNone(sg_response)
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("SoundGeneration service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -4,4 +4,4 @@ accelerate
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,3 +1,3 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -5,6 +5,8 @@ import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
from typing import List
|
||||
from PIL import Image
|
||||
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
@@ -15,6 +17,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.assets.video import VideoAsset
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
@@ -105,6 +109,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
try:
|
||||
self.llm = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
except Exception as err:
|
||||
print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
|
||||
try:
|
||||
@@ -117,7 +122,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
)
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
|
||||
print("Model loaded successfully", file=sys.stderr)
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
async def Predict(self, request, context):
|
||||
@@ -135,6 +140,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
res = await gen.__anext__()
|
||||
return res
|
||||
|
||||
def Embedding(self, request, context):
|
||||
"""
|
||||
A gRPC method that calculates embeddings for a given sentence.
|
||||
|
||||
Args:
|
||||
request: An EmbeddingRequest object that contains the request parameters.
|
||||
context: A grpc.ServicerContext object that provides information about the RPC.
|
||||
|
||||
Returns:
|
||||
An EmbeddingResult object that contains the calculated embeddings.
|
||||
"""
|
||||
print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
|
||||
outputs = self.model.encode(request.Embeddings)
|
||||
# Check if we have one result at least
|
||||
if len(outputs) == 0:
|
||||
context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
|
||||
context.set_details("No embeddings were calculated.")
|
||||
return backend_pb2.EmbeddingResult()
|
||||
return backend_pb2.EmbeddingResult(embeddings=outputs[0].outputs.embedding)
|
||||
|
||||
async def PredictStream(self, request, context):
|
||||
"""
|
||||
Generates text based on the given prompt and sampling parameters, and streams the results.
|
||||
@@ -176,15 +201,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if request.Seed != 0:
|
||||
sampling_params.seed = request.Seed
|
||||
|
||||
# Extract image paths and process images
|
||||
prompt = request.Prompt
|
||||
|
||||
# If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
|
||||
|
||||
image_paths = request.Images
|
||||
image_data = [self.load_image(img_path) for img_path in image_paths]
|
||||
|
||||
videos_path = request.Videos
|
||||
video_data = [self.load_video(video_path) for video_path in videos_path]
|
||||
|
||||
# If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
|
||||
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
|
||||
prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
|
||||
|
||||
# Generate text
|
||||
# Generate text using the LLM engine
|
||||
request_id = random_uuid()
|
||||
outputs = self.llm.generate(prompt, sampling_params, request_id)
|
||||
print(f"Generating text with request_id: {request_id}", file=sys.stderr)
|
||||
outputs = self.llm.generate(
|
||||
{
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"image": image_data if image_data else None,
|
||||
"video": video_data if video_data else None,
|
||||
} if image_data or video_data else None,
|
||||
},
|
||||
sampling_params=sampling_params,
|
||||
request_id=request_id,
|
||||
)
|
||||
|
||||
# Stream the results
|
||||
generated_text = ""
|
||||
@@ -207,9 +250,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if streaming:
|
||||
return
|
||||
|
||||
# Remove the image files from /tmp folder
|
||||
for img_path in image_paths:
|
||||
try:
|
||||
os.remove(img_path)
|
||||
except Exception as e:
|
||||
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
|
||||
|
||||
# Sending the final generated text
|
||||
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
|
||||
|
||||
def load_image(self, image_path: str):
|
||||
"""
|
||||
Load an image from the given file path.
|
||||
|
||||
Args:
|
||||
image_path (str): The path to the image file.
|
||||
|
||||
Returns:
|
||||
Image: The loaded image.
|
||||
"""
|
||||
try:
|
||||
return Image.open(image_path)
|
||||
except Exception as e:
|
||||
print(f"Error loading image {image_path}: {e}", file=sys.stderr)
|
||||
return self.load_video(image_path)
|
||||
|
||||
def load_video(self, video_path: str):
|
||||
"""
|
||||
Load a video from the given file path.
|
||||
|
||||
Args:
|
||||
video_path (str): The path to the image file.
|
||||
|
||||
Returns:
|
||||
Video: The loaded video.
|
||||
"""
|
||||
try:
|
||||
video = VideoAsset(name=video_path).np_ndarrays
|
||||
return video
|
||||
except Exception as e:
|
||||
print(f"Error loading video {image_path}: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
async def serve(address):
|
||||
# Start asyncio gRPC server
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
|
||||
@@ -13,4 +13,18 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
|
||||
installRequirements
|
||||
if [ "x${BUILD_TYPE}" == "x" ]; then
|
||||
ensureVenv
|
||||
# https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
|
||||
if [ ! -d vllm ]; then
|
||||
git clone https://github.com/vllm-project/vllm
|
||||
fi
|
||||
pushd vllm
|
||||
uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
|
||||
uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py install
|
||||
popd
|
||||
rm -rf vllm
|
||||
else
|
||||
installRequirements
|
||||
fi
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
accelerate
|
||||
torch
|
||||
transformers
|
||||
transformers
|
||||
bitsandbytes
|
||||
@@ -1,3 +1,4 @@
|
||||
accelerate
|
||||
torch
|
||||
transformers
|
||||
transformers
|
||||
bitsandbytes
|
||||
@@ -1,4 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
accelerate
|
||||
torch
|
||||
transformers
|
||||
transformers
|
||||
bitsandbytes
|
||||
@@ -4,4 +4,5 @@ accelerate
|
||||
torch
|
||||
transformers
|
||||
optimum[openvino]
|
||||
setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
bitsandbytes
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.65.5
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
||||
@@ -72,5 +72,28 @@ class TestBackendServicer(unittest.TestCase):
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("text service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
|
||||
def test_embedding(self):
|
||||
"""
|
||||
This method tests if the embeddings are generated successfully
|
||||
"""
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct"))
|
||||
self.assertTrue(response.success)
|
||||
embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
|
||||
embedding_response = stub.Embedding(embedding_request)
|
||||
self.assertIsNotNone(embedding_response.embeddings)
|
||||
# assert that is a list of floats
|
||||
self.assertIsInstance(embedding_response.embeddings, list)
|
||||
# assert that the list is not empty
|
||||
self.assertTrue(len(embedding_response.embeddings) > 0)
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("Embedding service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
core/backend/backend_suite_test.go (new file, 13 lines)
@@ -0,0 +1,13 @@
|
||||
package backend_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
func TestBackend(t *testing.T) {
|
||||
RegisterFailHandler(Fail)
|
||||
RunSpecs(t, "Backend test suite")
|
||||
}
|
||||
@@ -10,20 +10,11 @@ import (
|
||||
)
|
||||
|
||||
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
|
||||
modelFile := backendConfig.Model
|
||||
|
||||
grpcOpts := gRPCModelOpts(backendConfig)
|
||||
|
||||
var inferenceModel interface{}
|
||||
var err error
|
||||
|
||||
opts := modelOpts(backendConfig, appConfig, []model.Option{
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
model.WithThreads(uint32(*backendConfig.Threads)),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(appConfig.Context),
|
||||
})
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{})
|
||||
|
||||
if backendConfig.Backend == "" {
|
||||
inferenceModel, err = loader.GreedyLoader(opts...)
|
||||
|
||||
@@ -8,19 +8,8 @@ import (
|
||||
)
|
||||
|
||||
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
|
||||
threads := backendConfig.Threads
|
||||
if *threads == 0 && appConfig.Threads != 0 {
|
||||
threads = &appConfig.Threads
|
||||
}
|
||||
gRPCOpts := gRPCModelOpts(backendConfig)
|
||||
opts := modelOpts(backendConfig, appConfig, []model.Option{
|
||||
model.WithBackendString(backendConfig.Backend),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithThreads(uint32(*threads)),
|
||||
model.WithContext(appConfig.Context),
|
||||
model.WithModel(backendConfig.Model),
|
||||
model.WithLoadGRPCLoadModelOpts(gRPCOpts),
|
||||
})
|
||||
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{})
|
||||
|
||||
inferenceModel, err := loader.BackendLoader(
|
||||
opts...,
|
||||
|
||||
@@ -9,6 +9,8 @@ import (
|
||||
"sync"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
@@ -29,24 +31,13 @@ type TokenUsage struct {
|
||||
Completion int
|
||||
}
|
||||
|
||||
func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||||
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||||
modelFile := c.Model
|
||||
threads := c.Threads
|
||||
if *threads == 0 && o.Threads != 0 {
|
||||
threads = &o.Threads
|
||||
}
|
||||
grpcOpts := gRPCModelOpts(c)
|
||||
|
||||
var inferenceModel grpc.Backend
|
||||
var err error
|
||||
|
||||
opts := modelOpts(c, o, []model.Option{
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
|
||||
model.WithAssetDir(o.AssetsDestination),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(o.Context),
|
||||
})
|
||||
opts := ModelOptions(c, o, []model.Option{})
|
||||
|
||||
if c.Backend != "" {
|
||||
opts = append(opts, model.WithBackendString(c.Backend))
|
||||
@@ -87,7 +78,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
||||
case string:
|
||||
protoMessages[i].Content = ct
|
||||
default:
|
||||
return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
|
||||
return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -99,6 +90,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
||||
opts.Messages = protoMessages
|
||||
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
|
||||
opts.Images = images
|
||||
opts.Videos = videos
|
||||
opts.Audios = audios
|
||||
|
||||
tokenUsage := TokenUsage{}
|
||||
|
||||
@@ -181,13 +174,37 @@ func Finetune(config config.BackendConfig, input, prediction string) string {
|
||||
mu.Lock()
|
||||
reg, ok := cutstrings[c]
|
||||
if !ok {
|
||||
cutstrings[c] = regexp.MustCompile(c)
|
||||
r, err := regexp.Compile(c)
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to compile regex")
|
||||
}
|
||||
cutstrings[c] = r
|
||||
reg = cutstrings[c]
|
||||
}
|
||||
mu.Unlock()
|
||||
prediction = reg.ReplaceAllString(prediction, "")
|
||||
}
|
||||
|
||||
// extract results from the response which can be for instance inside XML tags
|
||||
var predResult string
|
||||
for _, r := range config.ExtractRegex {
|
||||
mu.Lock()
|
||||
reg, ok := cutstrings[r]
|
||||
if !ok {
|
||||
regex, err := regexp.Compile(r)
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to compile regex")
|
||||
}
|
||||
cutstrings[r] = regex
|
||||
reg = regex
|
||||
}
|
||||
mu.Unlock()
|
||||
predResult += reg.FindString(prediction)
|
||||
}
|
||||
if predResult != "" {
|
||||
prediction = predResult
|
||||
}
|
||||
|
||||
for _, c := range config.TrimSpace {
|
||||
prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
|
||||
}
|
||||
|
||||
core/backend/llm_test.go (new file, 109 lines)
@@ -0,0 +1,109 @@
|
||||
package backend_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("LLM tests", func() {
|
||||
Context("Finetune LLM output", func() {
|
||||
var (
|
||||
testConfig config.BackendConfig
|
||||
input string
|
||||
prediction string
|
||||
result string
|
||||
)
|
||||
|
||||
BeforeEach(func() {
|
||||
testConfig = config.BackendConfig{
|
||||
PredictionOptions: schema.PredictionOptions{
|
||||
Echo: false,
|
||||
},
|
||||
LLMConfig: config.LLMConfig{
|
||||
Cutstrings: []string{`<.*?>`}, // Example regex for removing XML tags
|
||||
ExtractRegex: []string{`<result>(.*?)</result>`}, // Example regex to extract from tags
|
||||
TrimSpace: []string{" ", "\n"},
|
||||
TrimSuffix: []string{".", "!"},
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
Context("when echo is enabled", func() {
|
||||
BeforeEach(func() {
|
||||
testConfig.Echo = true
|
||||
input = "Hello"
|
||||
prediction = "World"
|
||||
})
|
||||
|
||||
It("should prepend input to prediction", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("HelloWorld"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when echo is disabled", func() {
|
||||
BeforeEach(func() {
|
||||
testConfig.Echo = false
|
||||
input = "Hello"
|
||||
prediction = "World"
|
||||
})
|
||||
|
||||
It("should not modify the prediction with input", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("World"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when cutstrings regex is applied", func() {
|
||||
BeforeEach(func() {
|
||||
input = ""
|
||||
prediction = "<div>Hello</div> World"
|
||||
})
|
||||
|
||||
It("should remove substrings matching cutstrings regex", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("Hello World"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when extract regex is applied", func() {
|
||||
BeforeEach(func() {
|
||||
input = ""
|
||||
prediction = "<response><result>42</result></response>"
|
||||
})
|
||||
|
||||
It("should extract substrings matching the extract regex", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("42"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when trimming spaces", func() {
|
||||
BeforeEach(func() {
|
||||
input = ""
|
||||
prediction = " Hello World "
|
||||
})
|
||||
|
||||
It("should trim spaces from the prediction", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("Hello World"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when trimming suffixes", func() {
|
||||
BeforeEach(func() {
|
||||
input = ""
|
||||
prediction = "Hello World."
|
||||
})
|
||||
|
||||
It("should trim suffixes from the prediction", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("Hello World"))
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -11,32 +11,65 @@ import (
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
|
||||
func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
|
||||
name := c.Name
|
||||
if name == "" {
|
||||
name = c.Model
|
||||
}
|
||||
|
||||
defOpts := []model.Option{
|
||||
model.WithBackendString(c.Backend),
|
||||
model.WithModel(c.Model),
|
||||
model.WithAssetDir(so.AssetsDestination),
|
||||
model.WithContext(so.Context),
|
||||
model.WithModelID(name),
|
||||
}
|
||||
|
||||
threads := 1
|
||||
|
||||
if c.Threads != nil {
|
||||
threads = *c.Threads
|
||||
}
|
||||
|
||||
if so.Threads != 0 {
|
||||
threads = so.Threads
|
||||
}
|
||||
|
||||
c.Threads = &threads
|
||||
|
||||
grpcOpts := grpcModelOpts(c)
|
||||
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
|
||||
|
||||
if so.SingleBackend {
|
||||
opts = append(opts, model.WithSingleActiveBackend())
|
||||
defOpts = append(defOpts, model.WithSingleActiveBackend())
|
||||
}
|
||||
|
||||
if so.ParallelBackendRequests {
|
||||
opts = append(opts, model.EnableParallelRequests)
|
||||
defOpts = append(defOpts, model.EnableParallelRequests)
|
||||
}
|
||||
|
||||
if c.GRPC.Attempts != 0 {
|
||||
opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
|
||||
defOpts = append(defOpts, model.WithGRPCAttempts(c.GRPC.Attempts))
|
||||
}
|
||||
|
||||
if c.GRPC.AttemptsSleepTime != 0 {
|
||||
opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
|
||||
defOpts = append(defOpts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
|
||||
}
|
||||
|
||||
for k, v := range so.ExternalGRPCBackends {
|
||||
opts = append(opts, model.WithExternalBackend(k, v))
|
||||
defOpts = append(defOpts, model.WithExternalBackend(k, v))
|
||||
}
|
||||
|
||||
return opts
|
||||
return append(defOpts, opts...)
|
||||
}
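As a hedged usage sketch of the refactored helper above: callers now pass only the options they want to customize, and because the caller-supplied slice is appended after the defaults, those options take effect last. The wrapper function below is illustrative; all identifiers in it appear elsewhere in this diff.

```go
package backend_example

import (
	"github.com/mudler/LocalAI/core/backend"
	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/grpc"
	model "github.com/mudler/LocalAI/pkg/model"
)

// loadForInference builds the default option set (backend string, model,
// asset dir, context, threads, gRPC model opts) from the configs, then lets
// caller-specific options override them, mirroring how Rerank uses it.
func loadForInference(loader *model.ModelLoader, c config.BackendConfig, app *config.ApplicationConfig) (grpc.Backend, error) {
	opts := backend.ModelOptions(c, app, []model.Option{
		model.WithModel(c.Model), // caller-side override, appended last
	})
	return loader.BackendLoader(opts...)
}
```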
|
||||
|
||||
func getSeed(c config.BackendConfig) int32 {
|
||||
seed := int32(*c.Seed)
|
||||
var seed int32 = config.RAND_SEED
|
||||
|
||||
if c.Seed != nil {
|
||||
seed = int32(*c.Seed)
|
||||
}
|
||||
|
||||
if seed == config.RAND_SEED {
|
||||
seed = rand.Int31()
|
||||
}
|
||||
@@ -44,11 +77,47 @@ func getSeed(c config.BackendConfig) int32 {
|
||||
return seed
|
||||
}
|
||||
|
||||
func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
b := 512
|
||||
if c.Batch != 0 {
|
||||
b = c.Batch
|
||||
}
|
||||
|
||||
f16 := false
|
||||
if c.F16 != nil {
|
||||
f16 = *c.F16
|
||||
}
|
||||
|
||||
embeddings := false
|
||||
if c.Embeddings != nil {
|
||||
embeddings = *c.Embeddings
|
||||
}
|
||||
|
||||
lowVRAM := false
|
||||
if c.LowVRAM != nil {
|
||||
lowVRAM = *c.LowVRAM
|
||||
}
|
||||
|
||||
mmap := false
|
||||
if c.MMap != nil {
|
||||
mmap = *c.MMap
|
||||
}
|
||||
|
||||
ctxSize := 1024
|
||||
if c.ContextSize != nil {
|
||||
ctxSize = *c.ContextSize
|
||||
}
|
||||
|
||||
mmlock := false
|
||||
if c.MMlock != nil {
|
||||
mmlock = *c.MMlock
|
||||
}
|
||||
|
||||
nGPULayers := 9999999
|
||||
if c.NGPULayers != nil {
|
||||
nGPULayers = *c.NGPULayers
|
||||
}
|
||||
|
||||
return &pb.ModelOptions{
|
||||
CUDA: c.CUDA || c.Diffusers.CUDA,
|
||||
SchedulerType: c.Diffusers.SchedulerType,
|
||||
@@ -56,14 +125,14 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
CFGScale: c.Diffusers.CFGScale,
|
||||
LoraAdapter: c.LoraAdapter,
|
||||
LoraScale: c.LoraScale,
|
||||
F16Memory: *c.F16,
|
||||
F16Memory: f16,
|
||||
LoraBase: c.LoraBase,
|
||||
IMG2IMG: c.Diffusers.IMG2IMG,
|
||||
CLIPModel: c.Diffusers.ClipModel,
|
||||
CLIPSubfolder: c.Diffusers.ClipSubFolder,
|
||||
CLIPSkip: int32(c.Diffusers.ClipSkip),
|
||||
ControlNet: c.Diffusers.ControlNet,
|
||||
ContextSize: int32(*c.ContextSize),
|
||||
ContextSize: int32(ctxSize),
|
||||
Seed: getSeed(c),
|
||||
NBatch: int32(b),
|
||||
NoMulMatQ: c.NoMulMatQ,
|
||||
@@ -85,16 +154,16 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
||||
YarnBetaSlow: c.YarnBetaSlow,
|
||||
NGQA: c.NGQA,
|
||||
RMSNormEps: c.RMSNormEps,
|
||||
MLock: *c.MMlock,
|
||||
MLock: mmlock,
|
||||
RopeFreqBase: c.RopeFreqBase,
|
||||
RopeScaling: c.RopeScaling,
|
||||
Type: c.ModelType,
|
||||
RopeFreqScale: c.RopeFreqScale,
|
||||
NUMA: c.NUMA,
|
||||
Embeddings: *c.Embeddings,
|
||||
LowVRAM: *c.LowVRAM,
|
||||
NGPULayers: int32(*c.NGPULayers),
|
||||
MMap: *c.MMap,
|
||||
Embeddings: embeddings,
|
||||
LowVRAM: lowVRAM,
|
||||
NGPULayers: int32(nGPULayers),
|
||||
MMap: mmap,
|
||||
MainGPU: c.MainGPU,
|
||||
Threads: int32(*c.Threads),
|
||||
TensorSplit: c.TensorSplit,
|
||||
|
||||
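The hunk above replaces direct dereferences such as `*c.F16` and `*c.MMlock` with nil checks so that unset optional fields fall back to sane defaults instead of panicking. A generic sketch of the same guard pattern follows; the helper names are made up for illustration, while the diff simply inlines the checks per field.

```go
// boolOrDefault and intOrDefault capture the nil-guard pattern used above for
// optional, pointer-typed config fields. Hypothetical helpers, not part of the diff.
func boolOrDefault(v *bool, def bool) bool {
	if v != nil {
		return *v
	}
	return def
}

func intOrDefault(v *int, def int) int {
	if v != nil {
		return *v
	}
	return def
}
```

With these, `f16 := boolOrDefault(c.F16, false)` and `ctxSize := intOrDefault(c.ContextSize, 1024)` would express the same defaults the diff writes out by hand.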
@@ -9,21 +9,9 @@ import (
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
|
||||
bb := backend
|
||||
if bb == "" {
|
||||
return nil, fmt.Errorf("backend is required")
|
||||
}
|
||||
func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
|
||||
|
||||
grpcOpts := gRPCModelOpts(backendConfig)
|
||||
|
||||
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
|
||||
model.WithBackendString(bb),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(appConfig.Context),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
})
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
|
||||
rerankModel, err := loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
||||
core/backend/soundgeneration.go (new file, 63 lines)
@@ -0,0 +1,63 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
)
|
||||
|
||||
func SoundGeneration(
|
||||
modelFile string,
|
||||
text string,
|
||||
duration *float32,
|
||||
temperature *float32,
|
||||
doSample *bool,
|
||||
sourceFile *string,
|
||||
sourceDivisor *int32,
|
||||
loader *model.ModelLoader,
|
||||
appConfig *config.ApplicationConfig,
|
||||
backendConfig config.BackendConfig,
|
||||
) (string, *proto.Result, error) {
|
||||
|
||||
opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
|
||||
|
||||
soundGenModel, err := loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
if soundGenModel == nil {
|
||||
return "", nil, fmt.Errorf("could not load sound generation model")
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
|
||||
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
|
||||
}
|
||||
|
||||
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
|
||||
filePath := filepath.Join(appConfig.AudioDir, fileName)
|
||||
|
||||
res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
|
||||
Text: text,
|
||||
Model: modelFile,
|
||||
Dst: filePath,
|
||||
Sample: doSample,
|
||||
Duration: duration,
|
||||
Temperature: temperature,
|
||||
Src: sourceFile,
|
||||
SrcDivisor: sourceDivisor,
|
||||
})
|
||||
|
||||
// return RPC error if any
|
||||
if !res.Success {
|
||||
return "", nil, fmt.Errorf(res.Message)
|
||||
}
|
||||
|
||||
return filePath, res, err
|
||||
}
|
||||
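For orientation, a hedged sketch of calling the new helper directly, mirroring the wiring the soundgeneration CLI added later in this diff performs. The paths, backend name, and model name are placeholder assumptions, not values from this changeset:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/mudler/LocalAI/core/backend"
	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/model"
)

func main() {
	// Placeholder paths; adjust to a real models/audio directory.
	appCfg := &config.ApplicationConfig{
		ModelPath:         "/models",
		Context:           context.Background(),
		AudioDir:          "/tmp/audio",
		AssetsDestination: "/tmp/localai/backend_data",
	}
	ml := model.NewModelLoader(appCfg.ModelPath)
	defer ml.StopAllGRPC()

	bc := config.BackendConfig{}
	bc.SetDefaults()
	bc.Backend = "transformers-musicgen" // assumed backend name

	duration := float32(5)
	doSample := true
	// Optional condition file and divisor are left nil here.
	path, _, err := backend.SoundGeneration("musicgen-small", "an upbeat jingle",
		&duration, nil, &doSample, nil, nil, ml, appCfg, bc)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("generated:", path)
}
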
33 core/backend/token_metrics.go Normal file
@@ -0,0 +1,33 @@
package backend

import (
"context"
"fmt"

"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
)

func TokenMetrics(
modelFile string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig) (*proto.MetricsResponse, error) {

opts := ModelOptions(backendConfig, appConfig, []model.Option{
model.WithModel(modelFile),
})
model, err := loader.BackendLoader(opts...)
if err != nil {
return nil, err
}

if model == nil {
return nil, fmt.Errorf("could not loadmodel model")
}

res, err := model.GetTokenMetrics(context.Background(), &proto.MetricsRequest{})

return res, err
}

44 core/backend/tokenize.go Normal file
@@ -0,0 +1,44 @@
package backend

import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/grpc"
model "github.com/mudler/LocalAI/pkg/model"
)

func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {

modelFile := backendConfig.Model

var inferenceModel grpc.Backend
var err error

opts := ModelOptions(backendConfig, appConfig, []model.Option{
model.WithModel(modelFile),
})

if backendConfig.Backend == "" {
inferenceModel, err = loader.GreedyLoader(opts...)
} else {
opts = append(opts, model.WithBackendString(backendConfig.Backend))
inferenceModel, err = loader.BackendLoader(opts...)
}
if err != nil {
return schema.TokenizeResponse{}, err
}

predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
predictOptions.Prompt = s

// tokenize the string
resp, err := inferenceModel.TokenizeString(appConfig.Context, predictOptions)
if err != nil {
return schema.TokenizeResponse{}, err
}

return schema.TokenizeResponse{
Tokens: resp.Tokens,
}, nil

}

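A hedged usage sketch of the new ModelTokenize helper. The paths and model file name are placeholders; because no Backend is set, the loader falls back to GreedyLoader as in the code above:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/mudler/LocalAI/core/backend"
	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/model"
)

func main() {
	// Placeholder paths and model name, not taken from this diff.
	appCfg := &config.ApplicationConfig{
		ModelPath: "/models",
		Context:   context.Background(),
	}
	ml := model.NewModelLoader(appCfg.ModelPath)

	bc := config.BackendConfig{}
	bc.SetDefaults()
	bc.Model = "ggml-model.gguf" // hypothetical model file under ModelPath

	resp, err := backend.ModelTokenize("Hello, LocalAI!", ml, bc, appCfg)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("tokens:", len(resp.Tokens))
}
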
@@ -3,37 +3,57 @@ package backend
import (
"context"
"fmt"
"time"

"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/schema"

"github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/model"
)

func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {

opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(model.WhisperBackend),
model.WithModel(backendConfig.Model),
model.WithContext(appConfig.Context),
model.WithThreads(uint32(*backendConfig.Threads)),
model.WithAssetDir(appConfig.AssetsDestination),
})
if backendConfig.Backend == "" {
backendConfig.Backend = model.WhisperBackend
}

whisperModel, err := ml.BackendLoader(opts...)
opts := ModelOptions(backendConfig, appConfig, []model.Option{})

transcriptionModel, err := ml.BackendLoader(opts...)
if err != nil {
return nil, err
}

if whisperModel == nil {
return nil, fmt.Errorf("could not load whisper model")
if transcriptionModel == nil {
return nil, fmt.Errorf("could not load transcription model")
}

return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
Dst: audio,
Language: language,
Translate: translate,
Threads: uint32(*backendConfig.Threads),
})
if err != nil {
return nil, err
}
tr := &schema.TranscriptionResult{
Text: r.Text,
}
for _, s := range r.Segments {
var tks []int
for _, t := range s.Tokens {
tks = append(tks, int(t))
}
tr.Segments = append(tr.Segments,
schema.Segment{
Text: s.Text,
Id: int(s.Id),
Start: time.Duration(s.Start),
End: time.Duration(s.End),
Tokens: tks,
})
}
return tr, err
}

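ModelTranscription now maps the raw proto segments into schema types instead of returning the proto response directly. A small, self-contained sketch of the resulting shape; the sample text and token IDs are made up:

package main

import (
	"fmt"
	"time"

	"github.com/mudler/LocalAI/core/schema"
)

func main() {
	// Hand-built result, just to show the shape ModelTranscription now fills in.
	tr := &schema.TranscriptionResult{
		Text: "hello world",
		Segments: []schema.Segment{
			{Id: 0, Start: 0, End: 2 * time.Second, Text: "hello world", Tokens: []int{15496, 995}},
		},
	}
	fmt.Println(tr.Text)
	for _, seg := range tr.Segments {
		fmt.Printf("[%s - %s] %s (%d tokens)\n", seg.Start, seg.End, seg.Text, len(seg.Tokens))
	}
}
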
@@ -9,31 +9,15 @@ import (
"github.com/mudler/LocalAI/core/config"

"github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/utils"
)

func generateUniqueFileName(dir, baseName, ext string) string {
counter := 1
fileName := baseName + ext

for {
filePath := filepath.Join(dir, fileName)
_, err := os.Stat(filePath)
if os.IsNotExist(err) {
return fileName
}

counter++
fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
}
}

func ModelTTS(
backend,
text,
modelFile,
voice ,
voice,
language string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
@@ -44,14 +28,9 @@ func ModelTTS(
bb = model.PiperBackend
}

grpcOpts := gRPCModelOpts(backendConfig)

opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
opts := ModelOptions(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(bb),
model.WithModel(modelFile),
model.WithContext(appConfig.Context),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
})
ttsModel, err := loader.BackendLoader(opts...)
if err != nil {
@@ -66,7 +45,7 @@ func ModelTTS(
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
}

fileName := generateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
filePath := filepath.Join(appConfig.AudioDir, fileName)

// If the model file is not empty, we pass it joined with the model path
@@ -88,12 +67,15 @@ func ModelTTS(
}

res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
Language: &language,
})
if err != nil {
return "", nil, err
}

// return RPC error if any
if !res.Success {

80 core/cli/api/p2p.go Normal file
@@ -0,0 +1,80 @@
package cli_api

import (
"context"
"fmt"
"net"
"os"
"strings"

"github.com/mudler/LocalAI/core/p2p"
"github.com/mudler/edgevpn/pkg/node"

"github.com/rs/zerolog/log"
)

func StartP2PStack(ctx context.Context, address, token, networkID string, federated bool) error {
var n *node.Node
// Here we are avoiding creating multiple nodes:
// - if the federated mode is enabled, we create a federated node and expose a service
// - exposing a service creates a node with specific options, and we don't want to create another node

// If the federated mode is enabled, we expose a service to the local instance running
// at r.Address
if federated {
_, port, err := net.SplitHostPort(address)
if err != nil {
return err
}

// Here a new node is created and started
// and a service is exposed by the node
node, err := p2p.ExposeService(ctx, "localhost", port, token, p2p.NetworkID(networkID, p2p.FederatedID))
if err != nil {
return err
}

if err := p2p.ServiceDiscoverer(ctx, node, token, p2p.NetworkID(networkID, p2p.FederatedID), nil, false); err != nil {
return err
}

n = node
}

// If the p2p mode is enabled, we start the service discovery
if token != "" {
// If a node wasn't created previously, create it
if n == nil {
node, err := p2p.NewNode(token)
if err != nil {
return err
}
err = node.Start(ctx)
if err != nil {
return fmt.Errorf("starting new node: %w", err)
}
n = node
}

// Attach a ServiceDiscoverer to the p2p node
log.Info().Msg("Starting P2P server discovery...")
if err := p2p.ServiceDiscoverer(ctx, n, token, p2p.NetworkID(networkID, p2p.WorkerID), func(serviceID string, node p2p.NodeData) {
var tunnelAddresses []string
for _, v := range p2p.GetAvailableNodes(p2p.NetworkID(networkID, p2p.WorkerID)) {
if v.IsOnline() {
tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
} else {
log.Info().Msgf("Node %s is offline", v.ID)
}
}
tunnelEnvVar := strings.Join(tunnelAddresses, ",")

os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
}, true); err != nil {
return err
}
}

return nil
}

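A hedged sketch of driving the new StartP2PStack entrypoint, matching the call core/cli/run.go makes further down in this diff. The address, token, and network ID values are placeholders, and the p2p code may be gated behind the corresponding build tag:

package main

import (
	"context"
	"log"

	cli_api "github.com/mudler/LocalAI/core/cli/api"
)

func main() {
	// federated=true exposes the local API over p2p and also starts worker discovery.
	err := cli_api.StartP2PStack(context.Background(), ":8080", "example-token", "example-network", true)
	if err != nil {
		log.Fatal(err)
	}
}
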
@@ -8,12 +8,13 @@ import (
var CLI struct {
cliContext.Context `embed:""`

Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
Federated FederatedCLI `cmd:"" help:"Run LocalAI in federated mode"`
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
Worker worker.Worker `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
Util UtilCMD `cmd:"" help:"Utility commands"`
Explorer ExplorerCMD `cmd:"" help:"Run p2p explorer"`
Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
Federated FederatedCLI `cmd:"" help:"Run LocalAI in federated mode"`
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
SoundGeneration SoundGenerationCMD `cmd:"" help:"Generates audio files from text or audio"`
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
Worker worker.Worker `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
Util UtilCMD `cmd:"" help:"Utility commands"`
Explorer ExplorerCMD `cmd:"" help:"Run p2p explorer"`
}

107 core/cli/run.go
@@ -3,11 +3,10 @@ package cli
import (
"context"
"fmt"
"net"
"os"
"strings"
"time"

cli_api "github.com/mudler/LocalAI/core/cli/api"
cliContext "github.com/mudler/LocalAI/core/cli/context"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http"
@@ -42,29 +41,35 @@ type RunCMD struct {
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`

Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
CORSAllowOrigins string `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
LibraryPath string `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
Peer2Peer bool `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
WatchdogIdleTimeout string `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
EnableWatchdogBusy bool `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
DisableGalleryEndpoint bool `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
CORSAllowOrigins string `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
LibraryPath string `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
UseSubtleKeyComparison bool `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
DisableApiKeyRequirementForHttpGet bool `env:"LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET" default:"false" help:"If true, a valid API key is not required to issue GET requests to portions of the web ui. This should only be enabled in secure testing environments" group:"hardening"`
HttpGetExemptedEndpoints []string `env:"LOCALAI_HTTP_GET_EXEMPTED_ENDPOINTS" default:"^/$,^/browse/?$,^/talk/?$,^/p2p/?$,^/chat/?$,^/text2image/?$,^/tts/?$,^/static/.*$,^/swagger.*$" help:"If LOCALAI_DISABLE_API_KEY_REQUIREMENT_FOR_HTTP_GET is overriden to true, this is the list of endpoints to exempt. Only adjust this in case of a security incident or as a result of a personal security posture review" group:"hardening"`
Peer2Peer bool `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
Peer2PeerDHTInterval int `env:"LOCALAI_P2P_DHT_INTERVAL,P2P_DHT_INTERVAL" default:"360" name:"p2p-dht-interval" help:"Interval for DHT refresh (used during token generation)" group:"p2p"`
Peer2PeerOTPInterval int `env:"LOCALAI_P2P_OTP_INTERVAL,P2P_OTP_INTERVAL" default:"9000" name:"p2p-otp-interval" help:"Interval for OTP refresh (used during token generation)" group:"p2p"`
Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
Peer2PeerNetworkID string `env:"LOCALAI_P2P_NETWORK_ID,P2P_NETWORK_ID" help:"Network ID for P2P mode, can be set arbitrarly by the user for grouping a set of instances" group:"p2p"`
ParallelRequests bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
PreloadBackendOnly bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
EnableWatchdogIdle bool `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
WatchdogIdleTimeout string `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
EnableWatchdogBusy bool `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
DisableGalleryEndpoint bool `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
LoadToMemory []string `env:"LOCALAI_LOAD_TO_MEMORY,LOAD_TO_MEMORY" help:"A list of models to load into memory at startup" group:"models"`
}

func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -96,7 +101,11 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
config.WithOpaqueErrors(r.OpaqueErrors),
config.WithEnforcedPredownloadScans(!r.DisablePredownloadScan),
config.WithSubtleKeyComparison(r.UseSubtleKeyComparison),
config.WithDisableApiKeyRequirementForHttpGet(r.DisableApiKeyRequirementForHttpGet),
config.WithHttpGetExemptedEndpoints(r.HttpGetExemptedEndpoints),
config.WithP2PNetworkID(r.Peer2PeerNetworkID),
config.WithLoadToMemory(r.LoadToMemory),
}

token := ""
@@ -107,7 +116,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
// IF no token is provided, and p2p is enabled,
// we generate one and wait for the user to pick up the token (this is for interactive)
log.Info().Msg("No token provided, generating one")
token = p2p.GenerateToken()
token = p2p.GenerateToken(r.Peer2PeerDHTInterval, r.Peer2PeerOTPInterval)
log.Info().Msg("Generated Token:")
fmt.Println(token)

@@ -115,52 +124,12 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
fmt.Printf("export TOKEN=\"%s\"\nlocal-ai worker p2p-llama-cpp-rpc\n", token)
}
opts = append(opts, config.WithP2PToken(token))

node, err := p2p.NewNode(token)
if err != nil {
return err
}
nodeContext := context.Background()

err = node.Start(nodeContext)
if err != nil {
return fmt.Errorf("starting new node: %w", err)
}

log.Info().Msg("Starting P2P server discovery...")
if err := p2p.ServiceDiscoverer(nodeContext, node, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.WorkerID), func(serviceID string, node p2p.NodeData) {
var tunnelAddresses []string
for _, v := range p2p.GetAvailableNodes(p2p.NetworkID(r.Peer2PeerNetworkID, p2p.WorkerID)) {
if v.IsOnline() {
tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
} else {
log.Info().Msgf("Node %s is offline", v.ID)
}
}
tunnelEnvVar := strings.Join(tunnelAddresses, ",")

os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
}, true); err != nil {
return err
}
}

if r.Federated {
_, port, err := net.SplitHostPort(r.Address)
if err != nil {
return err
}
fedCtx := context.Background()
backgroundCtx := context.Background()

node, err := p2p.ExposeService(fedCtx, "localhost", port, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.FederatedID))
if err != nil {
return err
}

if err := p2p.ServiceDiscoverer(fedCtx, node, token, p2p.NetworkID(r.Peer2PeerNetworkID, p2p.FederatedID), nil, false); err != nil {
return err
}
if err := cli_api.StartP2PStack(backgroundCtx, r.Address, token, r.Peer2PeerNetworkID, r.Federated); err != nil {
return err
}

idleWatchDog := r.EnableWatchdogIdle

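The run command now forwards the two new interval flags to token generation. A minimal sketch of the same call; 360 and 9000 are the CLI defaults shown above, and the p2p package may be gated behind a build tag:

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/core/p2p"
)

func main() {
	// Same call the run command makes when no token is supplied:
	// DHT refresh interval, then OTP refresh interval.
	token := p2p.GenerateToken(360, 9000)
	fmt.Printf("export TOKEN=%q\nlocal-ai worker p2p-llama-cpp-rpc\n", token)
}
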
111 core/cli/soundgeneration.go Normal file
@@ -0,0 +1,111 @@
package cli

import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"

"github.com/mudler/LocalAI/core/backend"
cliContext "github.com/mudler/LocalAI/core/cli/context"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/model"
"github.com/rs/zerolog/log"
)

type SoundGenerationCMD struct {
Text []string `arg:""`

Backend string `short:"b" required:"" help:"Backend to run the SoundGeneration model"`
Model string `short:"m" required:"" help:"Model name to run the SoundGeneration"`
Duration string `short:"d" help:"If specified, the length of audio to generate in seconds"`
Temperature string `short:"t" help:"If specified, the temperature of the generation"`
InputFile string `short:"i" help:"If specified, the input file to condition generation upon"`
InputFileSampleDivisor string `short:"f" help:"If InputFile and this divisor is specified, the first portion of the sample file will be used"`
DoSample bool `short:"s" default:"true" help:"Enables sampling from the model. Better quality at the cost of speed. Defaults to enabled."`
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
}

func parseToFloat32Ptr(input string) *float32 {
f, err := strconv.ParseFloat(input, 32)
if err != nil {
return nil
}
f2 := float32(f)
return &f2
}

func parseToInt32Ptr(input string) *int32 {
i, err := strconv.ParseInt(input, 10, 32)
if err != nil {
return nil
}
i2 := int32(i)
return &i2
}

func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
outputFile := t.OutputFile
outputDir := t.BackendAssetsPath
if outputFile != "" {
outputDir = filepath.Dir(outputFile)
}

text := strings.Join(t.Text, " ")

externalBackends := make(map[string]string)
// split ":" to get backend name and the uri
for _, v := range t.ExternalGRPCBackends {
backend := v[:strings.IndexByte(v, ':')]
uri := v[strings.IndexByte(v, ':')+1:]
externalBackends[backend] = uri
fmt.Printf("TMP externalBackends[%q]=%q\n\n", backend, uri)
}

opts := &config.ApplicationConfig{
ModelPath: t.ModelsPath,
Context: context.Background(),
AudioDir: outputDir,
AssetsDestination: t.BackendAssetsPath,
ExternalGRPCBackends: externalBackends,
}
ml := model.NewModelLoader(opts.ModelPath)

defer func() {
err := ml.StopAllGRPC()
if err != nil {
log.Error().Err(err).Msg("unable to stop all grpc processes")
}
}()

options := config.BackendConfig{}
options.SetDefaults()
options.Backend = t.Backend

var inputFile *string
if t.InputFile != "" {
inputFile = &t.InputFile
}

filePath, _, err := backend.SoundGeneration(t.Model, text,
parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)

if err != nil {
return err
}
if outputFile != "" {
if err := os.Rename(filePath, outputFile); err != nil {
return err
}
fmt.Printf("Generate file %s\n", outputFile)
} else {
fmt.Printf("Generate file %s\n", filePath)
}
return nil
}

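The ExternalGRPCBackends entries consumed above are expected in name:uri form, split on the first colon. A tiny runnable sketch of the same split; the sample value is made up:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// As in the CLI above: everything before the first ':' is the backend
	// name, the remainder is the gRPC URI (which may itself contain colons).
	v := "musicgen:127.0.0.1:9090"
	name := v[:strings.IndexByte(v, ':')]
	uri := v[strings.IndexByte(v, ':')+1:]
	fmt.Printf("backend=%q uri=%q\n", name, uri) // backend="musicgen" uri="127.0.0.1:9090"
}
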
@@ -15,8 +15,9 @@ import (
)

type UtilCMD struct {
GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
HFScan HFScanCMD `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
HFScan HFScanCMD `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
UsecaseHeuristic UsecaseHeuristicCMD `cmd:"" name:"usecase-heuristic" help:"Checks a specific model config and prints what usecase LocalAI will offer for it."`
}

type GGUFInfoCMD struct {
@@ -30,6 +31,11 @@ type HFScanCMD struct {
ToScan []string `arg:""`
}

type UsecaseHeuristicCMD struct {
ConfigName string `name:"The config file to check"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
}

func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
if u.Args == nil || len(u.Args) == 0 {
return fmt.Errorf("no GGUF file provided")
@@ -99,3 +105,31 @@ func (hfscmd *HFScanCMD) Run(ctx *cliContext.Context) error {
return nil
}
}

func (uhcmd *UsecaseHeuristicCMD) Run(ctx *cliContext.Context) error {
if len(uhcmd.ConfigName) == 0 {
log.Error().Msg("ConfigName is a required parameter")
return fmt.Errorf("config name is a required parameter")
}
if len(uhcmd.ModelsPath) == 0 {
log.Error().Msg("ModelsPath is a required parameter")
return fmt.Errorf("model path is a required parameter")
}
bcl := config.NewBackendConfigLoader(uhcmd.ModelsPath)
err := bcl.LoadBackendConfig(uhcmd.ConfigName)
if err != nil {
log.Error().Err(err).Str("ConfigName", uhcmd.ConfigName).Msg("error while loading backend")
return err
}
bc, exists := bcl.GetBackendConfig(uhcmd.ConfigName)
if !exists {
log.Error().Str("ConfigName", uhcmd.ConfigName).Msg("ConfigName not found")
}
for name, uc := range config.GetAllBackendConfigUsecases() {
if bc.HasUsecases(uc) {
log.Info().Str("Usecase", name)
}
}
log.Info().Msg("---")
return nil
}

@@ -2,6 +2,7 @@ package worker

type WorkerFlags struct {
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
ExtraLLamaCPPArgs string `name:"llama-cpp-args" env:"LOCALAI_EXTRA_LLAMA_CPP_ARGS,EXTRA_LLAMA_CPP_ARGS" help:"Extra arguments to pass to llama-cpp-rpc-server"`
}

type Worker struct {

@@ -3,6 +3,7 @@ package worker
import (
"fmt"
"os"
"strings"
"syscall"

cliContext "github.com/mudler/LocalAI/core/cli/context"
@@ -12,7 +13,6 @@ import (
)

type LLamaCPP struct {
Args []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
WorkerFlags `embed:""`
}

@@ -34,9 +34,8 @@ func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
"llama-cpp-rpc-server",
)

args := os.Args[4:]
args := strings.Split(r.ExtraLLamaCPPArgs, " ")
args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)

args = append([]string{grpcProcess}, args...)
return syscall.Exec(
grpcProcess,

Some files were not shown because too many files have changed in this diff.