mirror of
https://github.com/mudler/LocalAI.git
synced 2026-02-04 11:42:57 -05:00
Compare commits
1 Commits
v2.27.0
...
fix/closed
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
83110891fd |
@@ -7,7 +7,7 @@ services:
|
||||
args:
|
||||
- FFMPEG=true
|
||||
- IMAGE_TYPE=extras
|
||||
- GO_TAGS=p2p tts
|
||||
- GO_TAGS=stablediffusion p2p tts
|
||||
env_file:
|
||||
- ../.env
|
||||
ports:
|
||||
|
||||
15
.env
15
.env
@@ -38,12 +38,12 @@
|
||||
## Uncomment and set to true to enable rebuilding from source
|
||||
# REBUILD=true
|
||||
|
||||
## Enable go tags, available: p2p, tts
|
||||
## p2p: enable distributed inferencing
|
||||
## Enable go tags, available: stablediffusion, tts
|
||||
## stablediffusion: image generation with stablediffusion
|
||||
## tts: enables text-to-speech with go-piper
|
||||
## (requires REBUILD=true)
|
||||
#
|
||||
# GO_TAGS=p2p
|
||||
# GO_TAGS=stablediffusion
|
||||
|
||||
## Path where to store generated images
|
||||
# LOCALAI_IMAGE_PATH=/tmp/generated/images
|
||||
@@ -82,15 +82,6 @@
|
||||
# Enable to allow p2p mode
|
||||
# LOCALAI_P2P=true
|
||||
|
||||
# Enable to use federated mode
|
||||
# LOCALAI_FEDERATED=true
|
||||
|
||||
# Enable to start federation server
|
||||
# FEDERATED_SERVER=true
|
||||
|
||||
# Define to use federation token
|
||||
# TOKEN=""
|
||||
|
||||
### Watchdog settings
|
||||
###
|
||||
# Enables watchdog to kill backends that are inactive for too much time
|
||||
|
||||
1
.gitattributes
vendored
1
.gitattributes
vendored
@@ -1,2 +1 @@
|
||||
*.sh text eol=lf
|
||||
backend/cpp/llama/*.hpp linguist-vendored
|
||||
7
.github/ci/modelslist.go
vendored
7
.github/ci/modelslist.go
vendored
@@ -6,7 +6,6 @@ import (
|
||||
"io/ioutil"
|
||||
"os"
|
||||
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
@@ -280,12 +279,6 @@ func main() {
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure that all arbitrary text content is sanitized before display
|
||||
for i, m := range models {
|
||||
models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
|
||||
models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
|
||||
}
|
||||
|
||||
// render the template
|
||||
data := struct {
|
||||
Models []*GalleryModel
|
||||
|
||||
10
.github/dependabot.yml
vendored
10
.github/dependabot.yml
vendored
@@ -9,8 +9,6 @@ updates:
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
ignore:
|
||||
- dependency-name: "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
- package-ecosystem: "github-actions"
|
||||
# Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
|
||||
directory: "/"
|
||||
@@ -81,6 +79,14 @@ updates:
|
||||
directory: "/backend/python/transformers"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/backend/python/transformers-musicgen"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/backend/python/vall-e-x"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/backend/python/vllm"
|
||||
schedule:
|
||||
|
||||
11
.github/labeler.yml
vendored
11
.github/labeler.yml
vendored
@@ -1,15 +1,6 @@
|
||||
enhancement:
|
||||
enhancements:
|
||||
- head-branch: ['^feature', 'feature']
|
||||
|
||||
dependencies:
|
||||
- any:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: 'Makefile'
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: '*.mod'
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: '*.sum'
|
||||
|
||||
kind/documentation:
|
||||
- any:
|
||||
- changed-files:
|
||||
|
||||
19
.github/workflows/bump_deps.yaml
vendored
19
.github/workflows/bump_deps.yaml
vendored
@@ -9,17 +9,26 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- repository: "ggml-org/llama.cpp"
|
||||
- repository: "ggerganov/llama.cpp"
|
||||
variable: "CPPLLAMA_VERSION"
|
||||
branch: "master"
|
||||
- repository: "go-skynet/go-ggml-transformers.cpp"
|
||||
variable: "GOGGMLTRANSFORMERS_VERSION"
|
||||
branch: "master"
|
||||
- repository: "donomii/go-rwkv.cpp"
|
||||
variable: "RWKV_VERSION"
|
||||
branch: "main"
|
||||
- repository: "ggerganov/whisper.cpp"
|
||||
variable: "WHISPER_CPP_VERSION"
|
||||
branch: "master"
|
||||
- repository: "PABannier/bark.cpp"
|
||||
variable: "BARKCPP_VERSION"
|
||||
- repository: "go-skynet/go-bert.cpp"
|
||||
variable: "BERT_VERSION"
|
||||
branch: "master"
|
||||
- repository: "go-skynet/bloomz.cpp"
|
||||
variable: "BLOOMZ_VERSION"
|
||||
branch: "main"
|
||||
- repository: "leejet/stable-diffusion.cpp"
|
||||
variable: "STABLEDIFFUSION_GGML_VERSION"
|
||||
- repository: "mudler/go-ggllm.cpp"
|
||||
variable: "GOGGLLM_VERSION"
|
||||
branch: "master"
|
||||
- repository: "mudler/go-stable-diffusion"
|
||||
variable: "STABLEDIFFUSION_VERSION"
|
||||
|
||||
2
.github/workflows/checksum_checker.yaml
vendored
2
.github/workflows/checksum_checker.yaml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
sudo pip install --upgrade pip
|
||||
pip install huggingface_hub
|
||||
- name: 'Setup yq'
|
||||
uses: dcarbone/install-yq-action@v1.3.1
|
||||
uses: dcarbone/install-yq-action@v1.1.1
|
||||
with:
|
||||
version: 'v4.44.2'
|
||||
download-compressed: true
|
||||
|
||||
2
.github/workflows/dependabot_auto.yml
vendored
2
.github/workflows/dependabot_auto.yml
vendored
@@ -14,7 +14,7 @@ jobs:
|
||||
steps:
|
||||
- name: Dependabot metadata
|
||||
id: metadata
|
||||
uses: dependabot/fetch-metadata@v2.3.0
|
||||
uses: dependabot/fetch-metadata@v2.2.0
|
||||
with:
|
||||
github-token: "${{ secrets.GITHUB_TOKEN }}"
|
||||
skip-commit-verification: true
|
||||
|
||||
4
.github/workflows/deploy-explorer.yaml
vendored
4
.github/workflows/deploy-explorer.yaml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
run: |
|
||||
CGO_ENABLED=0 make build-api
|
||||
- name: rm
|
||||
uses: appleboy/ssh-action@v1.2.2
|
||||
uses: appleboy/ssh-action@v1.0.3
|
||||
with:
|
||||
host: ${{ secrets.EXPLORER_SSH_HOST }}
|
||||
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
rm: true
|
||||
target: ./local-ai
|
||||
- name: restarting
|
||||
uses: appleboy/ssh-action@v1.2.2
|
||||
uses: appleboy/ssh-action@v1.0.3
|
||||
with:
|
||||
host: ${{ secrets.EXPLORER_SSH_HOST }}
|
||||
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
|
||||
|
||||
9
.github/workflows/generate_grpc_cache.yaml
vendored
9
.github/workflows/generate_grpc_cache.yaml
vendored
@@ -2,10 +2,9 @@ name: 'generate and publish GRPC docker caches'
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
schedule:
|
||||
# daily at midnight
|
||||
- cron: '0 0 * * *'
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
concurrency:
|
||||
group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
|
||||
@@ -17,7 +16,7 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- grpc-base-image: ubuntu:22.04
|
||||
runs-on: 'arc-runner-set'
|
||||
runs-on: 'ubuntu-latest'
|
||||
platforms: 'linux/amd64,linux/arm64'
|
||||
runs-on: ${{matrix.runs-on}}
|
||||
steps:
|
||||
|
||||
2
.github/workflows/generate_intel_image.yaml
vendored
2
.github/workflows/generate_intel_image.yaml
vendored
@@ -15,7 +15,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
|
||||
- base-image: intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04
|
||||
runs-on: 'ubuntu-latest'
|
||||
platforms: 'linux/amd64'
|
||||
runs-on: ${{matrix.runs-on}}
|
||||
|
||||
47
.github/workflows/image.yml
vendored
47
.github/workflows/image.yml
vendored
@@ -280,7 +280,6 @@ jobs:
|
||||
makeflags: ${{ matrix.makeflags }}
|
||||
latest-image: ${{ matrix.latest-image }}
|
||||
latest-image-aio: ${{ matrix.latest-image-aio }}
|
||||
skip-drivers: ${{ matrix.skip-drivers }}
|
||||
secrets:
|
||||
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
@@ -302,7 +301,6 @@ jobs:
|
||||
latest-image: 'latest-cpu'
|
||||
latest-image-aio: 'latest-aio-cpu'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
skip-drivers: 'false'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "11"
|
||||
cuda-minor-version: "7"
|
||||
@@ -314,7 +312,6 @@ jobs:
|
||||
base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
skip-drivers: 'false'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
@@ -326,7 +323,6 @@ jobs:
|
||||
base-image: "ubuntu:22.04"
|
||||
runs-on: 'arc-runner-set'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
skip-drivers: 'false'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "11"
|
||||
cuda-minor-version: "7"
|
||||
@@ -338,7 +334,6 @@ jobs:
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
skip-drivers: 'false'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
@@ -349,7 +344,6 @@ jobs:
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
skip-drivers: 'false'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
- build-type: 'vulkan'
|
||||
platforms: 'linux/amd64'
|
||||
@@ -360,45 +354,4 @@ jobs:
|
||||
image-type: 'core'
|
||||
runs-on: 'arc-runner-set'
|
||||
base-image: "ubuntu:22.04"
|
||||
skip-drivers: 'false'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
gh-runner:
|
||||
uses: ./.github/workflows/image_build.yml
|
||||
with:
|
||||
tag-latest: ${{ matrix.tag-latest }}
|
||||
tag-suffix: ${{ matrix.tag-suffix }}
|
||||
ffmpeg: ${{ matrix.ffmpeg }}
|
||||
image-type: ${{ matrix.image-type }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
cuda-major-version: ${{ matrix.cuda-major-version }}
|
||||
cuda-minor-version: ${{ matrix.cuda-minor-version }}
|
||||
platforms: ${{ matrix.platforms }}
|
||||
runs-on: ${{ matrix.runs-on }}
|
||||
aio: ${{ matrix.aio }}
|
||||
base-image: ${{ matrix.base-image }}
|
||||
grpc-base-image: ${{ matrix.grpc-base-image }}
|
||||
makeflags: ${{ matrix.makeflags }}
|
||||
latest-image: ${{ matrix.latest-image }}
|
||||
latest-image-aio: ${{ matrix.latest-image-aio }}
|
||||
skip-drivers: ${{ matrix.skip-drivers }}
|
||||
secrets:
|
||||
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
|
||||
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
tag-latest: 'false'
|
||||
tag-suffix: '-nvidia-l4t-arm64-core'
|
||||
latest-image: 'latest-nvidia-l4t-arm64-core'
|
||||
ffmpeg: 'true'
|
||||
image-type: 'core'
|
||||
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
makeflags: "--jobs=4 --output-sync=target"
|
||||
skip-drivers: 'true'
|
||||
11
.github/workflows/image_build.yml
vendored
11
.github/workflows/image_build.yml
vendored
@@ -49,10 +49,6 @@ on:
|
||||
description: 'FFMPEG'
|
||||
default: ''
|
||||
type: string
|
||||
skip-drivers:
|
||||
description: 'Skip drivers by default'
|
||||
default: 'false'
|
||||
type: string
|
||||
image-type:
|
||||
description: 'Image type'
|
||||
default: ''
|
||||
@@ -238,7 +234,6 @@ jobs:
|
||||
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
|
||||
GRPC_VERSION=v1.65.0
|
||||
MAKEFLAGS=${{ inputs.makeflags }}
|
||||
SKIP_DRIVERS=${{ inputs.skip-drivers }}
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
cache-from: type=gha
|
||||
@@ -267,7 +262,6 @@ jobs:
|
||||
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
|
||||
GRPC_VERSION=v1.65.0
|
||||
MAKEFLAGS=${{ inputs.makeflags }}
|
||||
SKIP_DRIVERS=${{ inputs.skip-drivers }}
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
cache-from: type=gha
|
||||
@@ -310,11 +304,6 @@ jobs:
|
||||
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
|
||||
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
|
||||
|
||||
- name: Cleanup
|
||||
run: |
|
||||
docker builder prune -f
|
||||
docker system prune --force --volumes --all
|
||||
|
||||
- name: Latest tag
|
||||
# run this on branches, when it is a tag and there is a latest-image defined
|
||||
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
|
||||
|
||||
8
.github/workflows/notify-models.yaml
vendored
8
.github/workflows/notify-models.yaml
vendored
@@ -18,7 +18,7 @@ jobs:
|
||||
with:
|
||||
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
||||
# Check the PR diff using the current branch and the base branch of the PR
|
||||
- uses: GrantBirki/git-diff-action@v2.8.0
|
||||
- uses: GrantBirki/git-diff-action@v2.7.0
|
||||
id: git-diff-action
|
||||
with:
|
||||
json_diff_file_output: diff.json
|
||||
@@ -79,7 +79,7 @@ jobs:
|
||||
args: ${{ steps.summarize.outputs.message }}
|
||||
- name: Setup tmate session if fails
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -99,7 +99,7 @@ jobs:
|
||||
docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
|
||||
until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
|
||||
# Check the PR diff using the current branch and the base branch of the PR
|
||||
- uses: GrantBirki/git-diff-action@v2.8.0
|
||||
- uses: GrantBirki/git-diff-action@v2.7.0
|
||||
id: git-diff-action
|
||||
with:
|
||||
json_diff_file_output: diff.json
|
||||
@@ -161,7 +161,7 @@ jobs:
|
||||
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
|
||||
- name: Setup tmate session if fails
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
43
.github/workflows/release.yaml
vendored
43
.github/workflows/release.yaml
vendored
@@ -123,7 +123,7 @@ jobs:
|
||||
release/*
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -232,12 +232,45 @@ jobs:
|
||||
release/*
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
limit-access-to-actor: true
|
||||
|
||||
build-stablediffusion:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.21.x'
|
||||
cache: false
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||
- name: Build stablediffusion
|
||||
run: |
|
||||
export PATH=$PATH:$GOPATH/bin
|
||||
make backend-assets/grpc/stablediffusion
|
||||
mkdir -p release && cp backend-assets/grpc/stablediffusion release
|
||||
env:
|
||||
GO_TAGS: stablediffusion
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: stablediffusion
|
||||
path: release/
|
||||
- name: Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
with:
|
||||
files: |
|
||||
release/*
|
||||
|
||||
build-macOS-x86_64:
|
||||
runs-on: macos-13
|
||||
@@ -275,7 +308,7 @@ jobs:
|
||||
release/*
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -317,7 +350,7 @@ jobs:
|
||||
release/*
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
2
.github/workflows/secscan.yaml
vendored
2
.github/workflows/secscan.yaml
vendored
@@ -18,7 +18,7 @@ jobs:
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
- name: Run Gosec Security Scanner
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
uses: securego/gosec@v2.22.0
|
||||
uses: securego/gosec@v2.21.4
|
||||
with:
|
||||
# we let the report trigger content trigger a failure using the GitHub Security features.
|
||||
args: '-no-fail -fmt sarif -out results.sarif ./...'
|
||||
|
||||
128
.github/workflows/test-extra.yml
vendored
128
.github/workflows/test-extra.yml
vendored
@@ -35,6 +35,30 @@ jobs:
|
||||
run: |
|
||||
make --jobs=5 --output-sync=target -C backend/python/transformers
|
||||
make --jobs=5 --output-sync=target -C backend/python/transformers test
|
||||
|
||||
tests-sentencetransformers:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential ffmpeg
|
||||
# Install UV
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||
sudo apt-get install -y libopencv-dev
|
||||
pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||
|
||||
- name: Test sentencetransformers
|
||||
run: |
|
||||
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
|
||||
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
|
||||
|
||||
|
||||
tests-rerankers:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
@@ -78,27 +102,71 @@ jobs:
|
||||
make --jobs=5 --output-sync=target -C backend/python/diffusers
|
||||
make --jobs=5 --output-sync=target -C backend/python/diffusers test
|
||||
|
||||
# tests-transformers-musicgen:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential ffmpeg
|
||||
# # Install UV
|
||||
# curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||
# sudo apt-get install -y libopencv-dev
|
||||
# pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||
tests-parler-tts:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential ffmpeg
|
||||
# Install UV
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||
sudo apt-get install -y libopencv-dev
|
||||
pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||
|
||||
# - name: Test transformers-musicgen
|
||||
# run: |
|
||||
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
|
||||
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
|
||||
- name: Test parler-tts
|
||||
run: |
|
||||
make --jobs=5 --output-sync=target -C backend/python/parler-tts
|
||||
make --jobs=5 --output-sync=target -C backend/python/parler-tts test
|
||||
|
||||
tests-openvoice:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential ffmpeg
|
||||
# Install UV
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||
sudo apt-get install -y libopencv-dev
|
||||
pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||
|
||||
- name: Test openvoice
|
||||
run: |
|
||||
make --jobs=5 --output-sync=target -C backend/python/openvoice
|
||||
make --jobs=5 --output-sync=target -C backend/python/openvoice test
|
||||
|
||||
tests-transformers-musicgen:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential ffmpeg
|
||||
# Install UV
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||
sudo apt-get install -y libopencv-dev
|
||||
pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||
|
||||
- name: Test transformers-musicgen
|
||||
run: |
|
||||
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
|
||||
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
|
||||
|
||||
# tests-bark:
|
||||
# runs-on: ubuntu-latest
|
||||
@@ -185,6 +253,26 @@ jobs:
|
||||
# run: |
|
||||
# make --jobs=5 --output-sync=target -C backend/python/vllm
|
||||
# make --jobs=5 --output-sync=target -C backend/python/vllm test
|
||||
tests-vallex:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential ffmpeg
|
||||
# Install UV
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||
sudo apt-get install -y libopencv-dev
|
||||
pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||
- name: Test vall-e-x
|
||||
run: |
|
||||
make --jobs=5 --output-sync=target -C backend/python/vall-e-x
|
||||
make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
|
||||
|
||||
tests-coqui:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
17
.github/workflows/test.yml
vendored
17
.github/workflows/test.yml
vendored
@@ -100,12 +100,15 @@ jobs:
|
||||
# The python3-grpc-tools package in 22.04 is too old
|
||||
pip install --user grpcio-tools
|
||||
|
||||
make -C backend/python/transformers
|
||||
sudo rm -rfv /usr/bin/conda || true
|
||||
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
|
||||
|
||||
# Pre-build piper before we start tests in order to have shared libraries in place
|
||||
make sources/go-piper && \
|
||||
GO_TAGS="tts" make -C sources/go-piper piper.o && \
|
||||
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
|
||||
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
|
||||
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
|
||||
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
|
||||
env:
|
||||
CUDA_VERSION: 12-4
|
||||
- name: Cache grpc
|
||||
@@ -127,10 +130,10 @@ jobs:
|
||||
cd grpc && cd cmake/build && sudo make --jobs 5 install
|
||||
- name: Test
|
||||
run: |
|
||||
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
|
||||
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -194,7 +197,7 @@ jobs:
|
||||
make run-e2e-aio
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -221,7 +224,7 @@ jobs:
|
||||
- name: Dependencies
|
||||
run: |
|
||||
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
|
||||
pip install --user --no-cache-dir grpcio-tools
|
||||
pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||
- name: Test
|
||||
run: |
|
||||
export C_INCLUDE_PATH=/usr/local/include
|
||||
@@ -232,7 +235,7 @@ jobs:
|
||||
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.19
|
||||
uses: mxschmitt/action-tmate@v3.18
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,7 +2,6 @@
|
||||
/sources/
|
||||
__pycache__/
|
||||
*.a
|
||||
*.o
|
||||
get-sources
|
||||
prepare-sources
|
||||
/backend/cpp/llama/grpc-server
|
||||
@@ -13,6 +12,7 @@ prepare-sources
|
||||
|
||||
go-ggml-transformers
|
||||
go-gpt2
|
||||
go-rwkv
|
||||
whisper.cpp
|
||||
/bloomz
|
||||
go-bert
|
||||
|
||||
2
.vscode/launch.json
vendored
2
.vscode/launch.json
vendored
@@ -26,7 +26,7 @@
|
||||
"LOCALAI_P2P": "true",
|
||||
"LOCALAI_FEDERATED": "true"
|
||||
},
|
||||
"buildFlags": ["-tags", "p2p tts", "-v"],
|
||||
"buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"cwd": "${workspaceRoot}"
|
||||
}
|
||||
|
||||
116
Dockerfile
116
Dockerfile
@@ -9,39 +9,25 @@ FROM ${BASE_IMAGE} AS requirements-core
|
||||
USER root
|
||||
|
||||
ARG GO_VERSION=1.22.6
|
||||
ARG CMAKE_VERSION=3.26.4
|
||||
ARG CMAKE_FROM_SOURCE=false
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
||||
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
ccache \
|
||||
ca-certificates \
|
||||
curl libssl-dev \
|
||||
cmake \
|
||||
curl \
|
||||
git \
|
||||
git-lfs \
|
||||
unzip upx-ucl && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install CMake (the version in 22.04 is too old)
|
||||
RUN <<EOT bash
|
||||
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
|
||||
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
|
||||
else
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
EOT
|
||||
|
||||
# Install Go
|
||||
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
|
||||
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
|
||||
@@ -69,10 +55,14 @@ ENV PATH=/opt/rocm/bin:${PATH}
|
||||
# OpenBLAS requirements and stable diffusion
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
libopenblas-dev && \
|
||||
libopenblas-dev \
|
||||
libopencv-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set up OpenCV
|
||||
RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
|
||||
|
||||
WORKDIR /build
|
||||
|
||||
###################################
|
||||
@@ -81,8 +71,7 @@ WORKDIR /build
|
||||
# The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
|
||||
FROM requirements-core AS requirements-extras
|
||||
|
||||
# Install uv as a system package
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
@@ -111,13 +100,12 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
|
||||
ARG BUILD_TYPE
|
||||
ARG CUDA_MAJOR_VERSION=12
|
||||
ARG CUDA_MINOR_VERSION=0
|
||||
ARG SKIP_DRIVERS=false
|
||||
|
||||
ENV BUILD_TYPE=${BUILD_TYPE}
|
||||
|
||||
# Vulkan requirements
|
||||
RUN <<EOT bash
|
||||
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
|
||||
if [ "${BUILD_TYPE}" = "vulkan" ]; then
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
software-properties-common pciutils wget gpg-agent && \
|
||||
@@ -133,7 +121,7 @@ EOT
|
||||
|
||||
# CuBLAS requirements
|
||||
RUN <<EOT bash
|
||||
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
|
||||
if [ "${BUILD_TYPE}" = "cublas" ]; then
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
software-properties-common pciutils
|
||||
@@ -159,7 +147,7 @@ RUN <<EOT bash
|
||||
EOT
|
||||
|
||||
# If we are building with clblas support, we need the libraries for the builds
|
||||
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
|
||||
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
libclblast-dev && \
|
||||
@@ -167,7 +155,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
|
||||
rm -rf /var/lib/apt/lists/* \
|
||||
; fi
|
||||
|
||||
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
|
||||
RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
hipblas-dev \
|
||||
@@ -200,8 +188,6 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
|
||||
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
|
||||
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
|
||||
ARG GRPC_VERSION=v1.65.0
|
||||
ARG CMAKE_FROM_SOURCE=false
|
||||
ARG CMAKE_VERSION=3.26.4
|
||||
|
||||
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
|
||||
|
||||
@@ -210,24 +196,12 @@ WORKDIR /build
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
build-essential curl libssl-dev \
|
||||
build-essential \
|
||||
cmake \
|
||||
git && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install CMake (the version in 22.04 is too old)
|
||||
RUN <<EOT bash
|
||||
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
|
||||
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
|
||||
else
|
||||
apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
fi
|
||||
EOT
|
||||
|
||||
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
|
||||
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
|
||||
# and running make install in the target container
|
||||
@@ -247,7 +221,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
|
||||
|
||||
FROM requirements-drivers AS builder-base
|
||||
|
||||
ARG GO_TAGS="tts p2p"
|
||||
ARG GO_TAGS="stablediffusion tts p2p"
|
||||
ARG GRPC_BACKENDS
|
||||
ARG MAKEFLAGS
|
||||
ARG LD_FLAGS="-s -w"
|
||||
@@ -281,12 +255,35 @@ RUN <<EOT bash
|
||||
fi
|
||||
EOT
|
||||
|
||||
|
||||
###################################
|
||||
###################################
|
||||
|
||||
# This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
|
||||
# In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
|
||||
FROM builder-base AS builder-sd
|
||||
|
||||
# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
|
||||
COPY Makefile .
|
||||
COPY go.mod .
|
||||
COPY go.sum .
|
||||
COPY backend/backend.proto ./backend/backend.proto
|
||||
COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
|
||||
COPY pkg/grpc ./pkg/grpc
|
||||
COPY pkg/stablediffusion ./pkg/stablediffusion
|
||||
RUN git init
|
||||
RUN make sources/go-stable-diffusion
|
||||
RUN touch prepare-sources
|
||||
|
||||
# Actually build the backend
|
||||
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
|
||||
|
||||
###################################
|
||||
###################################
|
||||
|
||||
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
|
||||
# Adjustments to the build process should likely be made here.
|
||||
FROM builder-base AS builder
|
||||
FROM builder-sd AS builder
|
||||
|
||||
# Install the pre-built GRPC
|
||||
COPY --from=grpc /opt/grpc /usr/local
|
||||
@@ -304,7 +301,7 @@ RUN make prepare
|
||||
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
|
||||
## (both will use CUDA or hipblas for the actual computation)
|
||||
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
|
||||
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
|
||||
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
|
||||
else \
|
||||
make build; \
|
||||
fi
|
||||
@@ -326,6 +323,8 @@ ARG FFMPEG
|
||||
|
||||
COPY --from=grpc /opt/grpc /usr/local
|
||||
|
||||
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
|
||||
|
||||
COPY .devcontainer-scripts /.devcontainer-scripts
|
||||
|
||||
# Add FFmpeg
|
||||
@@ -398,28 +397,36 @@ COPY --from=builder /build/local-ai ./
|
||||
# Copy shared libraries for piper
|
||||
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
|
||||
|
||||
# do not let stablediffusion rebuild (requires an older version of absl)
|
||||
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
|
||||
|
||||
# Change the shell to bash so we can use [[ tests below
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
# We try to strike a balance between individual layer size (as that affects total push time) and total image size
|
||||
# Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
|
||||
# Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
|
||||
|
||||
RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
|
||||
apt-get -qq -y install espeak-ng \
|
||||
; fi
|
||||
|
||||
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/coqui \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/faster-whisper \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/parler-tts \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/diffusers \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/transformers-musicgen \
|
||||
; fi
|
||||
|
||||
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/kokoro \
|
||||
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/vall-e-x \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/openvoice \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/sentencetransformers \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/exllama2 \
|
||||
@@ -439,6 +446,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/rerankers \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/mamba \
|
||||
; fi
|
||||
|
||||
# Make sure the models directory exists
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
|
||||
Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
348
Makefile
348
Makefile
@@ -6,27 +6,33 @@ BINARY_NAME=local-ai
|
||||
DETECT_LIBS?=true
|
||||
|
||||
# llama.cpp versions
|
||||
CPPLLAMA_VERSION?=4663bd353c61c1136cd8a97b9908755e4ab30cec
|
||||
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
|
||||
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
|
||||
CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077
|
||||
|
||||
# go-rwkv version
|
||||
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
|
||||
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
|
||||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
|
||||
WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b
|
||||
|
||||
# bert.cpp version
|
||||
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
|
||||
BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
|
||||
|
||||
# go-piper version
|
||||
PIPER_REPO?=https://github.com/mudler/go-piper
|
||||
PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
|
||||
PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
|
||||
|
||||
# bark.cpp
|
||||
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
|
||||
BARKCPP_VERSION?=v1.0.0
|
||||
# stablediffusion version
|
||||
STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
|
||||
STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
|
||||
|
||||
# stablediffusion.cpp (ggml)
|
||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||
STABLEDIFFUSION_GGML_VERSION?=19d876ee300a055629926ff836489901f734f2b7
|
||||
|
||||
ONNX_VERSION?=1.20.0
|
||||
ONNX_ARCH?=x64
|
||||
ONNX_OS?=linux
|
||||
# tinydream version
|
||||
TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
|
||||
TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
|
||||
|
||||
export BUILD_TYPE?=
|
||||
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
|
||||
@@ -39,7 +45,6 @@ CGO_LDFLAGS_WHISPER+=-lggml
|
||||
CUDA_LIBPATH?=/usr/local/cuda/lib64/
|
||||
GO_TAGS?=
|
||||
BUILD_ID?=
|
||||
NATIVE?=false
|
||||
|
||||
TEST_DIR=/tmp/test
|
||||
|
||||
@@ -78,25 +83,7 @@ ifndef UNAME_S
|
||||
UNAME_S := $(shell uname -s)
|
||||
endif
|
||||
|
||||
# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
|
||||
ifeq ($(NATIVE),false)
|
||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||
endif
|
||||
|
||||
# Detect if we are running on arm64
|
||||
ifneq (,$(findstring aarch64,$(shell uname -m)))
|
||||
ONNX_ARCH=aarch64
|
||||
endif
|
||||
|
||||
ifeq ($(OS),Darwin)
|
||||
ONNX_OS=osx
|
||||
ifneq (,$(findstring aarch64,$(shell uname -m)))
|
||||
ONNX_ARCH=arm64
|
||||
else ifneq (,$(findstring arm64,$(shell uname -m)))
|
||||
ONNX_ARCH=arm64
|
||||
else
|
||||
ONNX_ARCH=x86_64
|
||||
endif
|
||||
|
||||
ifeq ($(OSX_SIGNING_IDENTITY),)
|
||||
OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
|
||||
@@ -149,11 +136,12 @@ ifeq ($(BUILD_TYPE),hipblas)
|
||||
LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
|
||||
export CXX=$(ROCM_HOME)/llvm/bin/clang++
|
||||
export CC=$(ROCM_HOME)/llvm/bin/clang
|
||||
# llama-ggml has no hipblas support, so override it here.
|
||||
export STABLE_BUILD_TYPE=
|
||||
export GGML_HIP=1
|
||||
export GGML_HIPBLAS=1
|
||||
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
|
||||
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
|
||||
CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
|
||||
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
|
||||
endif
|
||||
|
||||
@@ -172,6 +160,16 @@ ifeq ($(STATIC),true)
|
||||
LD_FLAGS+=-linkmode external -extldflags -static
|
||||
endif
|
||||
|
||||
ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
|
||||
# OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
|
||||
OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
|
||||
endif
|
||||
|
||||
ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
|
||||
# OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
|
||||
OPTIONAL_GRPC+=backend-assets/grpc/tinydream
|
||||
endif
|
||||
|
||||
ifeq ($(findstring tts,$(GO_TAGS)),tts)
|
||||
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
|
||||
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
|
||||
@@ -181,23 +179,16 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
|
||||
endif
|
||||
|
||||
ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
|
||||
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
|
||||
|
||||
ifeq ($(ONNX_OS),linux)
|
||||
ifeq ($(ONNX_ARCH),x64)
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml
|
||||
endif
|
||||
endif
|
||||
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
|
||||
ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad
|
||||
ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
|
||||
# Use filter-out to remove the specified backends
|
||||
ALL_GRPC_BACKENDS := $(filter-out $(SKIP_GRPC_BACKEND),$(ALL_GRPC_BACKENDS))
|
||||
@@ -218,22 +209,31 @@ endif
|
||||
|
||||
all: help
|
||||
|
||||
## bark.cpp
|
||||
sources/bark.cpp:
|
||||
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
|
||||
cd sources/bark.cpp && \
|
||||
git checkout $(BARKCPP_VERSION) && \
|
||||
## BERT embeddings
|
||||
sources/go-bert.cpp:
|
||||
mkdir -p sources/go-bert.cpp
|
||||
cd sources/go-bert.cpp && \
|
||||
git init && \
|
||||
git remote add origin $(BERT_REPO) && \
|
||||
git fetch origin && \
|
||||
git checkout $(BERT_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
sources/bark.cpp/build/libbark.a: sources/bark.cpp
|
||||
cd sources/bark.cpp && \
|
||||
mkdir -p build && \
|
||||
cd build && \
|
||||
cmake $(CMAKE_ARGS) .. && \
|
||||
cmake --build . --config Release
|
||||
sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
|
||||
$(MAKE) -C sources/go-bert.cpp libgobert.a
|
||||
|
||||
backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a
|
||||
$(MAKE) -C backend/go/bark libbark.a
|
||||
## go-llama.cpp
|
||||
sources/go-llama.cpp:
|
||||
mkdir -p sources/go-llama.cpp
|
||||
cd sources/go-llama.cpp && \
|
||||
git init && \
|
||||
git remote add origin $(GOLLAMA_REPO) && \
|
||||
git fetch origin && \
|
||||
git checkout $(GOLLAMA_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
|
||||
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
|
||||
|
||||
## go-piper
|
||||
sources/go-piper:
|
||||
@@ -248,37 +248,45 @@ sources/go-piper:
|
||||
sources/go-piper/libpiper_binding.a: sources/go-piper
|
||||
$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
|
||||
|
||||
## stablediffusion (ggml)
|
||||
sources/stablediffusion-ggml.cpp:
|
||||
git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
|
||||
cd sources/stablediffusion-ggml.cpp && \
|
||||
git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
|
||||
|
||||
## RWKV
|
||||
sources/go-rwkv.cpp:
|
||||
mkdir -p sources/go-rwkv.cpp
|
||||
cd sources/go-rwkv.cpp && \
|
||||
git init && \
|
||||
git remote add origin $(RWKV_REPO) && \
|
||||
git fetch origin && \
|
||||
git checkout $(RWKV_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
|
||||
$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
|
||||
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
|
||||
sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
|
||||
cd sources/go-rwkv.cpp && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
|
||||
|
||||
backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/stablediffusion-ggml
|
||||
endif
|
||||
## stable diffusion
|
||||
sources/go-stable-diffusion:
|
||||
mkdir -p sources/go-stable-diffusion
|
||||
cd sources/go-stable-diffusion && \
|
||||
git init && \
|
||||
git remote add origin $(STABLEDIFFUSION_REPO) && \
|
||||
git fetch origin && \
|
||||
git checkout $(STABLEDIFFUSION_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
sources/onnxruntime:
|
||||
mkdir -p sources/onnxruntime
|
||||
curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
|
||||
cd sources/onnxruntime && tar -xvf onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz && rm onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
|
||||
cd sources/onnxruntime && mv onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION)/* ./
|
||||
sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
|
||||
CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
|
||||
|
||||
backend-assets/lib/libonnxruntime.so.1: backend-assets/lib sources/onnxruntime
|
||||
cp -rfv sources/onnxruntime/lib/* backend-assets/lib/
|
||||
ifeq ($(OS),Darwin)
|
||||
mv backend-assets/lib/libonnxruntime.$(ONNX_VERSION).dylib backend-assets/lib/libonnxruntime.dylib
|
||||
else
|
||||
mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
|
||||
endif
|
||||
## tiny-dream
|
||||
sources/go-tiny-dream:
|
||||
mkdir -p sources/go-tiny-dream
|
||||
cd sources/go-tiny-dream && \
|
||||
git init && \
|
||||
git remote add origin $(TINYDREAM_REPO) && \
|
||||
git fetch origin && \
|
||||
git checkout $(TINYDREAM_VERSION) && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
|
||||
$(MAKE) -C sources/go-tiny-dream libtinydream.a
|
||||
|
||||
## whisper
|
||||
sources/whisper.cpp:
|
||||
@@ -293,17 +301,27 @@ sources/whisper.cpp:
|
||||
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
|
||||
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
|
||||
|
||||
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
|
||||
get-sources: sources/go-llama.cpp sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
|
||||
|
||||
replace:
|
||||
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
|
||||
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
|
||||
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
|
||||
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
|
||||
$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
|
||||
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
|
||||
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
|
||||
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
|
||||
|
||||
dropreplace:
|
||||
$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
|
||||
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
|
||||
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
|
||||
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
|
||||
$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
|
||||
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
|
||||
$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
|
||||
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
|
||||
|
||||
prepare-sources: get-sources replace
|
||||
$(GOCMD) mod download
|
||||
@@ -311,8 +329,13 @@ prepare-sources: get-sources replace
|
||||
## GENERIC
|
||||
rebuild: ## Rebuilds the project
|
||||
$(GOCMD) clean -cache
|
||||
$(MAKE) -C sources/go-llama.cpp clean
|
||||
$(MAKE) -C sources/go-rwkv.cpp clean
|
||||
$(MAKE) -C sources/whisper.cpp clean
|
||||
$(MAKE) -C sources/go-stable-diffusion clean
|
||||
$(MAKE) -C sources/go-bert.cpp clean
|
||||
$(MAKE) -C sources/go-piper clean
|
||||
$(MAKE) -C sources/go-tiny-dream clean
|
||||
$(MAKE) build
|
||||
|
||||
prepare: prepare-sources $(OPTIONAL_TARGETS)
|
||||
@@ -325,9 +348,7 @@ clean: ## Remove build related file
|
||||
rm -rf release/
|
||||
rm -rf backend-assets/*
|
||||
$(MAKE) -C backend/cpp/grpc clean
|
||||
$(MAKE) -C backend/go/bark clean
|
||||
$(MAKE) -C backend/cpp/llama clean
|
||||
$(MAKE) -C backend/go/image/stablediffusion-ggml clean
|
||||
rm -rf backend/cpp/llama-* || true
|
||||
$(MAKE) dropreplace
|
||||
$(MAKE) protogen-clean
|
||||
@@ -414,10 +435,12 @@ run: prepare ## run local-ai
|
||||
test-models/testmodel.ggml:
|
||||
mkdir test-models
|
||||
mkdir test-dir
|
||||
wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
|
||||
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
|
||||
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
|
||||
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
|
||||
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
|
||||
wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
|
||||
wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
|
||||
cp tests/models_fixtures/* test-models
|
||||
|
||||
prepare-test: grpcs
|
||||
@@ -426,10 +449,11 @@ prepare-test: grpcs
|
||||
|
||||
test: prepare test-models/testmodel.ggml grpcs
|
||||
@echo 'Running tests'
|
||||
export GO_TAGS="tts debug"
|
||||
export GO_TAGS="tts stablediffusion debug"
|
||||
$(MAKE) prepare-test
|
||||
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
|
||||
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
|
||||
$(MAKE) test-llama
|
||||
$(MAKE) test-llama-gguf
|
||||
$(MAKE) test-tts
|
||||
$(MAKE) test-stablediffusion
|
||||
@@ -446,34 +470,38 @@ run-e2e-image:
|
||||
|
||||
run-e2e-aio: protogen-go
|
||||
@echo 'Running e2e AIO tests'
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
|
||||
|
||||
test-e2e:
|
||||
@echo 'Running e2e tests'
|
||||
BUILD_TYPE=$(BUILD_TYPE) \
|
||||
LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
|
||||
|
||||
teardown-e2e:
|
||||
rm -rf $(TEST_DIR) || true
|
||||
docker stop $$(docker ps -q --filter ancestor=localai-tests)
|
||||
|
||||
test-llama: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
|
||||
|
||||
test-llama-gguf: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
|
||||
|
||||
test-tts: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
|
||||
|
||||
test-stablediffusion: prepare-test
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
|
||||
|
||||
test-stores: backend-assets/grpc/local-store
|
||||
mkdir -p tests/integration/backend-assets/grpc
|
||||
cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
|
||||
|
||||
test-container:
|
||||
docker build --target requirements -t local-ai-test-container .
|
||||
@@ -509,10 +537,10 @@ protogen-go-clean:
|
||||
$(RM) bin/*
|
||||
|
||||
.PHONY: protogen-python
|
||||
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
|
||||
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
|
||||
|
||||
.PHONY: protogen-python-clean
|
||||
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
|
||||
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
|
||||
|
||||
.PHONY: autogptq-protogen
|
||||
autogptq-protogen:
|
||||
@@ -546,14 +574,6 @@ diffusers-protogen:
|
||||
diffusers-protogen-clean:
|
||||
$(MAKE) -C backend/python/diffusers protogen-clean
|
||||
|
||||
.PHONY: faster-whisper-protogen
|
||||
faster-whisper-protogen:
|
||||
$(MAKE) -C backend/python/faster-whisper protogen
|
||||
|
||||
.PHONY: faster-whisper-protogen-clean
|
||||
faster-whisper-protogen-clean:
|
||||
$(MAKE) -C backend/python/faster-whisper protogen-clean
|
||||
|
||||
.PHONY: exllama2-protogen
|
||||
exllama2-protogen:
|
||||
$(MAKE) -C backend/python/exllama2 protogen
|
||||
@@ -562,6 +582,14 @@ exllama2-protogen:
|
||||
exllama2-protogen-clean:
|
||||
$(MAKE) -C backend/python/exllama2 protogen-clean
|
||||
|
||||
.PHONY: mamba-protogen
|
||||
mamba-protogen:
|
||||
$(MAKE) -C backend/python/mamba protogen
|
||||
|
||||
.PHONY: mamba-protogen-clean
|
||||
mamba-protogen-clean:
|
||||
$(MAKE) -C backend/python/mamba protogen-clean
|
||||
|
||||
.PHONY: rerankers-protogen
|
||||
rerankers-protogen:
|
||||
$(MAKE) -C backend/python/rerankers protogen
|
||||
@@ -570,6 +598,14 @@ rerankers-protogen:
|
||||
rerankers-protogen-clean:
|
||||
$(MAKE) -C backend/python/rerankers protogen-clean
|
||||
|
||||
.PHONY: sentencetransformers-protogen
|
||||
sentencetransformers-protogen:
|
||||
$(MAKE) -C backend/python/sentencetransformers protogen
|
||||
|
||||
.PHONY: sentencetransformers-protogen-clean
|
||||
sentencetransformers-protogen-clean:
|
||||
$(MAKE) -C backend/python/sentencetransformers protogen-clean
|
||||
|
||||
.PHONY: transformers-protogen
|
||||
transformers-protogen:
|
||||
$(MAKE) -C backend/python/transformers protogen
|
||||
@@ -578,13 +614,37 @@ transformers-protogen:
|
||||
transformers-protogen-clean:
|
||||
$(MAKE) -C backend/python/transformers protogen-clean
|
||||
|
||||
.PHONY: kokoro-protogen
|
||||
kokoro-protogen:
|
||||
$(MAKE) -C backend/python/kokoro protogen
|
||||
.PHONY: parler-tts-protogen
|
||||
parler-tts-protogen:
|
||||
$(MAKE) -C backend/python/parler-tts protogen
|
||||
|
||||
.PHONY: kokoro-protogen-clean
|
||||
kokoro-protogen-clean:
|
||||
$(MAKE) -C backend/python/kokoro protogen-clean
|
||||
.PHONY: parler-tts-protogen-clean
|
||||
parler-tts-protogen-clean:
|
||||
$(MAKE) -C backend/python/parler-tts protogen-clean
|
||||
|
||||
.PHONY: transformers-musicgen-protogen
|
||||
transformers-musicgen-protogen:
|
||||
$(MAKE) -C backend/python/transformers-musicgen protogen
|
||||
|
||||
.PHONY: transformers-musicgen-protogen-clean
|
||||
transformers-musicgen-protogen-clean:
|
||||
$(MAKE) -C backend/python/transformers-musicgen protogen-clean
|
||||
|
||||
.PHONY: vall-e-x-protogen
|
||||
vall-e-x-protogen:
|
||||
$(MAKE) -C backend/python/vall-e-x protogen
|
||||
|
||||
.PHONY: vall-e-x-protogen-clean
|
||||
vall-e-x-protogen-clean:
|
||||
$(MAKE) -C backend/python/vall-e-x protogen-clean
|
||||
|
||||
.PHONY: openvoice-protogen
|
||||
openvoice-protogen:
|
||||
$(MAKE) -C backend/python/openvoice protogen
|
||||
|
||||
.PHONY: openvoice-protogen-clean
|
||||
openvoice-protogen-clean:
|
||||
$(MAKE) -C backend/python/openvoice protogen-clean
|
||||
|
||||
.PHONY: vllm-protogen
|
||||
vllm-protogen:
|
||||
@@ -601,11 +661,15 @@ prepare-extra-conda-environments: protogen-python
|
||||
$(MAKE) -C backend/python/bark
|
||||
$(MAKE) -C backend/python/coqui
|
||||
$(MAKE) -C backend/python/diffusers
|
||||
$(MAKE) -C backend/python/faster-whisper
|
||||
$(MAKE) -C backend/python/vllm
|
||||
$(MAKE) -C backend/python/mamba
|
||||
$(MAKE) -C backend/python/sentencetransformers
|
||||
$(MAKE) -C backend/python/rerankers
|
||||
$(MAKE) -C backend/python/transformers
|
||||
$(MAKE) -C backend/python/kokoro
|
||||
$(MAKE) -C backend/python/transformers-musicgen
|
||||
$(MAKE) -C backend/python/parler-tts
|
||||
$(MAKE) -C backend/python/vall-e-x
|
||||
$(MAKE) -C backend/python/openvoice
|
||||
$(MAKE) -C backend/python/exllama2
|
||||
|
||||
prepare-test-extra: protogen-python
|
||||
@@ -629,6 +693,13 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
|
||||
backend-assets/grpc: protogen-go replace
|
||||
mkdir -p backend-assets/grpc
|
||||
|
||||
backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/bert-embeddings
|
||||
endif
|
||||
|
||||
backend-assets/grpc/huggingface: backend-assets/grpc
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
|
||||
ifneq ($(UPX),)
|
||||
@@ -675,13 +746,6 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
|
||||
cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
|
||||
|
||||
backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
|
||||
cp -rf backend/cpp/llama backend/cpp/llama-avx512
|
||||
$(MAKE) -C backend/cpp/llama-avx512 purge
|
||||
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
|
||||
cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
|
||||
|
||||
backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
|
||||
cp -rf backend/cpp/llama backend/cpp/llama-avx
|
||||
$(MAKE) -C backend/cpp/llama-avx purge
|
||||
@@ -695,6 +759,10 @@ backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/ll
|
||||
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
|
||||
cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
|
||||
# TODO: every binary should have its own folder instead, so can have different metal implementations
|
||||
ifeq ($(BUILD_TYPE),metal)
|
||||
cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
|
||||
endif
|
||||
|
||||
backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
|
||||
cp -rf backend/cpp/llama backend/cpp/llama-cuda
|
||||
@@ -707,7 +775,7 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/lla
|
||||
cp -rf backend/cpp/llama backend/cpp/llama-hipblas
|
||||
$(MAKE) -C backend/cpp/llama-hipblas purge
|
||||
$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
|
||||
BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
|
||||
cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
|
||||
|
||||
backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
|
||||
@@ -735,11 +803,11 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
|
||||
mkdir -p backend-assets/util/
|
||||
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
|
||||
|
||||
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
|
||||
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/bark-cpp
|
||||
$(UPX) backend-assets/grpc/llama-ggml
|
||||
endif
|
||||
|
||||
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
|
||||
@@ -749,11 +817,25 @@ ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/piper
|
||||
endif
|
||||
|
||||
backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
|
||||
backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/silero-vad
|
||||
$(UPX) backend-assets/grpc/rwkv
|
||||
endif
|
||||
|
||||
backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/stablediffusion
|
||||
endif
|
||||
|
||||
backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
|
||||
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
|
||||
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
|
||||
ifneq ($(UPX),)
|
||||
$(UPX) backend-assets/grpc/tinydream
|
||||
endif
|
||||
|
||||
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
|
||||
@@ -809,7 +891,7 @@ docker-aio-all:
|
||||
|
||||
docker-image-intel:
|
||||
docker build \
|
||||
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
|
||||
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
|
||||
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
|
||||
--build-arg GO_TAGS="none" \
|
||||
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
|
||||
@@ -817,7 +899,7 @@ docker-image-intel:
|
||||
|
||||
docker-image-intel-xpu:
|
||||
docker build \
|
||||
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
|
||||
--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
|
||||
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
|
||||
--build-arg GO_TAGS="none" \
|
||||
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
|
||||
@@ -829,7 +911,7 @@ swagger:
|
||||
|
||||
.PHONY: gen-assets
|
||||
gen-assets:
|
||||
$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
|
||||
$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
|
||||
|
||||
## Documentation
|
||||
docs/layouts/_default:
|
||||
|
||||
64
README.md
64
README.md
@@ -38,13 +38,9 @@
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||
</p>
|
||||
|
||||
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
|
||||
>
|
||||
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
|
||||
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/)
|
||||
|
||||
[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)
|
||||
|
||||
@@ -60,47 +56,29 @@ curl https://localai.io/install.sh | sh
|
||||
|
||||
Or run with docker:
|
||||
```bash
|
||||
# CPU only image:
|
||||
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
|
||||
|
||||
# Nvidia GPU:
|
||||
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
|
||||
|
||||
# CPU and GPU image (bigger size):
|
||||
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
|
||||
|
||||
# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
|
||||
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
|
||||
```
|
||||
|
||||
To load models:
|
||||
|
||||
```bash
|
||||
# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
|
||||
local-ai run llama-3.2-1b-instruct:q4_k_m
|
||||
# Start LocalAI with the phi-2 model directly from huggingface
|
||||
local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
|
||||
# Install and run a model from the Ollama OCI registry
|
||||
local-ai run ollama://gemma:2b
|
||||
# Run a model from a configuration file
|
||||
local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
|
||||
# Install and run a model from a standard OCI registry (e.g., Docker Hub)
|
||||
local-ai run oci://localai/phi-2:latest
|
||||
# Alternative images:
|
||||
# - if you have an Nvidia GPU:
|
||||
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
|
||||
# - without preconfigured models
|
||||
# docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
|
||||
# - without preconfigured models for Nvidia GPUs
|
||||
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
|
||||
```
|
||||
|
||||
[💻 Getting started](https://localai.io/basics/getting_started/index.html)
|
||||
|
||||
## 📰 Latest project news
|
||||
|
||||
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
|
||||
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
|
||||
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
|
||||
- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
|
||||
- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
|
||||
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
|
||||
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
|
||||
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
|
||||
- June 2024: 🆕 You can browse now the model gallery without LocalAI! Check out https://models.localai.io
|
||||
- June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
|
||||
- May 2024: 🔥🔥 Decentralized P2P llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
|
||||
- May 2024: 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
|
||||
- May 2024: 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
|
||||
- May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
|
||||
- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
|
||||
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
|
||||
|
||||
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
|
||||
@@ -109,10 +87,12 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
|
||||
|
||||
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
|
||||
- Realtime API https://github.com/mudler/LocalAI/issues/3714
|
||||
- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
|
||||
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
|
||||
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
|
||||
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
|
||||
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
|
||||
- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
|
||||
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
|
||||
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
|
||||
|
||||
@@ -120,10 +100,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
|
||||
|
||||
## 🚀 [Features](https://localai.io/features/)
|
||||
|
||||
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
|
||||
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
|
||||
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
|
||||
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
|
||||
- 🎨 [Image generation](https://localai.io/features/image-generation)
|
||||
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
|
||||
- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
|
||||
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
|
||||
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
|
||||
@@ -131,7 +111,6 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
|
||||
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
|
||||
- 📈 [Reranker API](https://localai.io/features/reranker/)
|
||||
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
|
||||
- 🔊 Voice activity detection (Silero-VAD support)
|
||||
- 🌍 Integrated WebUI!
|
||||
|
||||
## 💻 Usage
|
||||
@@ -154,7 +133,6 @@ Model galleries
|
||||
Other:
|
||||
- Helm chart https://github.com/go-skynet/helm-charts
|
||||
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
|
||||
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
|
||||
- Terminal utility https://github.com/djcopley/ShellOracle
|
||||
- Local Smart assistant https://github.com/mudler/LocalAGI
|
||||
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
|
||||
@@ -162,9 +140,6 @@ Other:
|
||||
- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
|
||||
- Shell-Pilot(Interact with LLM using LocalAI models via pure shell scripts on your Linux or MacOS system) https://github.com/reid41/shell-pilot
|
||||
- Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
|
||||
- Another Telegram Bot https://github.com/JackBekket/Hellper
|
||||
- Auto-documentation https://github.com/JackBekket/Reflexia
|
||||
- Github bot which answer on issues, with code and documentation as context https://github.com/JackBekket/GitHelper
|
||||
- Github Actions: https://github.com/marketplace/actions/start-localai
|
||||
- Examples: https://github.com/mudler/LocalAI/tree/master/examples/
|
||||
|
||||
@@ -212,7 +187,7 @@ A huge thank you to our generous sponsors who support this project covering CI e
|
||||
|
||||
<p align="center">
|
||||
<a href="https://www.spectrocloud.com/" target="blank">
|
||||
<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
|
||||
<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
|
||||
</a>
|
||||
<a href="https://www.premai.io/" target="blank">
|
||||
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
|
||||
@@ -239,6 +214,7 @@ LocalAI couldn't have been built without the help of great software already avai
|
||||
- https://github.com/antimatter15/alpaca.cpp
|
||||
- https://github.com/EdVince/Stable-Diffusion-NCNN
|
||||
- https://github.com/ggerganov/whisper.cpp
|
||||
- https://github.com/saharNooby/rwkv.cpp
|
||||
- https://github.com/rhasspy/piper
|
||||
|
||||
## 🤗 Contributors
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
embeddings: true
|
||||
name: text-embedding-ada-002
|
||||
backend: bert-embeddings
|
||||
parameters:
|
||||
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
|
||||
model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
|
||||
|
||||
usage: |
|
||||
You can test this model with curl like this:
|
||||
|
||||
@@ -1,17 +1,56 @@
|
||||
name: stablediffusion
|
||||
backend: stablediffusion-ggml
|
||||
cfg_scale: 4.5
|
||||
|
||||
options:
|
||||
- sampler:euler
|
||||
backend: stablediffusion
|
||||
parameters:
|
||||
model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
|
||||
step: 25
|
||||
model: stablediffusion_assets
|
||||
|
||||
license: "BSD-3"
|
||||
urls:
|
||||
- https://github.com/EdVince/Stable-Diffusion-NCNN
|
||||
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
|
||||
|
||||
description: |
|
||||
Stable Diffusion in NCNN with c++, supported txt2img and img2img
|
||||
|
||||
download_files:
|
||||
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
|
||||
sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
|
||||
uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
|
||||
- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
|
||||
sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
|
||||
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
|
||||
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
|
||||
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
|
||||
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
|
||||
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
|
||||
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
|
||||
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
|
||||
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
|
||||
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
|
||||
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
|
||||
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
|
||||
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
|
||||
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
|
||||
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
|
||||
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
|
||||
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
|
||||
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
|
||||
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
|
||||
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
|
||||
- filename: "stablediffusion_assets/log_sigmas.bin"
|
||||
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
|
||||
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
|
||||
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
|
||||
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
|
||||
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
|
||||
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
|
||||
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
|
||||
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
|
||||
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
|
||||
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
|
||||
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
|
||||
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
|
||||
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
|
||||
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
|
||||
- filename: "stablediffusion_assets/vocab.txt"
|
||||
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
|
||||
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/images/generations \
|
||||
|
||||
@@ -1,57 +1,101 @@
|
||||
context_size: 8192
|
||||
f16: true
|
||||
function:
|
||||
grammar:
|
||||
no_mixed_free_string: true
|
||||
schema_type: llama3.1 # or JSON is supported too (json)
|
||||
response_regex:
|
||||
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
mmap: true
|
||||
parameters:
|
||||
model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
context_size: 8192
|
||||
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- <|eot_id|>
|
||||
- <|end_of_text|>
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
template:
|
||||
chat: |
|
||||
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
|
||||
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
{{.Input }}
|
||||
<|start_header_id|>assistant<|end_header_id|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
|
||||
{{ if .FunctionCall -}}
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
The Function was executed and the response was:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content -}}
|
||||
{{ else if .FunctionCall -}}
|
||||
{{ range .FunctionCall }}
|
||||
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
|
||||
{{ end }}
|
||||
{{ end -}}
|
||||
<|eot_id|>
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
{{.Content }}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |
|
||||
<|start_header_id|>system<|end_header_id|>
|
||||
You are an expert in composing functions. You are given a question and a set of possible functions.
|
||||
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
|
||||
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
|
||||
If you decide to invoke any of the function(s), you MUST put it in the format as follows:
|
||||
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
|
||||
You SHOULD NOT include any other text in the response.
|
||||
Here is a list of functions in JSON format that you can invoke.
|
||||
{{toJson .Functions}}
|
||||
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
||||
{{.Input}}
|
||||
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
|
||||
download_files:
|
||||
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
|
||||
uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
|
||||
function: |-
|
||||
<|im_start|>system
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
backend: silero-vad
|
||||
name: silero-vad
|
||||
parameters:
|
||||
model: silero-vad.onnx
|
||||
download_files:
|
||||
- filename: silero-vad.onnx
|
||||
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
|
||||
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
|
||||
@@ -1,49 +1,31 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: bakllava-mmproj.gguf
|
||||
parameters:
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
model: bakllava.gguf
|
||||
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input}}
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
ASSISTANT:
|
||||
|
||||
download_files:
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
- filename: bakllava.gguf
|
||||
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
|
||||
- filename: bakllava-mmproj.gguf
|
||||
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
|
||||
@@ -129,7 +129,7 @@ detect_gpu
|
||||
detect_gpu_size
|
||||
|
||||
PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
|
||||
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
|
||||
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
|
||||
|
||||
check_vars
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
embeddings: true
|
||||
name: text-embedding-ada-002
|
||||
backend: sentencetransformers
|
||||
parameters:
|
||||
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
|
||||
model: all-MiniLM-L6-v2
|
||||
|
||||
usage: |
|
||||
You can test this model with curl like this:
|
||||
|
||||
@@ -1,53 +1,101 @@
|
||||
context_size: 4096
|
||||
f16: true
|
||||
function:
|
||||
capture_llm_results:
|
||||
- (?s)<Thought>(.*?)</Thought>
|
||||
grammar:
|
||||
properties_order: name,arguments
|
||||
json_regex_match:
|
||||
- (?s)<Output>(.*?)</Output>
|
||||
replace_llm_results:
|
||||
- key: (?s)<Thought>(.*?)</Thought>
|
||||
value: ""
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
mmap: true
|
||||
parameters:
|
||||
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
context_size: 8192
|
||||
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |
|
||||
function: |-
|
||||
<|im_start|>system
|
||||
You are an AI assistant that executes function calls, and these are the tools at your disposal:
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
<|im_end|>
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
|
||||
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
<|im_start|>assistant
|
||||
@@ -1,8 +0,0 @@
|
||||
backend: silero-vad
|
||||
name: silero-vad
|
||||
parameters:
|
||||
model: silero-vad.onnx
|
||||
download_files:
|
||||
- filename: silero-vad.onnx
|
||||
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
|
||||
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
|
||||
@@ -1,49 +1,35 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: llava-v1.6-7b-mmproj-f16.gguf
|
||||
parameters:
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
temperature: 0.2
|
||||
top_k: 40
|
||||
top_p: 0.95
|
||||
seed: -1
|
||||
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input}}
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
ASSISTANT:
|
||||
|
||||
download_files:
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
- filename: llava-v1.6-7b-mmproj-f16.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
embeddings: true
|
||||
name: text-embedding-ada-002
|
||||
backend: sentencetransformers
|
||||
parameters:
|
||||
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
|
||||
model: all-MiniLM-L6-v2
|
||||
|
||||
usage: |
|
||||
You can test this model with curl like this:
|
||||
|
||||
@@ -1,53 +1,103 @@
|
||||
context_size: 4096
|
||||
f16: true
|
||||
function:
|
||||
capture_llm_results:
|
||||
- (?s)<Thought>(.*?)</Thought>
|
||||
grammar:
|
||||
properties_order: name,arguments
|
||||
json_regex_match:
|
||||
- (?s)<Output>(.*?)</Output>
|
||||
replace_llm_results:
|
||||
- key: (?s)<Thought>(.*?)</Thought>
|
||||
value: ""
|
||||
mmap: true
|
||||
name: gpt-4
|
||||
mmap: false
|
||||
context_size: 8192
|
||||
|
||||
f16: false
|
||||
parameters:
|
||||
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
|
||||
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
|
||||
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "</tool_call>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
|
||||
function:
|
||||
# disable injecting the "answer" tool
|
||||
disable_no_action: true
|
||||
|
||||
grammar:
|
||||
# This allows the grammar to also return messages
|
||||
mixed_mode: true
|
||||
# Suffix to add to the grammar
|
||||
#prefix: '<tool_call>\n'
|
||||
# Force parallel calls in the grammar
|
||||
# parallel_calls: true
|
||||
|
||||
return_name_in_function_response: true
|
||||
# Without grammar uncomment the lines below
|
||||
# Warning: this is relying only on the capability of the
|
||||
# LLM model to generate the correct function call.
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*?)"
|
||||
replace_llm_results:
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
replace_function_results:
|
||||
# Replace everything that is not JSON array or object
|
||||
#
|
||||
- key: '(?s)^[^{\[]*'
|
||||
value: ""
|
||||
- key: '(?s)[^}\]]*$'
|
||||
value: ""
|
||||
- key: "'([^']*?)'"
|
||||
value: "_DQUOTE_${1}_DQUOTE_"
|
||||
- key: '\\"'
|
||||
value: "__TEMP_QUOTE__"
|
||||
- key: "\'"
|
||||
value: "'"
|
||||
- key: "_DQUOTE_"
|
||||
value: '"'
|
||||
- key: "__TEMP_QUOTE__"
|
||||
value: '"'
|
||||
# Drop the scratchpad content from responses
|
||||
- key: "(?s)<scratchpad>.*</scratchpad>"
|
||||
value: ""
|
||||
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |
|
||||
function: |-
|
||||
<|im_start|>system
|
||||
You are an AI assistant that executes function calls, and these are the tools at your disposal:
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
<|im_end|>
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
download_files:
|
||||
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
|
||||
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
|
||||
@@ -1,8 +0,0 @@
|
||||
backend: silero-vad
|
||||
name: silero-vad
|
||||
parameters:
|
||||
model: silero-vad.onnx
|
||||
download_files:
|
||||
- filename: silero-vad.onnx
|
||||
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
|
||||
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
|
||||
@@ -1,50 +1,35 @@
|
||||
backend: llama-cpp
|
||||
context_size: 4096
|
||||
f16: true
|
||||
mmap: true
|
||||
mmproj: minicpm-v-2_6-mmproj-f16.gguf
|
||||
mmap: false
|
||||
f16: false
|
||||
name: gpt-4o
|
||||
|
||||
roles:
|
||||
user: "USER:"
|
||||
assistant: "ASSISTANT:"
|
||||
system: "SYSTEM:"
|
||||
|
||||
mmproj: llava-v1.6-7b-mmproj-f16.gguf
|
||||
parameters:
|
||||
model: minicpm-v-2_6-Q4_K_M.gguf
|
||||
stopwords:
|
||||
- <|im_end|>
|
||||
- <dummy32000>
|
||||
- </s>
|
||||
- <|endoftext|>
|
||||
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
temperature: 0.2
|
||||
top_k: 40
|
||||
top_p: 0.95
|
||||
seed: -1
|
||||
|
||||
template:
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
completion: |
|
||||
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
{{.Input}}
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
For each function call return a json object with function name and arguments
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
|
||||
ASSISTANT:
|
||||
|
||||
download_files:
|
||||
- filename: minicpm-v-2_6-Q4_K_M.gguf
|
||||
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
|
||||
- filename: minicpm-v-2_6-mmproj-f16.gguf
|
||||
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
|
||||
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
|
||||
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
|
||||
- filename: llava-v1.6-7b-mmproj-f16.gguf
|
||||
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
|
||||
|
||||
usage: |
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "gpt-4-vision-preview",
|
||||
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
|
||||
|
||||
@@ -28,8 +28,6 @@ service Backend {
|
||||
rpc Rerank(RerankRequest) returns (RerankResult) {}
|
||||
|
||||
rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
|
||||
|
||||
rpc VAD(VADRequest) returns (VADResponse) {}
|
||||
}
|
||||
|
||||
// Define the empty request
|
||||
@@ -159,12 +157,6 @@ message Reply {
|
||||
bytes message = 1;
|
||||
int32 tokens = 2;
|
||||
int32 prompt_tokens = 3;
|
||||
double timing_prompt_processing = 4;
|
||||
double timing_token_generation = 5;
|
||||
}
|
||||
|
||||
message GrammarTrigger {
|
||||
string word = 1;
|
||||
}
|
||||
|
||||
message ModelOptions {
|
||||
@@ -227,12 +219,6 @@ message ModelOptions {
|
||||
int32 SwapSpace = 53;
|
||||
int32 MaxModelLen = 54;
|
||||
int32 TensorParallelSize = 55;
|
||||
string LoadFormat = 58;
|
||||
bool DisableLogStatus = 66;
|
||||
string DType = 67;
|
||||
int32 LimitImagePerPrompt = 68;
|
||||
int32 LimitVideoPerPrompt = 69;
|
||||
int32 LimitAudioPerPrompt = 70;
|
||||
|
||||
string MMProj = 41;
|
||||
|
||||
@@ -246,18 +232,6 @@ message ModelOptions {
|
||||
|
||||
bool FlashAttention = 56;
|
||||
bool NoKVOffload = 57;
|
||||
|
||||
string ModelPath = 59;
|
||||
|
||||
repeated string LoraAdapters = 60;
|
||||
repeated float LoraScales = 61;
|
||||
|
||||
repeated string Options = 62;
|
||||
|
||||
string CacheTypeKey = 63;
|
||||
string CacheTypeValue = 64;
|
||||
|
||||
repeated GrammarTrigger GrammarTriggers = 65;
|
||||
}
|
||||
|
||||
message Result {
|
||||
@@ -313,19 +287,6 @@ message TTSRequest {
|
||||
optional string language = 5;
|
||||
}
|
||||
|
||||
message VADRequest {
|
||||
repeated float audio = 1;
|
||||
}
|
||||
|
||||
message VADSegment {
|
||||
float start = 1;
|
||||
float end = 2;
|
||||
}
|
||||
|
||||
message VADResponse {
|
||||
repeated VADSegment segments = 1;
|
||||
}
|
||||
|
||||
message SoundGenerationRequest {
|
||||
string text = 1;
|
||||
string model = 2;
|
||||
@@ -361,4 +322,4 @@ message StatusResponse {
|
||||
message Message {
|
||||
string role = 1;
|
||||
string content = 2;
|
||||
}
|
||||
}
|
||||
@@ -22,7 +22,7 @@ else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON
|
||||
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
|
||||
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
|
||||
# But if it's OSX without metal, disable it here
|
||||
else ifeq ($(OS),Darwin)
|
||||
@@ -30,7 +30,9 @@ else ifeq ($(OS),Darwin)
|
||||
CMAKE_ARGS+=-DGGML_METAL=OFF
|
||||
else
|
||||
CMAKE_ARGS+=-DGGML_METAL=ON
|
||||
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
|
||||
# Until this is tested properly, we disable embedded metal file
|
||||
# as we already embed it as part of the LocalAI assets
|
||||
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
|
||||
TARGET+=--target ggml-metal
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -113,7 +113,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
||||
std::string ret;
|
||||
for (; begin != end; ++begin)
|
||||
{
|
||||
ret += common_token_to_piece(ctx, *begin);
|
||||
ret += llama_token_to_piece(ctx, *begin);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@@ -121,7 +121,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
||||
// format incomplete utf-8 multibyte character for output
|
||||
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
||||
{
|
||||
std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
|
||||
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
|
||||
// if the size is 1 and first bit is 1, meaning it's a partial character
|
||||
// (size > 1 meaning it's already a known token)
|
||||
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
||||
@@ -134,32 +134,6 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
|
||||
return out;
|
||||
}
|
||||
|
||||
// Adds an RPC server
|
||||
// https://github.com/ggerganov/llama.cpp/compare/4dbc8b9cb71876e005724f4e8f73a3544646bcf5..3edfa7d3753c29e44b964c0ff424d2ea8d5fdee6
|
||||
static void add_rpc_devices(std::string servers) {
|
||||
auto rpc_servers = string_split<std::string>(servers, ',');
|
||||
if (rpc_servers.empty()) {
|
||||
throw std::invalid_argument("no RPC servers specified");
|
||||
}
|
||||
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
|
||||
if (!rpc_reg) {
|
||||
throw std::invalid_argument("failed to find RPC backend");
|
||||
}
|
||||
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
|
||||
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
|
||||
if (!ggml_backend_rpc_add_device_fn) {
|
||||
throw std::invalid_argument("failed to find RPC device add function");
|
||||
}
|
||||
for (const auto & server : rpc_servers) {
|
||||
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
|
||||
if (dev) {
|
||||
ggml_backend_device_register(dev);
|
||||
} else {
|
||||
throw std::invalid_argument("failed to register RPC device");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// convert a vector of completion_token_output to json
|
||||
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
|
||||
{
|
||||
@@ -229,8 +203,8 @@ struct llama_client_slot
|
||||
std::string stopping_word;
|
||||
|
||||
// sampling
|
||||
struct common_params_sampling sparams;
|
||||
common_sampler *ctx_sampling = nullptr;
|
||||
struct gpt_sampler_params sparams;
|
||||
gpt_sampler *ctx_sampling = nullptr;
|
||||
|
||||
int32_t ga_i = 0; // group-attention state
|
||||
int32_t ga_n = 1; // group-attention factor
|
||||
@@ -283,7 +257,7 @@ struct llama_client_slot
|
||||
images.clear();
|
||||
}
|
||||
|
||||
bool has_budget(common_params &global_params) {
|
||||
bool has_budget(gpt_params &global_params) {
|
||||
if (params.n_predict == -1 && global_params.n_predict == -1)
|
||||
{
|
||||
return true; // limitless
|
||||
@@ -417,48 +391,14 @@ struct llama_metrics {
|
||||
}
|
||||
};
|
||||
|
||||
struct llava_embd_batch {
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_server_context
|
||||
{
|
||||
llama_model *model = nullptr;
|
||||
llama_context *ctx = nullptr;
|
||||
const llama_vocab * vocab = nullptr;
|
||||
|
||||
clip_ctx *clp_ctx = nullptr;
|
||||
|
||||
common_params params;
|
||||
gpt_params params;
|
||||
|
||||
llama_batch batch;
|
||||
|
||||
@@ -466,11 +406,6 @@ struct llama_server_context
|
||||
bool clean_kv_cache = true;
|
||||
bool all_slots_are_idle = false;
|
||||
bool add_bos_token = true;
|
||||
bool has_eos_token = true;
|
||||
bool has_gpu = false;
|
||||
|
||||
bool grammar_lazy = false;
|
||||
std::vector<common_grammar_trigger> grammar_triggers;
|
||||
|
||||
int32_t n_ctx; // total context for all clients / slots
|
||||
|
||||
@@ -506,16 +441,13 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
bool load_model(const common_params ¶ms_)
|
||||
bool load_model(const gpt_params ¶ms_)
|
||||
{
|
||||
params = params_;
|
||||
if (!params.mmproj.empty()) {
|
||||
multimodal = true;
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_init(params.mmproj.c_str(), clip_context_params {
|
||||
/* use_gpu */ has_gpu,
|
||||
/*verbosity=*/ 1,
|
||||
});
|
||||
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
|
||||
return false;
|
||||
@@ -526,9 +458,9 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
common_init_result common_init = common_init_from_params(params);
|
||||
model = common_init.model.release();
|
||||
ctx = common_init.context.release();
|
||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||
model = llama_init.model;
|
||||
ctx = llama_init.context;
|
||||
if (model == nullptr)
|
||||
{
|
||||
LOG_ERR("unable to load model: %s", params.model.c_str());
|
||||
@@ -537,7 +469,7 @@ struct llama_server_context
|
||||
|
||||
if (multimodal) {
|
||||
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
|
||||
const int n_embd_llm = llama_model_n_embd(model);
|
||||
const int n_embd_llm = llama_n_embd(model);
|
||||
if (n_embd_clip != n_embd_llm) {
|
||||
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
|
||||
llama_free(ctx);
|
||||
@@ -546,15 +478,23 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
vocab = llama_model_get_vocab(model);
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_vocab_get_add_bos(vocab);
|
||||
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
add_bos_token = llama_add_bos_token(model);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void validate_model_chat_template(server_params & sparams) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
std::vector<char> buf(1);
|
||||
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
|
||||
if (res < 0) {
|
||||
LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
|
||||
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
|
||||
}
|
||||
}
|
||||
|
||||
llama_client_slot* get_active_slot() {
|
||||
for (llama_client_slot& slot : slots) {
|
||||
// Check if the slot is currently processing
|
||||
@@ -638,12 +578,12 @@ struct llama_server_context
|
||||
std::vector<llama_token> p;
|
||||
if (first)
|
||||
{
|
||||
p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||
}
|
||||
@@ -660,7 +600,7 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
@@ -689,7 +629,7 @@ struct llama_server_context
|
||||
|
||||
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
|
||||
slot_params default_params;
|
||||
common_params_sampling default_sparams;
|
||||
gpt_sampler_params default_sparams;
|
||||
|
||||
slot->params.stream = json_value(data, "stream", false);
|
||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
@@ -697,6 +637,7 @@ struct llama_server_context
|
||||
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
||||
@@ -708,13 +649,12 @@ struct llama_server_context
|
||||
slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
||||
slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
||||
slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
|
||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||
slot->sparams.grammar_triggers = grammar_triggers;
|
||||
slot->sparams.grammar_lazy = grammar_lazy;
|
||||
|
||||
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
|
||||
// Might be better to reject the request with a 400 ?
|
||||
@@ -754,8 +694,8 @@ struct llama_server_context
|
||||
slot->prompt = "";
|
||||
}
|
||||
|
||||
if (json_value(data, "ignore_eos", false) && has_eos_token) {
|
||||
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
|
||||
if (json_value(data, "ignore_eos", false)) {
|
||||
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
|
||||
}
|
||||
/*
|
||||
slot->sparams.penalty_prompt_tokens.clear();
|
||||
@@ -794,13 +734,13 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
slot->sparams.logit_bias.clear();
|
||||
|
||||
const auto &logit_bias = data.find("logit_bias");
|
||||
if (logit_bias != data.end() && logit_bias->is_array())
|
||||
{
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
const int n_vocab = llama_n_vocab(model);
|
||||
for (const auto &el : *logit_bias)
|
||||
{
|
||||
if (el.is_array() && el.size() == 2)
|
||||
@@ -829,7 +769,7 @@ struct llama_server_context
|
||||
}
|
||||
else if (el[0].is_string())
|
||||
{
|
||||
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
|
||||
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||
for (auto tok : toks)
|
||||
{
|
||||
slot->sparams.logit_bias.push_back({tok, bias});
|
||||
@@ -861,7 +801,7 @@ struct llama_server_context
|
||||
sampler_names.emplace_back(name);
|
||||
}
|
||||
}
|
||||
slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
|
||||
slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -945,9 +885,9 @@ struct llama_server_context
|
||||
|
||||
if (slot->ctx_sampling != nullptr)
|
||||
{
|
||||
common_sampler_free(slot->ctx_sampling);
|
||||
gpt_sampler_free(slot->ctx_sampling);
|
||||
}
|
||||
slot->ctx_sampling = common_sampler_init(model, slot->sparams);
|
||||
slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
|
||||
//llama_set_rng_seed(ctx, slot->params.seed);
|
||||
slot->command = LOAD_PROMPT;
|
||||
|
||||
@@ -974,13 +914,13 @@ struct llama_server_context
|
||||
system_tokens.clear();
|
||||
|
||||
if (!system_prompt.empty()) {
|
||||
system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
|
||||
|
||||
common_batch_clear(batch);
|
||||
llama_batch_clear(batch);
|
||||
|
||||
for (int i = 0; i < (int)system_tokens.size(); ++i)
|
||||
{
|
||||
common_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
|
||||
@@ -994,6 +934,7 @@ struct llama_server_context
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
if (llama_decode(ctx, batch_view) != 0)
|
||||
{
|
||||
@@ -1068,7 +1009,7 @@ struct llama_server_context
|
||||
|
||||
bool process_token(completion_token_output &result, llama_client_slot &slot) {
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
const std::string token_str = common_token_to_piece(ctx, result.tok);
|
||||
const std::string token_str = llama_token_to_piece(ctx, result.tok);
|
||||
slot.sampled = result.tok;
|
||||
|
||||
// search stop word and delete it
|
||||
@@ -1159,15 +1100,7 @@ struct llama_server_context
|
||||
slot.has_next_token = false;
|
||||
}
|
||||
|
||||
if (slot.n_past >= slot.n_ctx) {
|
||||
slot.truncated = true;
|
||||
slot.stopped_limit = true;
|
||||
slot.has_next_token = false;
|
||||
|
||||
LOG_VERBOSE("stopped due to running out of context capacity", {});
|
||||
}
|
||||
|
||||
if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
|
||||
if (result.tok == llama_token_eos(model))
|
||||
{
|
||||
slot.stopped_eos = true;
|
||||
slot.has_next_token = false;
|
||||
@@ -1227,7 +1160,7 @@ struct llama_server_context
|
||||
samplers.reserve(slot.sparams.samplers.size());
|
||||
for (const auto & sampler : slot.sparams.samplers)
|
||||
{
|
||||
samplers.emplace_back(common_sampler_type_to_str(sampler));
|
||||
samplers.emplace_back(gpt_sampler_type_to_str(sampler));
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -1241,6 +1174,7 @@ struct llama_server_context
|
||||
{"top_k", slot.sparams.top_k},
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
{"tfs_z", slot.sparams.tfs_z},
|
||||
{"typical_p", slot.sparams.typ_p},
|
||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||
{"repeat_penalty", slot.sparams.penalty_repeat},
|
||||
@@ -1249,12 +1183,13 @@ struct llama_server_context
|
||||
{"mirostat", slot.sparams.mirostat},
|
||||
{"mirostat_tau", slot.sparams.mirostat_tau},
|
||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||
{"penalize_nl", slot.sparams.penalize_nl},
|
||||
{"stop", slot.params.antiprompt},
|
||||
{"n_predict", slot.params.n_predict},
|
||||
{"n_keep", params.n_keep},
|
||||
{"ignore_eos", slot.sparams.ignore_eos},
|
||||
{"stream", slot.params.stream},
|
||||
// {"logit_bias", slot.sparams.logit_bias},
|
||||
// {"logit_bias", slot.sparams.logit_bias},
|
||||
{"n_probs", slot.sparams.n_probs},
|
||||
{"min_keep", slot.sparams.min_keep},
|
||||
{"grammar", slot.sparams.grammar},
|
||||
@@ -1281,7 +1216,7 @@ struct llama_server_context
|
||||
if (slot.sparams.n_probs > 0)
|
||||
{
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
|
||||
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
|
||||
size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
|
||||
size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
|
||||
if (probs_pos < probs_stop_pos)
|
||||
@@ -1333,7 +1268,7 @@ struct llama_server_context
|
||||
std::vector<completion_token_output> probs = {};
|
||||
if (!slot.params.stream && slot.stopped_word)
|
||||
{
|
||||
const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
|
||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
|
||||
probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
|
||||
}
|
||||
else
|
||||
@@ -1354,7 +1289,7 @@ struct llama_server_context
|
||||
queue_results.send(res);
|
||||
}
|
||||
|
||||
void send_embedding(llama_client_slot &slot, const llama_batch & batch)
|
||||
void send_embedding(llama_client_slot &slot)
|
||||
{
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
@@ -1362,7 +1297,7 @@ struct llama_server_context
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
const int n_embd = llama_n_embd(model);
|
||||
if (!params.embedding)
|
||||
{
|
||||
LOG_WARNING("embedding disabled", {
|
||||
@@ -1376,38 +1311,10 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
const float *data = llama_get_embeddings(ctx);
|
||||
std::vector<float> embd_res(n_embd, 0.0f);
|
||||
std::vector<std::vector<float>> embedding;
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
}
|
||||
|
||||
if (embd == NULL) {
|
||||
LOG("failed to get embeddings");
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// normalize only when there is pooling
|
||||
// TODO: configurable
|
||||
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
|
||||
common_embd_normalize(embd, embd_res.data(), n_embd, 2);
|
||||
embedding.push_back(embd_res);
|
||||
} else {
|
||||
embedding.push_back({ embd, embd + n_embd });
|
||||
}
|
||||
}
|
||||
|
||||
// OAI compat
|
||||
std::vector<float> embedding(data, data + n_embd);
|
||||
res.result_json = json
|
||||
{
|
||||
{"embedding", embedding[0] },
|
||||
{"embedding", embedding },
|
||||
};
|
||||
}
|
||||
queue_results.send(res);
|
||||
@@ -1472,6 +1379,7 @@ struct llama_server_context
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
if (llama_decode(ctx, batch_view))
|
||||
{
|
||||
@@ -1489,10 +1397,9 @@ struct llama_server_context
|
||||
n_eval = n_batch;
|
||||
}
|
||||
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
float * embd = img.image_embedding + i * n_embd;
|
||||
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
|
||||
if (llama_decode(ctx, llava_batch.batch))
|
||||
const int n_embd = llama_n_embd(model);
|
||||
llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch_img))
|
||||
{
|
||||
LOG("%s : failed to eval image\n", __func__);
|
||||
return false;
|
||||
@@ -1501,7 +1408,7 @@ struct llama_server_context
|
||||
}
|
||||
image_idx++;
|
||||
|
||||
common_batch_clear(batch);
|
||||
llama_batch_clear(batch);
|
||||
|
||||
// append prefix of next image
|
||||
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
|
||||
@@ -1511,7 +1418,7 @@ struct llama_server_context
|
||||
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
|
||||
for (int i = 0; i < (int) append_tokens.size(); ++i)
|
||||
{
|
||||
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
}
|
||||
@@ -1643,7 +1550,7 @@ struct llama_server_context
|
||||
update_system_prompt();
|
||||
}
|
||||
|
||||
common_batch_clear(batch);
|
||||
llama_batch_clear(batch);
|
||||
|
||||
if (all_slots_are_idle)
|
||||
{
|
||||
@@ -1667,17 +1574,17 @@ struct llama_server_context
|
||||
{
|
||||
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
{
|
||||
// this check is redundant (for good)
|
||||
// we should never get here, because generation should already stopped in process_token()
|
||||
|
||||
// START LOCALAI changes
|
||||
// Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
|
||||
// See: https://github.com/mudler/LocalAI/issues/1333
|
||||
// Context is exhausted, release the slot
|
||||
slot.release();
|
||||
send_final_response(slot);
|
||||
slot.has_next_token = false;
|
||||
LOG_ERROR("context is exhausted, release the slot", {});
|
||||
slot.cache_tokens.clear();
|
||||
slot.n_past = 0;
|
||||
slot.truncated = false;
|
||||
slot.has_next_token = true;
|
||||
LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
|
||||
|
||||
continue;
|
||||
// END LOCALAI changes
|
||||
@@ -1721,7 +1628,7 @@ struct llama_server_context
|
||||
|
||||
// TODO: we always have to take into account the "system_tokens"
|
||||
// this is not great and needs to be improved somehow
|
||||
common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
|
||||
@@ -1770,11 +1677,11 @@ struct llama_server_context
|
||||
suffix_tokens.erase(suffix_tokens.begin());
|
||||
}
|
||||
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_vocab_fim_suf(vocab));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
|
||||
prefix_tokens.push_back(llama_token_middle(model));
|
||||
prompt_tokens = prefix_tokens;
|
||||
}
|
||||
else
|
||||
@@ -1815,7 +1722,7 @@ struct llama_server_context
|
||||
|
||||
if (!slot.params.cache_prompt)
|
||||
{
|
||||
common_sampler_reset(slot.ctx_sampling);
|
||||
gpt_sampler_reset(slot.ctx_sampling);
|
||||
|
||||
slot.n_past = 0;
|
||||
slot.n_past_se = 0;
|
||||
@@ -1827,7 +1734,7 @@ struct llama_server_context
|
||||
// push the prompt into the sampling context (do not apply grammar)
|
||||
for (auto &token : prompt_tokens)
|
||||
{
|
||||
common_sampler_accept(slot.ctx_sampling, token, false);
|
||||
gpt_sampler_accept(slot.ctx_sampling, token, false);
|
||||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
@@ -1919,7 +1826,7 @@ struct llama_server_context
|
||||
ga_i += ga_w/ga_n;
|
||||
}
|
||||
}
|
||||
common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
|
||||
slot_npast++;
|
||||
}
|
||||
|
||||
@@ -1997,6 +1904,7 @@ struct llama_server_context
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
@@ -2028,16 +1936,16 @@ struct llama_server_context
|
||||
// prompt evaluated for embedding
|
||||
if (slot.embedding)
|
||||
{
|
||||
send_embedding(slot, batch_view);
|
||||
send_embedding(slot);
|
||||
slot.release();
|
||||
slot.i_batch = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
completion_token_output result;
|
||||
const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
|
||||
const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
|
||||
|
||||
common_sampler_accept(slot.ctx_sampling, id, true);
|
||||
gpt_sampler_accept(slot.ctx_sampling, id, true);
|
||||
|
||||
slot.n_decoded += 1;
|
||||
if (slot.n_decoded == 1)
|
||||
@@ -2048,7 +1956,7 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
result.tok = id;
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);
|
||||
const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
|
||||
|
||||
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
@@ -2101,7 +2009,7 @@ static json format_partial_response(
|
||||
struct token_translator
|
||||
{
|
||||
llama_context * ctx;
|
||||
std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); }
|
||||
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
|
||||
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
|
||||
};
|
||||
|
||||
@@ -2122,11 +2030,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
|
||||
}
|
||||
|
||||
std::function<void(int)> shutdown_handler;
|
||||
|
||||
inline void signal_handler(int signal) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
inline void signal_handler(int signal) { shutdown_handler(signal); }
|
||||
|
||||
/////////////////////////////////
|
||||
////////////////////////////////
|
||||
@@ -2170,6 +2074,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
// slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||
// slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
// slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
// slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
// slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
// slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
// slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
@@ -2179,6 +2084,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
// slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
||||
// slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
// slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
// slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||
// slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
||||
// slot->params.seed = json_value(data, "seed", default_params.seed);
|
||||
// slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
@@ -2192,6 +2098,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
data["n_predict"] = predict->tokens() == 0 ? -1 : predict->tokens();
|
||||
data["top_k"] = predict->topk();
|
||||
data["top_p"] = predict->topp();
|
||||
data["tfs_z"] = predict->tailfreesamplingz();
|
||||
data["typical_p"] = predict->typicalp();
|
||||
data["temperature"] = predict->temperature();
|
||||
data["repeat_last_n"] = predict->repeat();
|
||||
@@ -2201,6 +2108,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
data["mirostat"] = predict->mirostat();
|
||||
data["mirostat_tau"] = predict->mirostattau();
|
||||
data["mirostat_eta"] = predict->mirostateta();
|
||||
data["penalize_nl"] = predict->penalizenl();
|
||||
data["n_keep"] = predict->nkeep();
|
||||
data["seed"] = predict->seed();
|
||||
data["grammar"] = predict->grammar();
|
||||
@@ -2237,6 +2145,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
// llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
|
||||
// llama.params.sparams.top_k = predict->topk();
|
||||
// llama.params.sparams.top_p = predict->topp();
|
||||
// llama.params.sparams.tfs_z = predict->tailfreesamplingz();
|
||||
// llama.params.sparams.typical_p = predict->typicalp();
|
||||
// llama.params.sparams.penalty_last_n = predict->repeat();
|
||||
// llama.params.sparams.temp = predict->temperature();
|
||||
@@ -2246,6 +2155,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
// llama.params.sparams.mirostat = predict->mirostat();
|
||||
// llama.params.sparams.mirostat_tau = predict->mirostattau();
|
||||
// llama.params.sparams.mirostat_eta = predict->mirostateta();
|
||||
// llama.params.sparams.penalize_nl = predict->penalizenl();
|
||||
// llama.params.n_keep = predict->nkeep();
|
||||
// llama.params.seed = predict->seed();
|
||||
// llama.params.sparams.grammar = predict->grammar();
|
||||
@@ -2292,37 +2202,8 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
||||
// }
|
||||
// }
|
||||
|
||||
const std::vector<ggml_type> kv_cache_types = {
|
||||
GGML_TYPE_F32,
|
||||
GGML_TYPE_F16,
|
||||
GGML_TYPE_BF16,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q4_0,
|
||||
GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_IQ4_NL,
|
||||
GGML_TYPE_Q5_0,
|
||||
GGML_TYPE_Q5_1,
|
||||
};
|
||||
|
||||
static ggml_type kv_cache_type_from_str(const std::string & s) {
|
||||
for (const auto & type : kv_cache_types) {
|
||||
if (ggml_type_name(type) == s) {
|
||||
return type;
|
||||
}
|
||||
}
|
||||
throw std::runtime_error("Unsupported cache type: " + s);
|
||||
}
|
||||
|
||||
static std::string get_all_kv_cache_types() {
|
||||
std::ostringstream msg;
|
||||
for (const auto & type : kv_cache_types) {
|
||||
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
|
||||
}
|
||||
return msg.str();
|
||||
}
|
||||
|
||||
static void params_parse(const backend::ModelOptions* request,
|
||||
common_params & params, llama_server_context &llama) {
|
||||
gpt_params & params) {
|
||||
|
||||
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
|
||||
|
||||
@@ -2334,12 +2215,6 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
}
|
||||
// params.model_alias ??
|
||||
params.model_alias = request->modelfile();
|
||||
if (!request->cachetypekey().empty()) {
|
||||
params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
|
||||
}
|
||||
if (!request->cachetypevalue().empty()) {
|
||||
params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue());
|
||||
}
|
||||
params.n_ctx = request->contextsize();
|
||||
//params.memory_f16 = request->f16memory();
|
||||
params.cpuparams.n_threads = request->threads();
|
||||
@@ -2357,23 +2232,9 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
|
||||
const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
|
||||
if (llama_grpc_servers != NULL) {
|
||||
add_rpc_devices(std::string(llama_grpc_servers));
|
||||
params.rpc_servers = std::string(llama_grpc_servers);
|
||||
}
|
||||
|
||||
// decode options. Options are in form optname:optvale, or if booleans only optname.
|
||||
for (int i = 0; i < request->options_size(); i++) {
|
||||
std::string opt = request->options(i);
|
||||
char *optname = strtok(&opt[0], ":");
|
||||
char *optval = strtok(NULL, ":");
|
||||
if (optval == NULL) {
|
||||
optval = "true";
|
||||
}
|
||||
|
||||
if (!strcmp(optname, "gpu")) {
|
||||
llama.has_gpu = true;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Add yarn
|
||||
|
||||
if (!request->tensorsplit().empty()) {
|
||||
@@ -2412,7 +2273,6 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
params.use_mmap = request->mmap();
|
||||
params.flash_attn = request->flashattention();
|
||||
params.no_kv_offload = request->nokvoffload();
|
||||
params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops)
|
||||
|
||||
params.embedding = request->embeddings();
|
||||
|
||||
@@ -2437,21 +2297,6 @@ static void params_parse(const backend::ModelOptions* request,
|
||||
if ( request->ropefreqscale() != 0.0f ) {
|
||||
params.rope_freq_scale = request->ropefreqscale();
|
||||
}
|
||||
|
||||
if (request->grammartriggers_size() > 0) {
|
||||
LOG_INFO("configuring grammar triggers", {});
|
||||
llama.grammar_lazy = true;
|
||||
for (int i = 0; i < request->grammartriggers_size(); i++) {
|
||||
common_grammar_trigger trigger;
|
||||
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
|
||||
trigger.value = request->grammartriggers(i).word();
|
||||
// trigger.at_start = request->grammartriggers(i).at_start();
|
||||
llama.grammar_triggers.push_back(trigger);
|
||||
LOG_INFO("grammar trigger", {
|
||||
{ "word", trigger.value },
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2466,8 +2311,8 @@ public:
|
||||
|
||||
grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
|
||||
// Implement LoadModel RPC
|
||||
common_params params;
|
||||
params_parse(request, params, llama);
|
||||
gpt_params params;
|
||||
params_parse(request, params);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@@ -2512,13 +2357,6 @@ public:
|
||||
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
|
||||
reply.set_prompt_tokens(tokens_evaluated);
|
||||
|
||||
if (result.result_json.contains("timings")) {
|
||||
double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
|
||||
reply.set_timing_prompt_processing(timing_prompt_processing);
|
||||
double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
|
||||
reply.set_timing_token_generation(timing_token_generation);
|
||||
}
|
||||
|
||||
// Log Request Correlation Id
|
||||
LOG_VERBOSE("correlation:", {
|
||||
{ "id", data["correlation_id"] }
|
||||
@@ -2559,13 +2397,6 @@ public:
|
||||
reply->set_prompt_tokens(tokens_evaluated);
|
||||
reply->set_tokens(tokens_predicted);
|
||||
reply->set_message(completion_text);
|
||||
|
||||
if (result.result_json.contains("timings")) {
|
||||
double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
|
||||
reply->set_timing_prompt_processing(timing_prompt_processing);
|
||||
double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
|
||||
reply->set_timing_token_generation(timing_token_generation);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -2600,18 +2431,6 @@ public:
|
||||
return grpc::Status::OK;
|
||||
}
|
||||
|
||||
grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
|
||||
json data = parse_options(false, request, llama);
|
||||
|
||||
std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
|
||||
|
||||
for (int i=0 ; i< tokens.size(); i++){
|
||||
response->add_tokens(tokens[i]);
|
||||
}
|
||||
|
||||
return grpc::Status::OK;
|
||||
}
|
||||
|
||||
grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
|
||||
llama_client_slot* active_slot = llama.get_active_slot();
|
||||
|
||||
@@ -2653,20 +2472,6 @@ void RunServer(const std::string& server_address) {
|
||||
int main(int argc, char** argv) {
|
||||
std::string server_address("localhost:50051");
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = signal_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
sigaction(SIGTERM, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
// Define long and short options
|
||||
struct option long_options[] = {
|
||||
{"addr", required_argument, nullptr, 'a'},
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index 3cd0d2fa..6c5e811a 100644
|
||||
index 342042ff..224db9b5 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
- patches_data[i] = i + 1;
|
||||
+ patches_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
||||
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
- patches_data[i] = i + 1;
|
||||
+ patches_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
||||
@@ -1,25 +0,0 @@
|
||||
INCLUDE_PATH := $(abspath ./)
|
||||
LIBRARY_PATH := $(abspath ./)
|
||||
|
||||
AR?=ar
|
||||
|
||||
BUILD_TYPE?=
|
||||
# keep standard at C11 and C++11
|
||||
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
|
||||
LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
|
||||
|
||||
# warnings
|
||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
|
||||
|
||||
gobark.o:
|
||||
$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
|
||||
|
||||
libbark.a: gobark.o
|
||||
cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
|
||||
$(AR) rcs libbark.a gobark.o
|
||||
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
|
||||
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
|
||||
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
|
||||
|
||||
clean:
|
||||
rm -f gobark.o libbark.a
|
||||
@@ -1,85 +0,0 @@
|
||||
#include <iostream>
|
||||
#include <tuple>
|
||||
|
||||
#include "bark.h"
|
||||
#include "gobark.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
struct bark_context *c;
|
||||
|
||||
void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
|
||||
if (step == bark_encoding_step::SEMANTIC) {
|
||||
printf("\rGenerating semantic tokens... %d%%", progress);
|
||||
} else if (step == bark_encoding_step::COARSE) {
|
||||
printf("\rGenerating coarse tokens... %d%%", progress);
|
||||
} else if (step == bark_encoding_step::FINE) {
|
||||
printf("\rGenerating fine tokens... %d%%", progress);
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
int load_model(char *model) {
|
||||
// initialize bark context
|
||||
struct bark_context_params ctx_params = bark_context_default_params();
|
||||
bark_params params;
|
||||
|
||||
params.model_path = model;
|
||||
|
||||
// ctx_params.verbosity = verbosity;
|
||||
ctx_params.progress_callback = bark_print_progress_callback;
|
||||
ctx_params.progress_callback_user_data = nullptr;
|
||||
|
||||
struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
|
||||
if (!bctx) {
|
||||
fprintf(stderr, "%s: Could not load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
c = bctx;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int tts(char *text,int threads, char *dst ) {
|
||||
|
||||
ggml_time_init();
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
|
||||
// generate audio
|
||||
if (!bark_generate_audio(c, text, threads)) {
|
||||
fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const float *audio_data = bark_get_audio_data(c);
|
||||
if (audio_data == NULL) {
|
||||
fprintf(stderr, "%s: Could not get audio data\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int audio_arr_size = bark_get_audio_data_size(c);
|
||||
|
||||
std::vector<float> audio_arr(audio_data, audio_data + audio_arr_size);
|
||||
|
||||
write_wav_on_disk(audio_arr, dst);
|
||||
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
const int64_t t_load_us = bark_get_load_time(c);
|
||||
const int64_t t_eval_us = bark_get_eval_time(c);
|
||||
|
||||
printf("\n\n");
|
||||
printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
|
||||
printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f);
|
||||
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int unload() {
|
||||
bark_free(c);
|
||||
}
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
package main
|
||||
|
||||
// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
|
||||
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
|
||||
// #include <gobark.h>
|
||||
// #include <stdlib.h>
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unsafe"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
type Bark struct {
|
||||
base.SingleThread
|
||||
threads int
|
||||
}
|
||||
|
||||
func (sd *Bark) Load(opts *pb.ModelOptions) error {
|
||||
|
||||
sd.threads = int(opts.Threads)
|
||||
|
||||
modelFile := C.CString(opts.ModelFile)
|
||||
defer C.free(unsafe.Pointer(modelFile))
|
||||
|
||||
ret := C.load_model(modelFile)
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("inference failed")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (sd *Bark) TTS(opts *pb.TTSRequest) error {
|
||||
t := C.CString(opts.Text)
|
||||
defer C.free(unsafe.Pointer(t))
|
||||
|
||||
dst := C.CString(opts.Dst)
|
||||
defer C.free(unsafe.Pointer(dst))
|
||||
|
||||
threads := C.int(sd.threads)
|
||||
|
||||
ret := C.tts(t, threads, dst)
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("inference failed")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -1,8 +0,0 @@
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
int load_model(char *model);
|
||||
int tts(char *text,int threads, char *dst );
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,96 +0,0 @@
|
||||
INCLUDE_PATH := $(abspath ./)
|
||||
LIBRARY_PATH := $(abspath ./)
|
||||
|
||||
AR?=ar
|
||||
CMAKE_ARGS?=
|
||||
BUILD_TYPE?=
|
||||
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
|
||||
# keep standard at C11 and C++11
|
||||
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
|
||||
|
||||
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DGGML_CUDA=ON
|
||||
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
# to CMAKE_ARGS automatically
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON
|
||||
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
|
||||
# But if it's OSX without metal, disable it here
|
||||
else ifeq ($(OS),Darwin)
|
||||
ifneq ($(BUILD_TYPE),metal)
|
||||
CMAKE_ARGS+=-DGGML_METAL=OFF
|
||||
else
|
||||
CMAKE_ARGS+=-DGGML_METAL=ON
|
||||
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
|
||||
TARGET+=--target ggml-metal
|
||||
endif
|
||||
endif
|
||||
|
||||
# ifeq ($(BUILD_TYPE),sycl_f16)
|
||||
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
|
||||
# endif
|
||||
|
||||
# ifeq ($(BUILD_TYPE),sycl_f32)
|
||||
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
|
||||
# endif
|
||||
|
||||
# warnings
|
||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
|
||||
|
||||
# Find all .a archives in ARCHIVE_DIR
|
||||
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
|
||||
GGML_ARCHIVE_DIR := build/ggml/src/
|
||||
ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
|
||||
|
||||
# Name of the single merged library
|
||||
COMBINED_LIB := libggmlall.a
|
||||
|
||||
# Rule to merge all the .a files into one
|
||||
$(COMBINED_LIB): $(ALL_ARCHIVES)
|
||||
@echo "Merging all .a into $(COMBINED_LIB)"
|
||||
rm -f $@
|
||||
mkdir -p merge-tmp
|
||||
for a in $(ALL_ARCHIVES); do \
|
||||
( cd merge-tmp && ar x ../$$a ); \
|
||||
done
|
||||
( cd merge-tmp && ar rcs ../$@ *.o )
|
||||
# Ensure we have a proper index
|
||||
ranlib $@
|
||||
# Clean up
|
||||
rm -rf merge-tmp
|
||||
|
||||
build/libstable-diffusion.a:
|
||||
@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
|
||||
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
|
||||
+bash -c "source $(ONEAPI_VARS); \
|
||||
mkdir -p build && \
|
||||
cd build && \
|
||||
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
|
||||
cmake --build . --config Release"
|
||||
else
|
||||
mkdir -p build && \
|
||||
cd build && \
|
||||
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
|
||||
cmake --build . --config Release
|
||||
endif
|
||||
$(MAKE) $(COMBINED_LIB)
|
||||
|
||||
gosd.o:
|
||||
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
|
||||
|
||||
libsd.a: gosd.o
|
||||
cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
|
||||
$(AR) rcs libsd.a gosd.o
|
||||
|
||||
clean:
|
||||
rm -rf gosd.o libsd.a build $(COMBINED_LIB)
|
||||
@@ -1,231 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "gosd.h"
|
||||
|
||||
// #include "preprocessing.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "stable-diffusion.h"
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#define STB_IMAGE_STATIC
|
||||
#include "stb_image.h"
|
||||
|
||||
#define STB_IMAGE_WRITE_IMPLEMENTATION
|
||||
#define STB_IMAGE_WRITE_STATIC
|
||||
#include "stb_image_write.h"
|
||||
|
||||
#define STB_IMAGE_RESIZE_IMPLEMENTATION
|
||||
#define STB_IMAGE_RESIZE_STATIC
|
||||
#include "stb_image_resize.h"
|
||||
|
||||
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
|
||||
const char* sample_method_str[] = {
|
||||
"euler_a",
|
||||
"euler",
|
||||
"heun",
|
||||
"dpm2",
|
||||
"dpm++2s_a",
|
||||
"dpm++2m",
|
||||
"dpm++2mv2",
|
||||
"ipndm",
|
||||
"ipndm_v",
|
||||
"lcm",
|
||||
"ddim_trailing",
|
||||
"tcd",
|
||||
};
|
||||
|
||||
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
|
||||
const char* schedule_str[] = {
|
||||
"default",
|
||||
"discrete",
|
||||
"karras",
|
||||
"exponential",
|
||||
"ays",
|
||||
"gits",
|
||||
};
|
||||
|
||||
sd_ctx_t* sd_c;
|
||||
|
||||
sample_method_t sample_method;
|
||||
|
||||
int load_model(char *model, char* options[], int threads, int diff) {
|
||||
fprintf (stderr, "Loading model!\n");
|
||||
|
||||
char *stableDiffusionModel = "";
|
||||
if (diff == 1 ) {
|
||||
stableDiffusionModel = model;
|
||||
model = "";
|
||||
}
|
||||
|
||||
// decode options. Options are in form optname:optvale, or if booleans only optname.
|
||||
char *clip_l_path = "";
|
||||
char *clip_g_path = "";
|
||||
char *t5xxl_path = "";
|
||||
char *vae_path = "";
|
||||
char *scheduler = "";
|
||||
char *sampler = "";
|
||||
|
||||
// If options is not NULL, parse options
|
||||
for (int i = 0; options[i] != NULL; i++) {
|
||||
char *optname = strtok(options[i], ":");
|
||||
char *optval = strtok(NULL, ":");
|
||||
if (optval == NULL) {
|
||||
optval = "true";
|
||||
}
|
||||
|
||||
if (!strcmp(optname, "clip_l_path")) {
|
||||
clip_l_path = optval;
|
||||
}
|
||||
if (!strcmp(optname, "clip_g_path")) {
|
||||
clip_g_path = optval;
|
||||
}
|
||||
if (!strcmp(optname, "t5xxl_path")) {
|
||||
t5xxl_path = optval;
|
||||
}
|
||||
if (!strcmp(optname, "vae_path")) {
|
||||
vae_path = optval;
|
||||
}
|
||||
if (!strcmp(optname, "scheduler")) {
|
||||
scheduler = optval;
|
||||
}
|
||||
if (!strcmp(optname, "sampler")) {
|
||||
sampler = optval;
|
||||
}
|
||||
}
|
||||
|
||||
int sample_method_found = -1;
|
||||
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
|
||||
if (!strcmp(sampler, sample_method_str[m])) {
|
||||
sample_method_found = m;
|
||||
}
|
||||
}
|
||||
if (sample_method_found == -1) {
|
||||
fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
|
||||
sample_method_found = EULER_A;
|
||||
}
|
||||
sample_method = (sample_method_t)sample_method_found;
|
||||
|
||||
int schedule_found = -1;
|
||||
for (int d = 0; d < N_SCHEDULES; d++) {
|
||||
if (!strcmp(scheduler, schedule_str[d])) {
|
||||
schedule_found = d;
|
||||
fprintf (stderr, "Found scheduler: %s\n", scheduler);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (schedule_found == -1) {
|
||||
fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
|
||||
schedule_found = DEFAULT;
|
||||
}
|
||||
|
||||
schedule_t schedule = (schedule_t)schedule_found;
|
||||
|
||||
fprintf (stderr, "Creating context\n");
|
||||
sd_ctx_t* sd_ctx = new_sd_ctx(model,
|
||||
clip_l_path,
|
||||
clip_g_path,
|
||||
t5xxl_path,
|
||||
stableDiffusionModel,
|
||||
vae_path,
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
threads,
|
||||
SD_TYPE_COUNT,
|
||||
STD_DEFAULT_RNG,
|
||||
schedule,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false);
|
||||
|
||||
if (sd_ctx == NULL) {
|
||||
fprintf (stderr, "failed loading model (generic error)\n");
|
||||
return 1;
|
||||
}
|
||||
fprintf (stderr, "Created context: OK\n");
|
||||
|
||||
sd_c = sd_ctx;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) {
|
||||
|
||||
sd_image_t* results;
|
||||
|
||||
std::vector<int> skip_layers = {7, 8, 9};
|
||||
|
||||
fprintf (stderr, "Generating image\n");
|
||||
|
||||
results = txt2img(sd_c,
|
||||
text,
|
||||
negativeText,
|
||||
-1, //clip_skip
|
||||
cfg_scale, // sfg_scale
|
||||
3.5f,
|
||||
0, // eta
|
||||
width,
|
||||
height,
|
||||
sample_method,
|
||||
steps,
|
||||
seed,
|
||||
1,
|
||||
NULL,
|
||||
0.9f,
|
||||
20.f,
|
||||
false,
|
||||
"",
|
||||
skip_layers.data(),
|
||||
skip_layers.size(),
|
||||
0,
|
||||
0.01,
|
||||
0.2);
|
||||
|
||||
if (results == NULL) {
|
||||
fprintf (stderr, "NO results\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (results[0].data == NULL) {
|
||||
fprintf (stderr, "Results with no data\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf (stderr, "Writing PNG\n");
|
||||
|
||||
fprintf (stderr, "DST: %s\n", dst);
|
||||
fprintf (stderr, "Width: %d\n", results[0].width);
|
||||
fprintf (stderr, "Height: %d\n", results[0].height);
|
||||
fprintf (stderr, "Channel: %d\n", results[0].channel);
|
||||
fprintf (stderr, "Data: %p\n", results[0].data);
|
||||
|
||||
stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
|
||||
results[0].data, 0, NULL);
|
||||
fprintf (stderr, "Saved resulting image to '%s'\n", dst);
|
||||
|
||||
// TODO: free results. Why does it crash?
|
||||
|
||||
free(results[0].data);
|
||||
results[0].data = NULL;
|
||||
free(results);
|
||||
fprintf (stderr, "gen_image is done", dst);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int unload() {
|
||||
free_sd_ctx(sd_c);
|
||||
}
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
package main
|
||||
|
||||
// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
|
||||
// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
|
||||
// #include <gosd.h>
|
||||
// #include <stdlib.h>
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"unsafe"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
)
|
||||
|
||||
type SDGGML struct {
|
||||
base.SingleThread
|
||||
threads int
|
||||
sampleMethod string
|
||||
cfgScale float32
|
||||
}
|
||||
|
||||
func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
|
||||
|
||||
sd.threads = int(opts.Threads)
|
||||
|
||||
modelFile := C.CString(opts.ModelFile)
|
||||
defer C.free(unsafe.Pointer(modelFile))
|
||||
|
||||
var options **C.char
|
||||
// prepare the options array to pass to C
|
||||
|
||||
size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
|
||||
length := C.size_t(len(opts.Options))
|
||||
options = (**C.char)(C.malloc(length * size))
|
||||
view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options):len(opts.Options)]
|
||||
|
||||
var diffusionModel int
|
||||
|
||||
var oo []string
|
||||
for _, op := range opts.Options {
|
||||
if op == "diffusion_model" {
|
||||
diffusionModel = 1
|
||||
continue
|
||||
}
|
||||
|
||||
// If it's an option path, we resolve absolute path from the model path
|
||||
if strings.Contains(op, ":") && strings.Contains(op, "path") {
|
||||
data := strings.Split(op, ":")
|
||||
data[1] = filepath.Join(opts.ModelPath, data[1])
|
||||
if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
|
||||
oo = append(oo, strings.Join(data, ":"))
|
||||
}
|
||||
} else {
|
||||
oo = append(oo, op)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
|
||||
|
||||
for i, x := range oo {
|
||||
view[i] = C.CString(x)
|
||||
}
|
||||
|
||||
sd.cfgScale = opts.CFGScale
|
||||
|
||||
ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("could not load model")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
|
||||
t := C.CString(opts.PositivePrompt)
|
||||
defer C.free(unsafe.Pointer(t))
|
||||
|
||||
dst := C.CString(opts.Dst)
|
||||
defer C.free(unsafe.Pointer(dst))
|
||||
|
||||
negative := C.CString(opts.NegativePrompt)
|
||||
defer C.free(unsafe.Pointer(negative))
|
||||
|
||||
ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale))
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("inference failed")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -1,8 +0,0 @@
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
int load_model(char *model, char* options[], int threads, int diffusionModel);
|
||||
int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,6 +1,7 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
@@ -14,7 +15,7 @@ var (
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &Bark{}); err != nil {
|
||||
if err := grpc.StartServer(*addr, &Image{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
33
backend/go/image/stablediffusion/stablediffusion.go
Normal file
33
backend/go/image/stablediffusion/stablediffusion.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/stablediffusion"
|
||||
)
|
||||
|
||||
type Image struct {
|
||||
base.SingleThread
|
||||
stablediffusion *stablediffusion.StableDiffusion
|
||||
}
|
||||
|
||||
func (image *Image) Load(opts *pb.ModelOptions) error {
|
||||
var err error
|
||||
// Note: the Model here is a path to a directory containing the model files
|
||||
image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
|
||||
return err
|
||||
}
|
||||
|
||||
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
|
||||
return image.stablediffusion.GenerateImage(
|
||||
int(opts.Height),
|
||||
int(opts.Width),
|
||||
int(opts.Mode),
|
||||
int(opts.Step),
|
||||
int(opts.Seed),
|
||||
opts.PositivePrompt,
|
||||
opts.NegativePrompt,
|
||||
opts.Dst)
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
@@ -14,7 +15,7 @@ var (
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &SDGGML{}); err != nil {
|
||||
if err := grpc.StartServer(*addr, &Image{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
32
backend/go/image/tinydream/tinydream.go
Normal file
32
backend/go/image/tinydream/tinydream.go
Normal file
@@ -0,0 +1,32 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/tinydream"
|
||||
)
|
||||
|
||||
type Image struct {
|
||||
base.SingleThread
|
||||
tinydream *tinydream.TinyDream
|
||||
}
|
||||
|
||||
func (image *Image) Load(opts *pb.ModelOptions) error {
|
||||
var err error
|
||||
// Note: the Model here is a path to a directory containing the model files
|
||||
image.tinydream, err = tinydream.New(opts.ModelFile)
|
||||
return err
|
||||
}
|
||||
|
||||
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
|
||||
return image.tinydream.GenerateImage(
|
||||
int(opts.Height),
|
||||
int(opts.Width),
|
||||
int(opts.Step),
|
||||
int(opts.Seed),
|
||||
opts.PositivePrompt,
|
||||
opts.NegativePrompt,
|
||||
opts.Dst)
|
||||
}
|
||||
34
backend/go/llm/bert/bert.go
Normal file
34
backend/go/llm/bert/bert.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
bert "github.com/go-skynet/go-bert.cpp"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
type Embeddings struct {
|
||||
base.SingleThread
|
||||
bert *bert.Bert
|
||||
}
|
||||
|
||||
func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
|
||||
model, err := bert.New(opts.ModelFile)
|
||||
llm.bert = model
|
||||
return err
|
||||
}
|
||||
|
||||
func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
|
||||
|
||||
if len(opts.EmbeddingTokens) > 0 {
|
||||
tokens := []int{}
|
||||
for _, t := range opts.EmbeddingTokens {
|
||||
tokens = append(tokens, int(t))
|
||||
}
|
||||
return llm.bert.TokenEmbeddings(tokens, bert.SetThreads(int(opts.Threads)))
|
||||
}
|
||||
|
||||
return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads)))
|
||||
}
|
||||
21
backend/go/llm/bert/main.go
Normal file
21
backend/go/llm/bert/main.go
Normal file
@@ -0,0 +1,21 @@
|
||||
package main
|
||||
|
||||
// Note: this is started internally by LocalAI and a server is allocated for each model
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
204
backend/go/llm/llama-ggml/llama.go
Normal file
204
backend/go/llm/llama-ggml/llama.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/go-skynet/go-llama.cpp"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
type LLM struct {
|
||||
base.SingleThread
|
||||
|
||||
llama *llama.LLama
|
||||
}
|
||||
|
||||
func (llm *LLM) Load(opts *pb.ModelOptions) error {
|
||||
ropeFreqBase := float32(10000)
|
||||
ropeFreqScale := float32(1)
|
||||
|
||||
if opts.RopeFreqBase != 0 {
|
||||
ropeFreqBase = opts.RopeFreqBase
|
||||
}
|
||||
if opts.RopeFreqScale != 0 {
|
||||
ropeFreqScale = opts.RopeFreqScale
|
||||
}
|
||||
|
||||
llamaOpts := []llama.ModelOption{
|
||||
llama.WithRopeFreqBase(ropeFreqBase),
|
||||
llama.WithRopeFreqScale(ropeFreqScale),
|
||||
}
|
||||
|
||||
if opts.NGQA != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
|
||||
}
|
||||
|
||||
if opts.RMSNormEps != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
|
||||
}
|
||||
|
||||
if opts.ContextSize != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
|
||||
}
|
||||
if opts.F16Memory {
|
||||
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
|
||||
}
|
||||
if opts.Embeddings {
|
||||
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
|
||||
}
|
||||
if opts.NGPULayers != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
|
||||
}
|
||||
|
||||
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
|
||||
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
|
||||
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
|
||||
if opts.NBatch != 0 {
|
||||
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
|
||||
} else {
|
||||
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
|
||||
}
|
||||
|
||||
if opts.NUMA {
|
||||
llamaOpts = append(llamaOpts, llama.EnableNUMA)
|
||||
}
|
||||
|
||||
if opts.LowVRAM {
|
||||
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
|
||||
}
|
||||
|
||||
model, err := llama.New(opts.ModelFile, llamaOpts...)
|
||||
llm.llama = model
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
|
||||
ropeFreqBase := float32(10000)
|
||||
ropeFreqScale := float32(1)
|
||||
|
||||
if opts.RopeFreqBase != 0 {
|
||||
ropeFreqBase = opts.RopeFreqBase
|
||||
}
|
||||
if opts.RopeFreqScale != 0 {
|
||||
ropeFreqScale = opts.RopeFreqScale
|
||||
}
|
||||
predictOptions := []llama.PredictOption{
|
||||
llama.SetTemperature(opts.Temperature),
|
||||
llama.SetTopP(opts.TopP),
|
||||
llama.SetTopK(int(opts.TopK)),
|
||||
llama.SetTokens(int(opts.Tokens)),
|
||||
llama.SetThreads(int(opts.Threads)),
|
||||
llama.WithGrammar(opts.Grammar),
|
||||
llama.SetRopeFreqBase(ropeFreqBase),
|
||||
llama.SetRopeFreqScale(ropeFreqScale),
|
||||
llama.SetNegativePromptScale(opts.NegativePromptScale),
|
||||
llama.SetNegativePrompt(opts.NegativePrompt),
|
||||
}
|
||||
|
||||
if opts.PromptCacheAll {
|
||||
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
|
||||
}
|
||||
|
||||
if opts.PromptCacheRO {
|
||||
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
|
||||
}
|
||||
|
||||
// Expected absolute path
|
||||
if opts.PromptCachePath != "" {
|
||||
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
|
||||
}
|
||||
|
||||
if opts.Mirostat != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
|
||||
}
|
||||
|
||||
if opts.MirostatETA != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
|
||||
}
|
||||
|
||||
if opts.MirostatTAU != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
|
||||
}
|
||||
|
||||
if opts.Debug {
|
||||
predictOptions = append(predictOptions, llama.Debug)
|
||||
}
|
||||
|
||||
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
|
||||
|
||||
if opts.PresencePenalty != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
|
||||
}
|
||||
|
||||
if opts.NKeep != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
|
||||
}
|
||||
|
||||
if opts.Batch != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
|
||||
}
|
||||
|
||||
if opts.F16KV {
|
||||
predictOptions = append(predictOptions, llama.EnableF16KV)
|
||||
}
|
||||
|
||||
if opts.IgnoreEOS {
|
||||
predictOptions = append(predictOptions, llama.IgnoreEOS)
|
||||
}
|
||||
|
||||
if opts.Seed != 0 {
|
||||
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
|
||||
}
|
||||
|
||||
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
|
||||
|
||||
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
|
||||
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
|
||||
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
|
||||
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
|
||||
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
|
||||
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
|
||||
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
|
||||
return predictOptions
|
||||
}
|
||||
|
||||
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
|
||||
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
|
||||
}
|
||||
|
||||
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
|
||||
predictOptions := buildPredictOptions(opts)
|
||||
|
||||
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
|
||||
results <- token
|
||||
return true
|
||||
}))
|
||||
|
||||
go func() {
|
||||
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
|
||||
if err != nil {
|
||||
fmt.Println("err: ", err)
|
||||
}
|
||||
close(results)
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
|
||||
predictOptions := buildPredictOptions(opts)
|
||||
|
||||
if len(opts.EmbeddingTokens) > 0 {
|
||||
tokens := []int{}
|
||||
for _, t := range opts.EmbeddingTokens {
|
||||
tokens = append(tokens, int(t))
|
||||
}
|
||||
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
|
||||
}
|
||||
|
||||
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
|
||||
}
|
||||
19
backend/go/llm/llama-ggml/main.go
Normal file
19
backend/go/llm/llama-ggml/main.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
|
||||
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var (
|
||||
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
)
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -15,7 +15,7 @@ var (
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
if err := grpc.StartServer(*addr, &VAD{}); err != nil {
|
||||
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
95
backend/go/llm/rwkv/rwkv.go
Normal file
95
backend/go/llm/rwkv/rwkv.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/donomii/go-rwkv.cpp"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
const tokenizerSuffix = ".tokenizer.json"
|
||||
|
||||
type LLM struct {
|
||||
base.SingleThread
|
||||
|
||||
rwkv *rwkv.RwkvState
|
||||
}
|
||||
|
||||
func (llm *LLM) Load(opts *pb.ModelOptions) error {
|
||||
tokenizerFile := opts.Tokenizer
|
||||
if tokenizerFile == "" {
|
||||
modelFile := filepath.Base(opts.ModelFile)
|
||||
tokenizerFile = modelFile + tokenizerSuffix
|
||||
}
|
||||
modelPath := filepath.Dir(opts.ModelFile)
|
||||
tokenizerPath := filepath.Join(modelPath, tokenizerFile)
|
||||
|
||||
model := rwkv.LoadFiles(opts.ModelFile, tokenizerPath, uint32(opts.GetThreads()))
|
||||
|
||||
if model == nil {
|
||||
return fmt.Errorf("rwkv could not load model")
|
||||
}
|
||||
llm.rwkv = model
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
|
||||
stopWord := "\n"
|
||||
if len(opts.StopPrompts) > 0 {
|
||||
stopWord = opts.StopPrompts[0]
|
||||
}
|
||||
|
||||
if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
response := llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), nil)
|
||||
|
||||
return response, nil
|
||||
}
|
||||
|
||||
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
|
||||
go func() {
|
||||
|
||||
stopWord := "\n"
|
||||
if len(opts.StopPrompts) > 0 {
|
||||
stopWord = opts.StopPrompts[0]
|
||||
}
|
||||
|
||||
if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
|
||||
fmt.Println("Error processing input: ", err)
|
||||
return
|
||||
}
|
||||
|
||||
llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), func(s string) bool {
|
||||
results <- s
|
||||
return true
|
||||
})
|
||||
close(results)
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
|
||||
tokens, err := llm.rwkv.Tokenizer.Encode(opts.Prompt)
|
||||
if err != nil {
|
||||
return pb.TokenizationResponse{}, err
|
||||
}
|
||||
|
||||
l := len(tokens)
|
||||
i32Tokens := make([]int32, l)
|
||||
|
||||
for i, t := range tokens {
|
||||
i32Tokens[i] = int32(t.ID)
|
||||
}
|
||||
|
||||
return pb.TokenizationResponse{
|
||||
Length: int32(l),
|
||||
Tokens: i32Tokens,
|
||||
}, nil
|
||||
}
|
||||
@@ -311,16 +311,12 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
|
||||
}
|
||||
|
||||
func isNormalized(k []float32) bool {
|
||||
var sum float64
|
||||
|
||||
var sum float32
|
||||
for _, v := range k {
|
||||
v64 := float64(v)
|
||||
sum += v64*v64
|
||||
sum += v
|
||||
}
|
||||
|
||||
s := math.Sqrt(sum)
|
||||
|
||||
return s >= 0.99 && s <= 1.01
|
||||
return sum == 1.0
|
||||
}
|
||||
|
||||
// TODO: This we could replace with handwritten SIMD code
|
||||
@@ -332,7 +328,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
|
||||
dot += k1[i] * k2[i]
|
||||
}
|
||||
|
||||
assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
|
||||
assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
|
||||
|
||||
// 2.0 * (1.0 - dot) would be the Euclidean distance
|
||||
return dot
|
||||
@@ -422,7 +418,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
|
||||
|
||||
sim := float32(dot / (mag1 * math.Sqrt(mag2)))
|
||||
|
||||
assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
|
||||
assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
|
||||
|
||||
return sim
|
||||
}
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
package main
|
||||
|
||||
// This is a wrapper to statisfy the GRPC service interface
|
||||
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/streamer45/silero-vad-go/speech"
|
||||
)
|
||||
|
||||
type VAD struct {
|
||||
base.SingleThread
|
||||
detector *speech.Detector
|
||||
}
|
||||
|
||||
func (vad *VAD) Load(opts *pb.ModelOptions) error {
|
||||
v, err := speech.NewDetector(speech.DetectorConfig{
|
||||
ModelPath: opts.ModelFile,
|
||||
SampleRate: 16000,
|
||||
//WindowSize: 1024,
|
||||
Threshold: 0.5,
|
||||
MinSilenceDurationMs: 0,
|
||||
SpeechPadMs: 0,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("create silero detector: %w", err)
|
||||
}
|
||||
|
||||
vad.detector = v
|
||||
return err
|
||||
}
|
||||
|
||||
func (vad *VAD) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
|
||||
audio := req.Audio
|
||||
|
||||
segments, err := vad.detector.Detect(audio)
|
||||
if err != nil {
|
||||
return pb.VADResponse{}, fmt.Errorf("detect: %w", err)
|
||||
}
|
||||
|
||||
vadSegments := []*pb.VADSegment{}
|
||||
for _, s := range segments {
|
||||
vadSegments = append(vadSegments, &pb.VADSegment{
|
||||
Start: float32(s.SpeechStartAt),
|
||||
End: float32(s.SpeechEndAt),
|
||||
})
|
||||
}
|
||||
|
||||
return pb.VADResponse{
|
||||
Segments: vadSegments,
|
||||
}, nil
|
||||
}
|
||||
@@ -1,2 +1,2 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
torch
|
||||
|
||||
@@ -1 +1 @@
|
||||
torch==2.4.1
|
||||
torch
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch==2.4.1+rocm6.0
|
||||
torch
|
||||
@@ -1,6 +1,5 @@
|
||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
@@ -1,6 +1,6 @@
|
||||
accelerate
|
||||
auto-gptq==0.7.1
|
||||
grpcio==1.71.0
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
transformers
|
||||
@@ -1,4 +1,4 @@
|
||||
transformers
|
||||
accelerate
|
||||
torch==2.4.1
|
||||
torchaudio==2.4.1
|
||||
torch
|
||||
torchaudio
|
||||
@@ -1,5 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
torchaudio==2.4.1+cu118
|
||||
torch
|
||||
torchaudio
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
torch==2.4.1
|
||||
torchaudio==2.4.1
|
||||
torch
|
||||
torchaudio
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,5 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch==2.4.1+rocm6.0
|
||||
torchaudio==2.4.1+rocm6.0
|
||||
torch
|
||||
torchaudio
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,9 +1,8 @@
|
||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
torchaudio==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
bark==0.1.5
|
||||
grpcio==1.71.0
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
@@ -17,9 +17,6 @@
|
||||
# LIMIT_TARGETS="cublas12"
|
||||
# source $(dirname $0)/../common/libbackend.sh
|
||||
#
|
||||
|
||||
PYTHON_VERSION="3.10"
|
||||
|
||||
function init() {
|
||||
# Name of the backend (directory name)
|
||||
BACKEND_NAME=${PWD##*/}
|
||||
@@ -91,7 +88,7 @@ function getBuildProfile() {
|
||||
# always result in an activated virtual environment
|
||||
function ensureVenv() {
|
||||
if [ ! -d "${EDIR}/venv" ]; then
|
||||
uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
|
||||
uv venv ${EDIR}/venv
|
||||
echo "virtualenv created"
|
||||
fi
|
||||
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
.DEFAULT_GOAL := install
|
||||
|
||||
.PHONY: install
|
||||
install:
|
||||
install: protogen
|
||||
bash install.sh
|
||||
$(MAKE) protogen
|
||||
|
||||
.PHONY: protogen
|
||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
||||
@@ -13,7 +12,7 @@ protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
backend_pb2_grpc.py backend_pb2.py:
|
||||
bash protogen.sh
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
@@ -1,5 +1,4 @@
|
||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
optimum[openvino]
|
||||
@@ -1,3 +1,2 @@
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
grpcio-tools
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
@@ -1,4 +1,3 @@
|
||||
transformers==4.48.3
|
||||
transformers
|
||||
accelerate
|
||||
torch==2.4.1
|
||||
coqui-tts
|
||||
torch
|
||||
@@ -1,6 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
torchaudio==2.4.1+cu118
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
torch
|
||||
torchaudio
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,5 +1,4 @@
|
||||
torch==2.4.1
|
||||
torchaudio==2.4.1
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
torch
|
||||
torchaudio
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,6 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch==2.4.1+rocm6.0
|
||||
torchaudio==2.4.1+rocm6.0
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
torch
|
||||
torchaudio
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,10 +1,8 @@
|
||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
torchaudio==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
torchaudio
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
transformers==4.48.3
|
||||
accelerate
|
||||
coqui-tts
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.71.0
|
||||
coqui-tts
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
packaging==24.1
|
||||
certifi
|
||||
@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
|
||||
This method sets up the gRPC service by starting the server
|
||||
"""
|
||||
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
|
||||
time.sleep(30)
|
||||
time.sleep(10)
|
||||
|
||||
def tearDown(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -17,7 +17,7 @@ import backend_pb2_grpc
|
||||
|
||||
import grpc
|
||||
|
||||
from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
|
||||
from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
|
||||
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
|
||||
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import safety_checker
|
||||
@@ -159,18 +159,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
torchType = torch.float16
|
||||
variant = "fp16"
|
||||
|
||||
options = request.Options
|
||||
|
||||
# empty dict
|
||||
self.options = {}
|
||||
|
||||
# The options are a list of strings in this form optname:optvalue
|
||||
# We are storing all the options in a dict so we can use it later when
|
||||
# generating the images
|
||||
for opt in options:
|
||||
key, value = opt.split(":")
|
||||
self.options[key] = value
|
||||
|
||||
local = False
|
||||
modelFile = request.Model
|
||||
|
||||
@@ -259,16 +247,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
use_safetensors=True,
|
||||
variant=variant)
|
||||
elif request.PipelineType == "FluxPipeline":
|
||||
if fromSingleFile:
|
||||
self.pipe = FluxPipeline.from_single_file(modelFile,
|
||||
torch_dtype=torchType,
|
||||
use_safetensors=True)
|
||||
else:
|
||||
self.pipe = FluxPipeline.from_pretrained(
|
||||
request.Model,
|
||||
torch_dtype=torch.bfloat16)
|
||||
if request.LowVRAM:
|
||||
self.pipe.enable_model_cpu_offload()
|
||||
if request.LowVRAM:
|
||||
self.pipe.enable_model_cpu_offload()
|
||||
elif request.PipelineType == "FluxTransformer2DModel":
|
||||
dtype = torch.bfloat16
|
||||
# specify from environment or default to "ChuckMcSneed/FLUX.1-dev"
|
||||
@@ -287,13 +270,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
if request.LowVRAM:
|
||||
self.pipe.enable_model_cpu_offload()
|
||||
elif request.PipelineType == "SanaPipeline":
|
||||
self.pipe = SanaPipeline.from_pretrained(
|
||||
request.Model,
|
||||
variant="bf16",
|
||||
torch_dtype=torch.bfloat16)
|
||||
self.pipe.vae.to(torch.bfloat16)
|
||||
self.pipe.text_encoder.to(torch.bfloat16)
|
||||
|
||||
if CLIPSKIP and request.CLIPSkip != 0:
|
||||
self.clip_skip = request.CLIPSkip
|
||||
@@ -320,34 +296,22 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
self.pipe.controlnet = self.controlnet
|
||||
else:
|
||||
self.controlnet = None
|
||||
|
||||
if request.LoraAdapter and not os.path.isabs(request.LoraAdapter):
|
||||
# Assume directory from request.ModelFile.
|
||||
# Only if request.LoraAdapter it's not an absolute path
|
||||
if request.LoraAdapter and request.ModelFile != "" and not os.path.isabs(request.LoraAdapter) and request.LoraAdapter:
|
||||
# get base path of modelFile
|
||||
modelFileBase = os.path.dirname(request.ModelFile)
|
||||
# modify LoraAdapter to be relative to modelFileBase
|
||||
request.LoraAdapter = os.path.join(request.ModelPath, request.LoraAdapter)
|
||||
|
||||
request.LoraAdapter = os.path.join(modelFileBase, request.LoraAdapter)
|
||||
device = "cpu" if not request.CUDA else "cuda"
|
||||
self.device = device
|
||||
if request.LoraAdapter:
|
||||
# Check if its a local file and not a directory ( we load lora differently for a safetensor file )
|
||||
if os.path.exists(request.LoraAdapter) and not os.path.isdir(request.LoraAdapter):
|
||||
# self.load_lora_weights(request.LoraAdapter, 1, device, torchType)
|
||||
self.pipe.load_lora_weights(request.LoraAdapter)
|
||||
else:
|
||||
self.pipe.unet.load_attn_procs(request.LoraAdapter)
|
||||
if len(request.LoraAdapters) > 0:
|
||||
i = 0
|
||||
adapters_name = []
|
||||
adapters_weights = []
|
||||
for adapter in request.LoraAdapters:
|
||||
if not os.path.isabs(adapter):
|
||||
adapter = os.path.join(request.ModelPath, adapter)
|
||||
self.pipe.load_lora_weights(adapter, adapter_name=f"adapter_{i}")
|
||||
adapters_name.append(f"adapter_{i}")
|
||||
i += 1
|
||||
|
||||
for adapters_weight in request.LoraScales:
|
||||
adapters_weights.append(adapters_weight)
|
||||
|
||||
self.pipe.set_adapters(adapters_name, adapter_weights=adapters_weights)
|
||||
|
||||
if request.CUDA:
|
||||
self.pipe.to('cuda')
|
||||
@@ -428,6 +392,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
# create a dictionary of values for the parameters
|
||||
options = {
|
||||
"negative_prompt": request.negative_prompt,
|
||||
"width": request.width,
|
||||
"height": request.height,
|
||||
"num_inference_steps": steps,
|
||||
}
|
||||
|
||||
@@ -445,16 +411,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
keys = options.keys()
|
||||
|
||||
if request.EnableParameters != "":
|
||||
keys = [key.strip() for key in request.EnableParameters.split(",")]
|
||||
keys = request.EnableParameters.split(",")
|
||||
|
||||
if request.EnableParameters == "none":
|
||||
keys = []
|
||||
|
||||
# create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
|
||||
kwargs = {key: options.get(key) for key in keys if key in options}
|
||||
|
||||
# populate kwargs from self.options.
|
||||
kwargs.update(self.options)
|
||||
kwargs = {key: options[key] for key in keys}
|
||||
|
||||
# Set seed
|
||||
if request.seed > 0:
|
||||
@@ -465,12 +428,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if self.PipelineType == "FluxPipeline":
|
||||
kwargs["max_sequence_length"] = 256
|
||||
|
||||
if request.width:
|
||||
kwargs["width"] = request.width
|
||||
|
||||
if request.height:
|
||||
kwargs["height"] = request.height
|
||||
|
||||
if self.PipelineType == "FluxTransformer2DModel":
|
||||
kwargs["output_type"] = "pil"
|
||||
kwargs["generator"] = torch.Generator("cpu").manual_seed(0)
|
||||
@@ -490,7 +447,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
export_to_video(video_frames, request.dst)
|
||||
return backend_pb2.Result(message="Media generated successfully", success=True)
|
||||
|
||||
print(f"Generating image with {kwargs=}", file=sys.stderr)
|
||||
image = {}
|
||||
if COMPEL:
|
||||
conditioning, pooled = self.compel.build_conditioning_tensor(prompt)
|
||||
|
||||
@@ -5,5 +5,5 @@ accelerate
|
||||
compel
|
||||
peft
|
||||
sentencepiece
|
||||
torch==2.4.1
|
||||
torch
|
||||
optimum-quanto
|
||||
@@ -1,5 +1,5 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
torch
|
||||
diffusers
|
||||
opencv-python
|
||||
transformers
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
torch==2.4.1
|
||||
torch
|
||||
diffusers
|
||||
opencv-python
|
||||
transformers
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
torchvision==0.18.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
intel-extension-for-pytorch
|
||||
torch
|
||||
torchvision
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
|
||||
diffusers
|
||||
opencv-python
|
||||
transformers
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
setuptools
|
||||
grpcio==1.71.0
|
||||
grpcio==1.66.2
|
||||
pillow
|
||||
protobuf
|
||||
certifi
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
transformers
|
||||
accelerate
|
||||
torch==2.4.1
|
||||
torch
|
||||
@@ -1,4 +1,4 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
torch
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,3 +1,3 @@
|
||||
torch==2.4.1
|
||||
torch
|
||||
transformers
|
||||
accelerate
|
||||
@@ -1,4 +1,4 @@
|
||||
grpcio==1.71.0
|
||||
grpcio==1.66.2
|
||||
protobuf
|
||||
certifi
|
||||
wheel
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
.DEFAULT_GOAL := install
|
||||
|
||||
.PHONY: install
|
||||
install:
|
||||
bash install.sh
|
||||
$(MAKE) protogen
|
||||
|
||||
.PHONY: protogen
|
||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
.PHONY: protogen-clean
|
||||
protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
backend_pb2_grpc.py backend_pb2.py:
|
||||
bash protogen.sh
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
rm -rf venv __pycache__
|
||||
@@ -1,6 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
@@ -1,8 +0,0 @@
|
||||
faster-whisper
|
||||
opencv-python
|
||||
accelerate
|
||||
compel
|
||||
peft
|
||||
sentencepiece
|
||||
torch==2.4.1
|
||||
optimum-quanto
|
||||
@@ -1,9 +0,0 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
||||
faster-whisper
|
||||
opencv-python
|
||||
accelerate
|
||||
compel
|
||||
peft
|
||||
sentencepiece
|
||||
optimum-quanto
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user