Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
3826edb9da chore(deps): bump llama.cpp to '10f2e81809bbb69ecfe64fc8b4686285f84b0c07'
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-03-12 09:12:59 +01:00
180 changed files with 2252 additions and 7306 deletions

5
.env
View File

@@ -29,9 +29,6 @@
## Enable/Disable single backend (useful if only one GPU is available) ## Enable/Disable single backend (useful if only one GPU is available)
# LOCALAI_SINGLE_ACTIVE_BACKEND=true # LOCALAI_SINGLE_ACTIVE_BACKEND=true
# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
## Specify a build type. Available: cublas, openblas, clblas. ## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit. ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM. ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -76,7 +73,7 @@
### Define a list of GRPC Servers for llama-cpp workers to distribute the load ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
# https://github.com/ggerganov/llama.cpp/pull/6829 # https://github.com/ggerganov/llama.cpp/pull/6829
# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md # https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
# LLAMACPP_GRPC_SERVERS="" # LLAMACPP_GRPC_SERVERS=""
### Enable to run parallel requests ### Enable to run parallel requests

View File

@@ -29,6 +29,10 @@ updates:
schedule: schedule:
# Check for updates to GitHub Actions every weekday # Check for updates to GitHub Actions every weekday
interval: "weekly" interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/autogptq"
schedule:
interval: "weekly"
- package-ecosystem: "pip" - package-ecosystem: "pip"
directory: "/backend/python/bark" directory: "/backend/python/bark"
schedule: schedule:

View File

@@ -12,7 +12,7 @@ jobs:
- repository: "ggml-org/llama.cpp" - repository: "ggml-org/llama.cpp"
variable: "CPPLLAMA_VERSION" variable: "CPPLLAMA_VERSION"
branch: "master" branch: "master"
- repository: "ggml-org/whisper.cpp" - repository: "ggerganov/whisper.cpp"
variable: "WHISPER_CPP_VERSION" variable: "WHISPER_CPP_VERSION"
branch: "master" branch: "master"
- repository: "PABannier/bark.cpp" - repository: "PABannier/bark.cpp"

View File

@@ -14,7 +14,7 @@ jobs:
steps: steps:
- name: Dependabot metadata - name: Dependabot metadata
id: metadata id: metadata
uses: dependabot/fetch-metadata@v2.4.0 uses: dependabot/fetch-metadata@v2.3.0
with: with:
github-token: "${{ secrets.GITHUB_TOKEN }}" github-token: "${{ secrets.GITHUB_TOKEN }}"
skip-commit-verification: true skip-commit-verification: true

View File

@@ -42,7 +42,7 @@ jobs:
script: | script: |
sudo rm -rf local-ai/ || true sudo rm -rf local-ai/ || true
- name: copy file via ssh - name: copy file via ssh
uses: appleboy/scp-action@v1.0.0 uses: appleboy/scp-action@v0.1.7
with: with:
host: ${{ secrets.EXPLORER_SSH_HOST }} host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }} username: ${{ secrets.EXPLORER_SSH_USERNAME }}

View File

@@ -15,7 +15,7 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
runs-on: 'ubuntu-latest' runs-on: 'ubuntu-latest'
platforms: 'linux/amd64' platforms: 'linux/amd64'
runs-on: ${{matrix.runs-on}} runs-on: ${{matrix.runs-on}}

View File

@@ -33,7 +33,6 @@ jobs:
# Pushing with all jobs in parallel # Pushing with all jobs in parallel
# eats the bandwidth of all the nodes # eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }} max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
fail-fast: false
matrix: matrix:
include: include:
# This is basically covered by the AIO test # This is basically covered by the AIO test
@@ -57,35 +56,26 @@ jobs:
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas' # - build-type: 'hipblas'
platforms: 'linux/amd64' # platforms: 'linux/amd64'
tag-latest: 'false' # tag-latest: 'false'
tag-suffix: '-hipblas' # tag-suffix: '-hipblas'
ffmpeg: 'false' # ffmpeg: 'false'
image-type: 'extras' # image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1" # base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04" # grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set' # runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" # makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16' # - build-type: 'sycl_f16'
platforms: 'linux/amd64' # platforms: 'linux/amd64'
tag-latest: 'false' # tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" # base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" # grpc-base-image: "ubuntu:22.04"
tag-suffix: 'sycl-f16-ffmpeg' # tag-suffix: 'sycl-f16-ffmpeg'
ffmpeg: 'true' # ffmpeg: 'true'
image-type: 'extras' # image-type: 'extras'
runs-on: 'arc-runner-set' # runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" # makeflags: "--jobs=3 --output-sync=target"
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
# core-image-build: # core-image-build:
# uses: ./.github/workflows/image_build.yml # uses: ./.github/workflows/image_build.yml
# with: # with:

View File

@@ -45,13 +45,13 @@ jobs:
- build-type: 'hipblas' - build-type: 'hipblas'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'auto' tag-latest: 'auto'
tag-suffix: '-hipblas-extras' tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'extras' image-type: 'extras'
aio: "-aio-gpu-hipblas" aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1" base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas-extras' latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas' latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
@@ -59,13 +59,32 @@ jobs:
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-hipblas' tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1" base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-hipblas' - build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
self-hosted-jobs: self-hosted-jobs:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
with: with:
@@ -95,58 +114,110 @@ jobs:
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }} max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
matrix: matrix:
include: include:
# Extra images
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "11" cuda-major-version: "11"
cuda-minor-version: "7" cuda-minor-version: "7"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-cublas-cuda11-extras' tag-suffix: '-cublas-cuda11'
ffmpeg: 'true' ffmpeg: ''
image-type: 'extras' image-type: 'extras'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11-extras'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "12" cuda-major-version: "12"
cuda-minor-version: "0" cuda-minor-version: "0"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-cublas-cuda12-extras' tag-suffix: '-cublas-cuda12'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'extras' image-type: 'extras'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-12" aio: "-aio-gpu-nvidia-cuda-12"
latest-image: 'latest-gpu-nvidia-cuda-12-extras' latest-image: 'latest-gpu-nvidia-cuda-12'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12' latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16' - build-type: 'sycl_f16'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-extras' tag-suffix: '-sycl-f16-ffmpeg'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'extras' image-type: 'extras'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f16" aio: "-aio-gpu-intel-f16"
latest-image: 'latest-gpu-intel-f16-extras' latest-image: 'latest-gpu-intel-f16'
latest-image-aio: 'latest-aio-gpu-intel-f16' latest-image-aio: 'latest-aio-gpu-intel-f16'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32' - build-type: 'sycl_f32'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-extras' tag-suffix: '-sycl-f32-ffmpeg'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'extras' image-type: 'extras'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f32" aio: "-aio-gpu-intel-f32"
latest-image: 'latest-gpu-intel-f32-extras' latest-image: 'latest-gpu-intel-f32'
latest-image-aio: 'latest-aio-gpu-intel-f32' latest-image-aio: 'latest-aio-gpu-intel-f32'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
# Core images # Core images
@@ -155,23 +226,41 @@ jobs:
tag-latest: 'false' tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16' tag-suffix: '-sycl-f16-core'
ffmpeg: 'true' ffmpeg: 'false'
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f16'
- build-type: 'sycl_f32' - build-type: 'sycl_f32'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32' tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f32'
core-image-build: core-image-build:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
@@ -204,7 +293,7 @@ jobs:
- build-type: '' - build-type: ''
platforms: 'linux/amd64,linux/arm64' platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto' tag-latest: 'auto'
tag-suffix: '' tag-suffix: '-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
@@ -219,38 +308,60 @@ jobs:
cuda-minor-version: "7" cuda-minor-version: "7"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-cublas-cuda11' tag-suffix: '-cublas-cuda11-core'
ffmpeg: 'true' ffmpeg: ''
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false' skip-drivers: 'false'
latest-image: 'latest-gpu-nvidia-cuda-12'
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "12" cuda-major-version: "12"
cuda-minor-version: "0" cuda-minor-version: "0"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-cublas-cuda12' tag-suffix: '-cublas-cuda12-core'
ffmpeg: ''
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
skip-drivers: 'false' skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-nvidia-cuda-12'
- build-type: 'vulkan' - build-type: 'vulkan'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-vulkan' tag-suffix: '-vulkan-ffmpeg-core'
latest-image: 'latest-vulkan-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
skip-drivers: 'false' skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-vulkan'
gh-runner: gh-runner:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
with: with:
@@ -283,8 +394,8 @@ jobs:
cuda-minor-version: "0" cuda-minor-version: "0"
platforms: 'linux/arm64' platforms: 'linux/arm64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-nvidia-l4t-arm64' tag-suffix: '-nvidia-l4t-arm64-core'
latest-image: 'latest-nvidia-l4t-arm64' latest-image: 'latest-nvidia-l4t-arm64-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"

View File

@@ -8,7 +8,7 @@ jobs:
notify-discord: notify-discord:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }} if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env: env:
MODEL_NAME: gemma-3-12b-it MODEL_NAME: hermes-2-theta-llama-3-8b
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -16,7 +16,7 @@ jobs:
fetch-depth: 0 # needed to checkout all branches for this Action to work fetch-depth: 0 # needed to checkout all branches for this Action to work
- uses: mudler/localai-github-action@v1 - uses: mudler/localai-github-action@v1
with: with:
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file" model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR # Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.0 - uses: GrantBirki/git-diff-action@v2.8.0
id: git-diff-action id: git-diff-action
@@ -79,7 +79,7 @@ jobs:
args: ${{ steps.summarize.outputs.message }} args: ${{ steps.summarize.outputs.message }}
- name: Setup tmate session if fails - name: Setup tmate session if fails
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -87,7 +87,7 @@ jobs:
notify-twitter: notify-twitter:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }} if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env: env:
MODEL_NAME: gemma-3-12b-it MODEL_NAME: hermes-2-theta-llama-3-8b
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@@ -161,7 +161,7 @@ jobs:
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
- name: Setup tmate session if fails - name: Setup tmate session if fails
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180

View File

@@ -14,7 +14,7 @@ jobs:
steps: steps:
- uses: mudler/localai-github-action@v1 - uses: mudler/localai-github-action@v1
with: with:
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file" model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
- name: Summarize - name: Summarize
id: summarize id: summarize
run: | run: |

View File

@@ -36,7 +36,6 @@ jobs:
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
make install-go-tools
- name: Install CUDA Dependencies - name: Install CUDA Dependencies
run: | run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
@@ -124,7 +123,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -152,7 +151,6 @@ jobs:
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
make install-go-tools
- name: Intel Dependencies - name: Intel Dependencies
run: | run: |
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
@@ -234,7 +232,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -255,7 +253,8 @@ jobs:
- name: Dependencies - name: Dependencies
run: | run: |
brew install protobuf grpc brew install protobuf grpc
make install-go-tools go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
- name: Build - name: Build
id: build id: build
run: | run: |
@@ -276,7 +275,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -296,7 +295,8 @@ jobs:
- name: Dependencies - name: Dependencies
run: | run: |
brew install protobuf grpc libomp llvm brew install protobuf grpc libomp llvm
make install-go-tools go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Build - name: Build
id: build id: build
run: | run: |
@@ -317,7 +317,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180

View File

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }} if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner - name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }} if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.22.4 uses: securego/gosec@v2.22.0
with: with:
# we let the report trigger content trigger a failure using the GitHub Security features. # we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...' args: '-no-fail -fmt sarif -out results.sarif ./...'

View File

@@ -78,26 +78,6 @@ jobs:
make --jobs=5 --output-sync=target -C backend/python/diffusers make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test make --jobs=5 --output-sync=target -C backend/python/diffusers test
#tests-vllm:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install -y build-essential ffmpeg
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# # Install UV
# curl -LsSf https://astral.sh/uv/install.sh | sh
# pip install --user --no-cache-dir grpcio-tools==1.64.1
# - name: Test vllm backend
# run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
# tests-transformers-musicgen: # tests-transformers-musicgen:
# runs-on: ubuntu-latest # runs-on: ubuntu-latest
# steps: # steps:

View File

@@ -71,7 +71,7 @@ jobs:
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
sudo apt-get install -y libgmock-dev clang sudo apt-get install -y libgmock-dev
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \ curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \ sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \ gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
@@ -96,7 +96,6 @@ jobs:
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
# The python3-grpc-tools package in 22.04 is too old # The python3-grpc-tools package in 22.04 is too old
pip install --user grpcio-tools pip install --user grpcio-tools
@@ -131,7 +130,7 @@ jobs:
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -184,7 +183,6 @@ jobs:
rm protoc.zip rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
PATH="$PATH:$HOME/go/bin" make protogen-go PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images - name: Build images
run: | run: |
@@ -196,7 +194,7 @@ jobs:
make run-e2e-aio make run-e2e-aio
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -224,7 +222,6 @@ jobs:
run: | run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
pip install --user --no-cache-dir grpcio-tools pip install --user --no-cache-dir grpcio-tools
go install github.com/GeertJohan/go.rice/rice@latest
- name: Test - name: Test
run: | run: |
export C_INCLUDE_PATH=/usr/local/include export C_INCLUDE_PATH=/usr/local/include
@@ -235,7 +232,7 @@ jobs:
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180

View File

@@ -15,7 +15,7 @@ ARG TARGETARCH
ARG TARGETVARIANT ARG TARGETVARIANT
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh" ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
@@ -24,7 +24,6 @@ RUN apt-get update && \
ca-certificates \ ca-certificates \
curl libssl-dev \ curl libssl-dev \
git \ git \
git-lfs \
unzip upx-ucl && \ unzip upx-ucl && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
@@ -46,10 +45,9 @@ EOT
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
# Install grpc compilers and rice # Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \ RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \ go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/ COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates RUN update-ca-certificates
@@ -301,9 +299,10 @@ COPY .git .
RUN make prepare RUN make prepare
## Build the binary ## Build the binary
## If we're on arm64 OR using hipblas, skip some of the llama-compat backends to save space ## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
## Otherwise just run the normal build ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \ ## (both will use CUDA or hipblas for the actual computation)
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \ SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \ else \
make build; \ make build; \
@@ -431,6 +430,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vllm \ make -C backend/python/vllm \
; fi && \ ; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/autogptq \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/bark \ make -C backend/python/bark \
; fi && \ ; fi && \

102
Makefile
View File

@@ -6,11 +6,11 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true DETECT_LIBS?=true
# llama.cpp versions # llama.cpp versions
CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c CPPLLAMA_VERSION?=10f2e81809bbb69ecfe64fc8b4686285f84b0c07
# whisper.cpp version # whisper.cpp version
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
WHISPER_CPP_VERSION?=e41bc5c61ae66af6be2bd7011769bb821a83e8ae WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
# go-piper version # go-piper version
PIPER_REPO?=https://github.com/mudler/go-piper PIPER_REPO?=https://github.com/mudler/go-piper
@@ -21,11 +21,8 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=v1.0.0 BARKCPP_VERSION?=v1.0.0
# stablediffusion.cpp (ggml) # stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae STABLEDIFFUSION_GGML_VERSION?=19d876ee300a055629926ff836489901f734f2b7
# ONEAPI variables for SYCL
export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
ONNX_VERSION?=1.20.0 ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64 ONNX_ARCH?=x64
@@ -33,12 +30,8 @@ ONNX_OS?=linux
export BUILD_TYPE?= export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE) export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF export CMAKE_ARGS?=
export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
export BACKEND_LIBS?= export BACKEND_LIBS?=
export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src
CGO_LDFLAGS?= CGO_LDFLAGS?=
CGO_LDFLAGS_WHISPER?= CGO_LDFLAGS_WHISPER?=
@@ -88,7 +81,6 @@ endif
# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS # IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
ifeq ($(NATIVE),false) ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF CMAKE_ARGS+=-DGGML_NATIVE=OFF
WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif endif
# Detect if we are running on arm64 # Detect if we are running on arm64
@@ -116,31 +108,13 @@ ifeq ($(OS),Darwin)
# disable metal if on Darwin and any other value is explicitly passed. # disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal) else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF CMAKE_ARGS+=-DGGML_METAL=OFF
WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
export GGML_NO_ACCELERATE=1 export GGML_NO_ACCELERATE=1
export GGML_NO_METAL=1 export GGML_NO_METAL=1
GO_LDFLAGS_WHISPER+=-lggml-blas
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
endif endif
ifeq ($(BUILD_TYPE),metal) ifeq ($(BUILD_TYPE),metal)
# -lcblas removed: it seems to always be listed as a duplicate flag.
CGO_LDFLAGS += -framework Accelerate CGO_LDFLAGS += -framework Accelerate
CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
CMAKE_ARGS+=-DGGML_OPENMP=OFF
WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
else
CGO_LDFLAGS_WHISPER+=-lggml-blas
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
endif endif
else else
CGO_LDFLAGS_WHISPER+=-lgomp CGO_LDFLAGS_WHISPER+=-lgomp
@@ -152,29 +126,21 @@ ifeq ($(BUILD_TYPE),openblas)
endif endif
ifeq ($(BUILD_TYPE),cublas) ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export GGML_CUDA=1 export GGML_CUDA=1
CMAKE_ARGS+=-DGGML_CUDA=ON CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
endif endif
ifeq ($(BUILD_TYPE),vulkan) ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=1 CMAKE_ARGS+=-DGGML_VULKAN=1
WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
endif endif
ifneq (,$(findstring sycl,$(BUILD_TYPE))) ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export GGML_SYCL=1 export GGML_SYCL=1
CMAKE_ARGS+=-DGGML_SYCL=ON
endif endif
ifeq ($(BUILD_TYPE),sycl_f16) ifeq ($(BUILD_TYPE),sycl_f16)
export GGML_SYCL_F16=1 export GGML_SYCL_F16=1
CMAKE_ARGS+=-DGGML_SYCL_F16=ON
endif endif
ifeq ($(BUILD_TYPE),hipblas) ifeq ($(BUILD_TYPE),hipblas)
@@ -185,7 +151,7 @@ ifeq ($(BUILD_TYPE),hipblas)
export CC=$(ROCM_HOME)/llvm/bin/clang export CC=$(ROCM_HOME)/llvm/bin/clang
export STABLE_BUILD_TYPE= export STABLE_BUILD_TYPE=
export GGML_HIP=1 export GGML_HIP=1
GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102 GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
AMDGPU_TARGETS ?= "$(GPU_TARGETS)" AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)" CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
@@ -294,7 +260,11 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a $(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion-ggml
endif
sources/onnxruntime: sources/onnxruntime:
mkdir -p sources/onnxruntime mkdir -p sources/onnxruntime
@@ -320,9 +290,8 @@ sources/whisper.cpp:
git checkout $(WHISPER_CPP_VERSION) && \ git checkout $(WHISPER_CPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch git submodule update --init --recursive --depth 1 --single-branch
sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
cd sources/whisper.cpp/build && cmake --build . --config Release
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
@@ -372,14 +341,8 @@ clean-tests:
clean-dc: clean clean-dc: clean
cp -r /build/backend-assets /workspace/backend-assets cp -r /build/backend-assets /workspace/backend-assets
## Install Go tools
install-go-tools:
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install github.com/GeertJohan/go.rice/rice@latest
## Build: ## Build:
build: prepare backend-assets grpcs install-go-tools ## Build the project build: prepare backend-assets grpcs ## Build the project
$(info ${GREEN}I local-ai build info:${RESET}) $(info ${GREEN}I local-ai build info:${RESET})
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET}) $(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET}) $(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@@ -389,9 +352,7 @@ ifneq ($(BACKEND_LIBS),)
$(MAKE) backend-assets/lib $(MAKE) backend-assets/lib
cp -f $(BACKEND_LIBS) backend-assets/lib/ cp -f $(BACKEND_LIBS) backend-assets/lib/
endif endif
rm -rf $(BINARY_NAME) || true
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./ CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
rice append --exec $(BINARY_NAME)
build-minimal: build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
@@ -463,7 +424,6 @@ prepare-test: grpcs
cp -rf backend-assets core/http cp -rf backend-assets core/http
cp tests/models_fixtures/* test-models cp tests/models_fixtures/* test-models
## Test targets
test: prepare test-models/testmodel.ggml grpcs test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests' @echo 'Running tests'
export GO_TAGS="tts debug" export GO_TAGS="tts debug"
@@ -538,7 +498,7 @@ protogen: protogen-go protogen-python
protogen-clean: protogen-go-clean protogen-python-clean protogen-clean: protogen-go-clean protogen-python-clean
.PHONY: protogen-go .PHONY: protogen-go
protogen-go: install-go-tools protogen-go:
mkdir -p pkg/grpc/proto mkdir -p pkg/grpc/proto
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \ protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto backend/backend.proto
@@ -549,10 +509,18 @@ protogen-go-clean:
$(RM) bin/* $(RM) bin/*
.PHONY: protogen-python .PHONY: protogen-python
protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
.PHONY: protogen-python-clean .PHONY: protogen-python-clean
protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
.PHONY: autogptq-protogen
autogptq-protogen:
$(MAKE) -C backend/python/autogptq protogen
.PHONY: autogptq-protogen-clean
autogptq-protogen-clean:
$(MAKE) -C backend/python/autogptq protogen-clean
.PHONY: bark-protogen .PHONY: bark-protogen
bark-protogen: bark-protogen:
@@ -629,6 +597,7 @@ vllm-protogen-clean:
## GRPC ## GRPC
# Note: it is duplicated in the Dockerfile # Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments: protogen-python prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/autogptq
$(MAKE) -C backend/python/bark $(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/coqui $(MAKE) -C backend/python/coqui
$(MAKE) -C backend/python/diffusers $(MAKE) -C backend/python/diffusers
@@ -642,12 +611,10 @@ prepare-extra-conda-environments: protogen-python
prepare-test-extra: protogen-python prepare-test-extra: protogen-python
$(MAKE) -C backend/python/transformers $(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/diffusers $(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/vllm
test-extra: prepare-test-extra test-extra: prepare-test-extra
$(MAKE) -C backend/python/transformers test $(MAKE) -C backend/python/transformers test
$(MAKE) -C backend/python/diffusers test $(MAKE) -C backend/python/diffusers test
$(MAKE) -C backend/python/vllm test
backend-assets: backend-assets:
mkdir -p backend-assets mkdir -p backend-assets
@@ -789,8 +756,8 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/silero-vad $(UPX) backend-assets/grpc/silero-vad
endif endif
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \ CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
ifneq ($(UPX),) ifneq ($(UPX),)
$(UPX) backend-assets/grpc/whisper $(UPX) backend-assets/grpc/whisper
@@ -842,8 +809,7 @@ docker-aio-all:
docker-image-intel: docker-image-intel:
docker build \ docker build \
--progress plain \ --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \ --build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -851,7 +817,7 @@ docker-image-intel:
docker-image-intel-xpu: docker-image-intel-xpu:
docker build \ docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \ --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \ --build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \

104
README.md
View File

@@ -1,6 +1,7 @@
<h1 align="center"> <h1 align="center">
<br> <br>
<img height="300" src="./core/http/static/logo.png"> <br> <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
LocalAI
<br> <br>
</h1> </h1>
@@ -30,7 +31,7 @@
<p align="center"> <p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank"> <a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/> <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
</a> </a>
<a href="https://discord.gg/uJAeKSAGDy" target="blank"> <a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/> <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
@@ -43,89 +44,32 @@
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/) > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
> >
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on > [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai) [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler). **LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API thats compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
## 📚🆕 Local Stack Family
🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
<table>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalAGI">
<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
</td>
</tr>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalRecall">
<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
</td>
</tr>
</table>
## Screenshots
| Talk Interface | Generate Audio |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
| Models Overview | Generate Images |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
| Chat Interface | Home |
| --- | --- |
| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
| Login | Swarm |
| --- | --- |
|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
## 💻 Quickstart
Run the installer script: Run the installer script:
```bash ```bash
# Basic installation
curl https://localai.io/install.sh | sh curl https://localai.io/install.sh | sh
``` ```
For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
Or run with docker: Or run with docker:
### CPU only image:
```bash ```bash
# CPU only image:
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
```
### Nvidia GPU: # Nvidia GPU:
```bash
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
```
### CPU and GPU image (bigger size): # CPU and GPU image (bigger size):
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
```
### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/) # AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
``` ```
@@ -144,13 +88,10 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
local-ai run oci://localai/phi-2:latest local-ai run oci://localai/phi-2:latest
``` ```
For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html) [💻 Getting started](https://localai.io/basics/getting_started/index.html)
## 📰 Latest project news ## 📰 Latest project news
- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 ) - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 ) - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -164,6 +105,19 @@ For more information, see [💻 Getting started](https://localai.io/basics/getti
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
## 🔥🔥 Hot topics (looking for help):
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
## 🚀 [Features](https://localai.io/features/) ## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table)) - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -177,10 +131,12 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
- 🥽 [Vision API](https://localai.io/features/gpt-vision/) - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/) - 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/) - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
- 🔊 Voice activity detection (Silero-VAD support) - 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI! - 🌍 Integrated WebUI!
## 💻 Usage
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
### 🔗 Community and integrations ### 🔗 Community and integrations

View File

@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002 name: text-embedding-ada-002
embeddings: true
parameters: parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
usage: | usage: |
You can test this model with curl like this: You can test this model with curl like this:

View File

@@ -1,57 +1,101 @@
context_size: 8192
f16: true
function:
grammar:
no_mixed_free_string: true
schema_type: llama3.1 # or JSON is supported too (json)
response_regex:
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
mmap: true
name: gpt-4 name: gpt-4
mmap: true
parameters: parameters:
model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- <|eot_id|> - "</tool_call>"
- <|end_of_text|> - "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Prefix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
<|begin_of_text|><|start_header_id|>system<|end_header_id|> {{.Input -}}
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|> <|im_start|>assistant
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: | chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|> <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
{{ else if eq .RoleName "tool" -}} <tool_call>
The Function was executed and the response was: {{- else if eq .RoleName "tool" }}
{{ end -}} <tool_response>
{{ if .Content -}} {{- end }}
{{.Content -}} {{- if .Content}}
{{ else if .FunctionCall -}} {{.Content }}
{{ range .FunctionCall }} {{- end }}
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})] {{- if .FunctionCall}}
{{ end }} {{toJson .FunctionCall}}
{{ end -}} {{- end }}
<|eot_id|> {{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|start_header_id|>system<|end_header_id|> <|im_start|>system
You are an expert in composing functions. You are given a question and a set of possible functions. You are a function calling AI model.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose. Here are the available tools:
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections. <tools>
If you decide to invoke any of the function(s), you MUST put it in the format as follows: {{range .Functions}}
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)] {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
You SHOULD NOT include any other text in the response. {{end}}
Here is a list of functions in JSON format that you can invoke. </tools>
{{toJson .Functions}} You should call the tools provided to you sequentially
<|eot_id|><|start_header_id|>user<|end_header_id|> Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
{{.Input}} <scratchpad>
<|eot_id|><|start_header_id|>assistant<|end_header_id|> {step-by-step reasoning and plan in bullet points}
</scratchpad>
download_files: For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf <tool_call>
sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5 {"arguments": <args-dict>, "name": <function-name>}
uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf </tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant

View File

@@ -1,49 +1,31 @@
backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true f16: true
mmap: true mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: bakllava.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: bakllava.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: bakllava-mmproj.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002 name: text-embedding-ada-002
backend: sentencetransformers
parameters: parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf model: all-MiniLM-L6-v2
usage: | usage: |
You can test this model with curl like this: You can test this model with curl like this:

View File

@@ -1,53 +1,101 @@
context_size: 4096
f16: true
function:
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
mmap: true
name: gpt-4 name: gpt-4
mmap: true
parameters: parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- </s> - "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
chat_message: | chat_message: |
<|im_start|>{{ .RoleName }} <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
Function call: <tool_call>
{{ else if eq .RoleName "tool" -}} {{- else if eq .RoleName "tool" }}
Function response: <tool_response>
{{ end -}} {{- end }}
{{ if .Content -}} {{- if .Content}}
{{.Content }} {{.Content }}
{{ end -}} {{- end }}
{{ if .FunctionCall -}} {{- if .FunctionCall}}
{{toJson .FunctionCall}} {{toJson .FunctionCall}}
{{ end -}}<|im_end|> {{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|im_start|>system <|im_start|>system
You are an AI assistant that executes function calls, and these are the tools at your disposal: You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}} {{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}} {{end}}
<|im_end|> </tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
download_files:
- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf

View File

@@ -1,49 +1,35 @@
backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true f16: true
mmap: true mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: llava-v1.6-mistral-7b.Q5_K_M.gguf
stopwords: temperature: 0.2
- <|im_end|> top_k: 40
- <dummy32000> top_p: 0.95
- </s> seed: -1
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002 name: text-embedding-ada-002
backend: sentencetransformers
parameters: parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf model: all-MiniLM-L6-v2
usage: | usage: |
You can test this model with curl like this: You can test this model with curl like this:

View File

@@ -1,53 +1,103 @@
context_size: 4096
f16: true
function:
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
mmap: true
name: gpt-4 name: gpt-4
mmap: false
context_size: 8192
f16: false
parameters: parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- </s> - "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
chat_message: | chat_message: |
<|im_start|>{{ .RoleName }} <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
Function call: <tool_call>
{{ else if eq .RoleName "tool" -}} {{- else if eq .RoleName "tool" }}
Function response: <tool_response>
{{ end -}} {{- end }}
{{ if .Content -}} {{- if .Content}}
{{.Content }} {{.Content }}
{{ end -}} {{- end }}
{{ if .FunctionCall -}} {{- if .FunctionCall}}
{{toJson .FunctionCall}} {{toJson .FunctionCall}}
{{ end -}}<|im_end|> {{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|im_start|>system <|im_start|>system
You are an AI assistant that executes function calls, and these are the tools at your disposal: You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}} {{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}} {{end}}
<|im_end|> </tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf

View File

@@ -1,50 +1,35 @@
backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true mmap: false
mmap: true f16: false
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: llava-v1.6-mistral-7b.Q5_K_M.gguf
stopwords: temperature: 0.2
- <|im_end|> top_k: 40
- <dummy32000> top_p: 0.95
- </s> seed: -1
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -1,15 +1,6 @@
package main package main
import ( import "embed"
rice "github.com/GeertJohan/go.rice"
)
var backendAssets *rice.Box //go:embed backend-assets/*
var backendAssets embed.FS
func init() {
var err error
backendAssets, err = rice.FindBox("backend-assets")
if err != nil {
panic(err)
}
}

View File

@@ -14,7 +14,6 @@ service Backend {
rpc PredictStream(PredictOptions) returns (stream Reply) {} rpc PredictStream(PredictOptions) returns (stream Reply) {}
rpc Embedding(PredictOptions) returns (EmbeddingResult) {} rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
rpc GenerateImage(GenerateImageRequest) returns (Result) {} rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {} rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc TTS(TTSRequest) returns (Result) {} rpc TTS(TTSRequest) returns (Result) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {} rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -191,7 +190,11 @@ message ModelOptions {
int32 NGQA = 20; int32 NGQA = 20;
string ModelFile = 21; string ModelFile = 21;
// AutoGPTQ
string Device = 22;
bool UseTriton = 23;
string ModelBaseName = 24;
bool UseFastTokenizer = 25;
// Diffusers // Diffusers
string PipelineType = 26; string PipelineType = 26;
@@ -302,19 +305,6 @@ message GenerateImageRequest {
int32 CLIPSkip = 11; int32 CLIPSkip = 11;
} }
message GenerateVideoRequest {
string prompt = 1;
string start_image = 2; // Path or base64 encoded image for the start frame
string end_image = 3; // Path or base64 encoded image for the end frame
int32 width = 4;
int32 height = 5;
int32 num_frames = 6; // Number of frames to generate
int32 fps = 7; // Frames per second
int32 seed = 8;
float cfg_scale = 9; // Classifier-free guidance scale
string dst = 10; // Output path for the generated video
}
message TTSRequest { message TTSRequest {
string text = 1; string text = 1;
string model = 2; string model = 2;

View File

@@ -1,17 +1,17 @@
## XXX: In some versions of CMake clip wasn't being built before llama. ## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future. ## This is an hack for now, but it should be fixed in the future.
# set(TARGET myclip) set(TARGET myclip)
# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h) add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
# install(TARGETS ${TARGET} LIBRARY) install(TARGETS ${TARGET} LIBRARY)
# target_include_directories(myclip PUBLIC .) target_include_directories(myclip PUBLIC .)
# target_include_directories(myclip PUBLIC ../..) target_include_directories(myclip PUBLIC ../..)
# target_include_directories(myclip PUBLIC ../../common) target_include_directories(myclip PUBLIC ../../common)
# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
# if (NOT MSVC) if (NOT MSVC)
# target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
# endif() endif()
# END CLIP hack # END CLIP hack
@@ -75,11 +75,7 @@ add_library(hw_grpc_proto
${hw_proto_hdrs} ) ${hw_proto_hdrs} )
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp) add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
absl::flags_parse absl::flags_parse
gRPC::${_REFLECTION} gRPC::${_REFLECTION}
gRPC::${_GRPC_GRPCPP} gRPC::${_GRPC_GRPCPP}

View File

@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server TARGET?=--target grpc-server
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas) ifeq ($(BUILD_TYPE),cublas)
@@ -36,18 +36,11 @@ else ifeq ($(OS),Darwin)
endif endif
ifeq ($(BUILD_TYPE),sycl_f16) ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \ CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif endif
ifeq ($(BUILD_TYPE),sycl_f32) ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \ CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif endif
llama.cpp: llama.cpp:
@@ -59,8 +52,8 @@ llama.cpp:
git checkout -b build $(LLAMA_VERSION) && \ git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/tools/grpc-server: llama.cpp llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server mkdir -p llama.cpp/examples/grpc-server
bash prepare.sh bash prepare.sh
rebuild: rebuild:
@@ -70,13 +63,13 @@ rebuild:
purge: purge:
rm -rf llama.cpp/build rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server rm -rf llama.cpp/examples/grpc-server
rm -rf grpc-server rm -rf grpc-server
clean: purge clean: purge
rm -rf llama.cpp rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/tools/grpc-server grpc-server: llama.cpp llama.cpp/examples/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)" @echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE))) ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \ +bash -c "source $(ONEAPI_VARS); \

View File

@@ -11,7 +11,8 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <getopt.h> #include <getopt.h>
#include "mtmd.h" #include "clip.h"
#include "llava.h"
#include "log.h" #include "log.h"
#include "stb_image.h" #include "stb_image.h"
#include "common.h" #include "common.h"
@@ -51,7 +52,7 @@ struct server_params
{ {
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::vector<std::string> api_keys; std::vector<std::string> api_keys;
std::string public_path = "tools/server/public"; std::string public_path = "examples/server/public";
std::string chat_template = ""; std::string chat_template = "";
int32_t port = 8080; int32_t port = 8080;
int32_t read_timeout = 600; int32_t read_timeout = 600;
@@ -209,8 +210,6 @@ struct llama_client_slot
int32_t num_prompt_tokens_processed = 0; int32_t num_prompt_tokens_processed = 0;
json prompt; json prompt;
json data;
std::string generated_text; std::string generated_text;
llama_token sampled; llama_token sampled;
std::vector<llama_token> cache_tokens; std::vector<llama_token> cache_tokens;
@@ -240,7 +239,7 @@ struct llama_client_slot
int32_t n_past_se = 0; // self-extend int32_t n_past_se = 0; // self-extend
// multimodal // multimodal
mtmd_context * mctx = nullptr; std::vector<slot_image> images;
// stats // stats
size_t sent_count = 0; size_t sent_count = 0;
@@ -271,6 +270,17 @@ struct llama_client_slot
n_past_se = 0; n_past_se = 0;
generated_token_probs.clear(); generated_token_probs.clear();
for (slot_image & img : images)
{
free(img.image_embedding);
if (img.img_data) {
clip_image_u8_free(img.img_data);
}
img.prefix_prompt = "";
}
images.clear();
} }
bool has_budget(common_params &global_params) { bool has_budget(common_params &global_params) {
@@ -446,9 +456,6 @@ struct llama_server_context
llama_context *ctx = nullptr; llama_context *ctx = nullptr;
const llama_vocab * vocab = nullptr; const llama_vocab * vocab = nullptr;
// multimodal
mtmd_context * mctx = nullptr;
clip_ctx *clp_ctx = nullptr; clip_ctx *clp_ctx = nullptr;
common_params params; common_params params;
@@ -460,7 +467,6 @@ struct llama_server_context
bool all_slots_are_idle = false; bool all_slots_are_idle = false;
bool add_bos_token = true; bool add_bos_token = true;
bool has_eos_token = true; bool has_eos_token = true;
bool has_gpu = false;
bool grammar_lazy = false; bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_triggers; std::vector<common_grammar_trigger> grammar_triggers;
@@ -487,10 +493,6 @@ struct llama_server_context
~llama_server_context() ~llama_server_context()
{ {
if (mctx) {
mtmd_free(mctx);
mctx = nullptr;
}
if (ctx) if (ctx)
{ {
llama_free(ctx); llama_free(ctx);
@@ -506,17 +508,12 @@ struct llama_server_context
bool load_model(const common_params &params_) bool load_model(const common_params &params_)
{ {
params = params_; params = params_;
if (!params.mmproj.path.empty()) { if (!params.mmproj.empty()) {
multimodal = true; multimodal = true;
LOG_INFO("Multi Modal Mode Enabled", {}); LOG_INFO("Multi Modal Mode Enabled", {});
mtmd_context_params mparams = mtmd_context_params_default(); clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
mparams.use_gpu = has_gpu; if(clp_ctx == nullptr) {
mparams.print_timings = false; LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
mparams.n_threads = params.cpuparams.n_threads;
mparams.verbosity = GGML_LOG_LEVEL_INFO;
mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
if (mctx == nullptr) {
LOG_ERR("failed to load multimodal model, '%s'\n", params.mmproj.path.c_str());
return false; return false;
} }
@@ -530,7 +527,7 @@ struct llama_server_context
ctx = common_init.context.release(); ctx = common_init.context.release();
if (model == nullptr) if (model == nullptr)
{ {
LOG_ERR("unable to load model: %s", params.model.path.c_str()); LOG_ERR("unable to load model: %s", params.model.c_str());
return false; return false;
} }
@@ -578,8 +575,6 @@ struct llama_server_context
slot.id = i; slot.id = i;
slot.n_ctx = n_ctx_slot; slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict; slot.n_predict = params.n_predict;
slot.mctx = mctx;
//slot.cache_tokens.has_mtmd = mctx != nullptr;
LOG_INFO("new slot", { LOG_INFO("new slot", {
{"slot_id", slot.id}, {"slot_id", slot.id},
@@ -617,61 +612,54 @@ struct llama_server_context
batch = llama_batch_init(n_ctx, 0, params.n_parallel); batch = llama_batch_init(n_ctx, 0, params.n_parallel);
} }
std::vector<server_tokens> tokenize(json &data, const json & json_prompt, bool add_bos) const std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
{ {
mtmd::bitmaps bitmaps; // TODO: currently, we tokenize using special tokens by default
std::vector<server_tokens> inputs; // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
// but it's better compared to completely ignoring ChatML and other chat templates
const bool TMP_FORCE_SPECIAL = true;
if (mctx != nullptr) // If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
if (json_prompt.is_array())
{ {
const auto &images_data = data.find("image_data"); bool first = true;
if (images_data != data.end() && images_data->is_array()) for (const auto& p : json_prompt)
{ {
for (const auto &img : *images_data) if (p.is_string())
{ {
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>()); auto s = p.template get<std::string>();
std::vector<llama_token> p;
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_buffer.data(), image_buffer.size())); if (first)
if (!bmp.ptr) { {
throw std::runtime_error("Failed to load image"); p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
first = false;
} }
// calculate bitmap hash (for KV caching) else
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3); {
bmp.set_id(hash.c_str()); p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
bitmaps.entries.push_back(std::move(bmp)); }
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}
else
{
if (first)
{
first = false;
}
prompt_tokens.push_back(p.template get<llama_token>());
} }
} }
}
// multimodal else
std::string prompt_str = json_prompt.template get<std::string>(); {
mtmd_input_text inp_txt = { auto s = json_prompt.template get<std::string>();
prompt_str.c_str(), prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
/* add_special */ true,
/* parse_special */ true,
};
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = bitmaps.c_ptr();
int32_t tokenized = mtmd_tokenize(mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (tokenized != 0) {
throw std::runtime_error("Failed to tokenize prompt");
}
server_tokens tmp(chunks, true);
inputs.push_back(std::move(tmp));
} else {
// non-multimodal version
auto tokenized_prompts = tokenize_input_prompts(vocab, json_prompt, true, true);
for (auto & p : tokenized_prompts) {
auto tmp = server_tokens(p, mctx != nullptr);
inputs.push_back(std::move(tmp));
}
} }
return inputs; return prompt_tokens;
} }
llama_client_slot* get_slot(int id) { llama_client_slot* get_slot(int id) {
@@ -724,8 +712,6 @@ struct llama_server_context
slot->sparams.grammar_triggers = grammar_triggers; slot->sparams.grammar_triggers = grammar_triggers;
slot->sparams.grammar_lazy = grammar_lazy; slot->sparams.grammar_lazy = grammar_lazy;
slot->data = data;
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) { if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
// Might be better to reject the request with a 400 ? // Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", { LOG_WARNING("Max tokens to predict exceeds server configuration", {
@@ -767,7 +753,43 @@ struct llama_server_context
if (json_value(data, "ignore_eos", false) && has_eos_token) { if (json_value(data, "ignore_eos", false) && has_eos_token) {
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY}); slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
} }
/*
slot->sparams.penalty_prompt_tokens.clear();
slot->sparams.use_penalty_prompt_tokens = false;
const auto &penalty_prompt = data.find("penalty_prompt");
if (penalty_prompt != data.end())
{
if (penalty_prompt->is_string())
{
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
if (slot->params.n_predict > 0)
{
slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
}
slot->sparams.use_penalty_prompt_tokens = true;
}
else if (penalty_prompt->is_array())
{
const auto n_tokens = penalty_prompt->size();
slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
const int n_vocab = llama_n_vocab(model);
for (const auto &penalty_token : *penalty_prompt)
{
if (penalty_token.is_number_integer())
{
const auto tok = penalty_token.get<llama_token>();
if (tok >= 0 && tok < n_vocab)
{
slot->sparams.penalty_prompt_tokens.push_back(tok);
}
}
}
slot->sparams.use_penalty_prompt_tokens = true;
}
}
*/
slot->sparams.logit_bias.clear(); slot->sparams.logit_bias.clear();
const auto &logit_bias = data.find("logit_bias"); const auto &logit_bias = data.find("logit_bias");
@@ -843,6 +865,79 @@ struct llama_server_context
} }
if (multimodal)
{
const auto &images_data = data.find("image_data");
if (images_data != data.end() && images_data->is_array())
{
for (const auto &img : *images_data)
{
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
slot_image img_sl;
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
__func__,
slot->id,
img_sl.id
);
return false;
}
LOG_VERBOSE("image loaded", {
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
// process prompt
// example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
if (slot->images.size() > 0 && !slot->prompt.is_array())
{
std::string prompt = slot->prompt.get<std::string>();
size_t pos = 0, begin_prefix = 0;
std::string pattern = "[img-";
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
size_t end_prefix = pos;
pos += pattern.length();
size_t end_pos = prompt.find(']', pos);
if (end_pos != std::string::npos)
{
std::string image_id = prompt.substr(pos, end_pos - pos);
try
{
int img_id = std::stoi(image_id);
bool found = false;
for (slot_image &img : slot->images)
{
if (img.id == img_id) {
found = true;
img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
begin_prefix = end_pos + 1;
break;
}
}
if (!found) {
LOG("ERROR: Image with id: %i, not found.\n", img_id);
slot->images.clear();
return false;
}
} catch (const std::invalid_argument& e) {
LOG("Invalid image number id in prompt\n");
slot->images.clear();
return false;
}
}
}
slot->prompt = "";
slot->params.input_suffix = prompt.substr(begin_prefix);
slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
}
}
}
if (slot->ctx_sampling != nullptr) if (slot->ctx_sampling != nullptr)
{ {
@@ -1090,6 +1185,26 @@ struct llama_server_context
return slot.has_next_token; // continue return slot.has_next_token; // continue
} }
bool process_images(llama_client_slot &slot) const
{
for (slot_image &img : slot.images)
{
if (!img.request_encode_image)
{
continue;
}
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG("Error processing the given image");
return false;
}
img.request_encode_image = false;
}
return slot.images.size() > 0;
}
void send_error(task_server& task, const std::string &error) void send_error(task_server& task, const std::string &error)
{ {
LOG("task %i - error: %s\n", task.id, error.c_str()); LOG("task %i - error: %s\n", task.id, error.c_str());
@@ -1332,6 +1447,74 @@ struct llama_server_context
} }
} }
// for multiple images processing
bool ingest_images(llama_client_slot &slot, int n_batch)
{
int image_idx = 0;
while (image_idx < (int) slot.images.size())
{
slot_image &img = slot.images[image_idx];
// process prefix prompt
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
};
if (llama_decode(ctx, batch_view))
{
LOG("%s : failed to eval\n", __func__);
return false;
}
}
// process image with llm
for (int i = 0; i < img.image_tokens; i += n_batch)
{
int n_eval = img.image_tokens - i;
if (n_eval > n_batch)
{
n_eval = n_batch;
}
const int n_embd = llama_model_n_embd(model);
float * embd = img.image_embedding + i * n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
if (llama_decode(ctx, llava_batch.batch))
{
LOG("%s : failed to eval image\n", __func__);
return false;
}
slot.n_past += n_eval;
}
image_idx++;
common_batch_clear(batch);
// append prefix of next image
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
slot.params.input_suffix : // no more images, then process suffix prompt
(json)(slot.images[image_idx].prefix_prompt);
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
for (int i = 0; i < (int) append_tokens.size(); ++i)
{
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
slot.n_past += 1;
}
}
return true;
}
void request_cancel(int task_id) void request_cancel(int task_id)
{ {
task_server task; task_server task;
@@ -1546,7 +1729,7 @@ struct llama_server_context
{ {
for (auto & slot : slots) for (auto & slot : slots)
{ {
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()); const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
// empty prompt passed -> release the slot and send empty response // empty prompt passed -> release the slot and send empty response
// note: infill mode allows empty prompt // note: infill mode allows empty prompt
@@ -1563,7 +1746,7 @@ struct llama_server_context
{ {
slot.state = PROCESSING; slot.state = PROCESSING;
slot.command = NONE; slot.command = NONE;
std::vector<server_tokens> prompt_tokens; std::vector<llama_token> prompt_tokens;
slot.t_start_process_prompt = ggml_time_us(); slot.t_start_process_prompt = ggml_time_us();
slot.t_start_genereration = 0; slot.t_start_genereration = 0;
@@ -1575,41 +1758,24 @@ struct llama_server_context
params.input_suffix.erase(0, 1); params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false; suff_rm_leading_spc = false;
} }
auto prefix_tokens = tokenize(slot.data, slot.params.input_prefix, false); auto prefix_tokens = tokenize(slot.params.input_prefix, false);
auto suffix_tokens = tokenize(slot.data, slot.params.input_suffix, false); auto suffix_tokens = tokenize(slot.params.input_suffix, false);
const int space_token = 29871; // TODO: this should not be hardcoded const int space_token = 29871; // TODO: this should not be hardcoded
if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0][0] == space_token) { if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
suffix_tokens.erase(suffix_tokens.begin()); suffix_tokens.erase(suffix_tokens.begin());
} }
// Create llama_tokens vectors for the special tokens prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
llama_tokens fim_pre_tokens; prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
fim_pre_tokens.push_back(llama_vocab_fim_pre(vocab)); prefix_tokens.insert(prefix_tokens.end(), llama_vocab_fim_suf(vocab));
llama_tokens bos_tokens; prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
bos_tokens.push_back(llama_vocab_bos(vocab)); prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
llama_tokens fim_suf_tokens;
fim_suf_tokens.push_back(llama_vocab_fim_suf(vocab));
llama_tokens fim_mid_tokens;
fim_mid_tokens.push_back(llama_vocab_fim_mid(vocab));
// Create server_tokens objects
server_tokens fim_pre_token(fim_pre_tokens, mctx != nullptr);
server_tokens bos_token(bos_tokens, mctx != nullptr);
server_tokens fim_suf_token(fim_suf_tokens, mctx != nullptr);
server_tokens fim_mid_token(fim_mid_tokens, mctx != nullptr);
// Insert tokens in the correct order
prefix_tokens.insert(prefix_tokens.begin(), fim_pre_token);
prefix_tokens.insert(prefix_tokens.begin(), bos_token); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), fim_suf_token);
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(fim_mid_token);
prompt_tokens = prefix_tokens; prompt_tokens = prefix_tokens;
} }
else else
{ {
prompt_tokens = tokenize(slot.data, slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
} }
slot.num_prompt_tokens = prompt_tokens.size(); slot.num_prompt_tokens = prompt_tokens.size();
@@ -1637,12 +1803,7 @@ struct llama_server_context
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
}); });
slot.truncated = true; slot.truncated = true;
prompt_tokens = new_tokens;
// Convert new_tokens to server_tokens
std::vector<server_tokens> new_prompt_tokens;
server_tokens new_server_tokens(new_tokens, mctx != nullptr);
new_prompt_tokens.push_back(std::move(new_server_tokens));
prompt_tokens = std::move(new_prompt_tokens);
slot.num_prompt_tokens = prompt_tokens.size(); slot.num_prompt_tokens = prompt_tokens.size();
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
@@ -1662,17 +1823,10 @@ struct llama_server_context
// push the prompt into the sampling context (do not apply grammar) // push the prompt into the sampling context (do not apply grammar)
for (auto &token : prompt_tokens) for (auto &token : prompt_tokens)
{ {
// Convert server_tokens to llama_token for sampling common_sampler_accept(slot.ctx_sampling, token, false);
llama_token tok = token[0]; // Get first token
common_sampler_accept(slot.ctx_sampling, tok, false);
} }
// Convert server_tokens to llama_tokens for comparison slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
std::vector<llama_token> prompt_llama_tokens;
for (const auto &token : prompt_tokens) {
prompt_llama_tokens.push_back(token[0]);
}
slot.n_past = common_part(slot.cache_tokens, prompt_llama_tokens);
// the last token of the cache is not in the KV cache until the next call to llama_decode // the last token of the cache is not in the KV cache until the next call to llama_decode
// (it was sampled, pushed into the "cache_tokens", but not yet put in the context) // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
@@ -1710,12 +1864,7 @@ struct llama_server_context
}); });
} }
// Convert server_tokens to llama_tokens for cache slot.cache_tokens = prompt_tokens;
std::vector<llama_token> cache_llama_tokens;
for (const auto &token : prompt_tokens) {
cache_llama_tokens.push_back(token[0]);
}
slot.cache_tokens = cache_llama_tokens;
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
{ {
@@ -1739,36 +1888,18 @@ struct llama_server_context
}); });
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
// process the prefix of first image
std::vector<server_tokens> prefix_tokens = prompt_tokens;
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
// check if we should process the image
if (slot.n_past < slot.n_prompt_tokens
&& slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
// process the image
int32_t new_n_past;
int32_t res = prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
int32_t n_pos = new_n_past - slot.n_past;
if (res != 0) {
slot.release();
LOG_ERR("failed to process image, res = %d\n", res);
continue;
}
slot.n_past += n_pos;
// slot.n_prompt_tokens_processed += n_pos;
}
LOG_VERBOSE("prompt ingested", { LOG_VERBOSE("prompt ingested", {
{"n_past", slot.n_past}, {"n_past", slot.n_past},
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
{"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())}, {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
}); });
const bool has_images = process_images(slot);
// process the prefix of first image
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
int32_t ga_i = slot.ga_i; int32_t ga_i = slot.ga_i;
int32_t ga_n = slot.ga_n; int32_t ga_n = slot.ga_n;
@@ -1788,6 +1919,19 @@ struct llama_server_context
slot_npast++; slot_npast++;
} }
if (has_images && !ingest_images(slot, n_batch))
{
LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d",
__func__,
slot.id,
slot.task_id
);
// FIXME @phymbert: to be properly tested
// early returning without changing the slot state will block the slot for ever
// no one at the moment is checking the return value
return false;
}
// extract the logits only for the last token // extract the logits only for the last token
if (batch.n_tokens > 0) if (batch.n_tokens > 0)
{ {
@@ -1974,11 +2118,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
} }
std::function<void(int)> shutdown_handler; std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }
inline void signal_handler(int signal) {
exit(1);
}
///////////////////////////////// /////////////////////////////////
//////////////////////////////// ////////////////////////////////
@@ -2016,6 +2156,26 @@ static void start_llama_server() {
json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama) json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama)
{ {
// This is for example a slot data from the json data
// slot->params.stream = json_value(data, "stream", false);
// slot->params.cache_prompt = json_value(data, "cache_prompt", false);
// slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
// slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
// slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
// slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
// slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
// slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
// slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
// slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
// slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
// slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
// slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
// slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
// slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
// slot->params.seed = json_value(data, "seed", default_params.seed);
// slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
// slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
// Create now a json data from the prediction options instead // Create now a json data from the prediction options instead
// //
json data; json data;
@@ -2060,6 +2220,69 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
return data; return data;
} }
// static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
// {
// // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673
// gpt_params default_params;
// llama.stream = streaming;
// llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
// llama.params.sparams.top_k = predict->topk();
// llama.params.sparams.top_p = predict->topp();
// llama.params.sparams.typical_p = predict->typicalp();
// llama.params.sparams.penalty_last_n = predict->repeat();
// llama.params.sparams.temp = predict->temperature();
// llama.params.sparams.penalty_repeat = predict->penalty();
// llama.params.sparams.penalty_present = predict->presencepenalty();
// llama.params.sparams.penalty_freq = predict->frequencypenalty();
// llama.params.sparams.mirostat = predict->mirostat();
// llama.params.sparams.mirostat_tau = predict->mirostattau();
// llama.params.sparams.mirostat_eta = predict->mirostateta();
// llama.params.n_keep = predict->nkeep();
// llama.params.seed = predict->seed();
// llama.params.sparams.grammar = predict->grammar();
// // llama.params.n_probs = predict->
// llama.params.prompt = predict->prompt();
// llama.params.sparams.logit_bias.clear();
// if (predict->ignoreeos())
// {
// llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
// }
// // const auto &logit_bias = body.find("logit_bias");
// // if (logit_bias != body.end() && logit_bias->is_array())
// // {
// // const int n_vocab = llama_n_vocab(llama.model);
// // for (const auto &el : *logit_bias)
// // {
// // if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
// // {
// // llama_token tok = el[0].get<llama_token>();
// // if (tok >= 0 && tok < n_vocab)
// // {
// // if (el[1].is_number())
// // {
// // llama.params.logit_bias[tok] = el[1].get<float>();
// // }
// // else if (el[1].is_boolean() && !el[1].get<bool>())
// // {
// // llama.params.logit_bias[tok] = -INFINITY;
// // }
// // }
// // }
// // }
// // }
// llama.params.antiprompt.clear();
// for (const std::string& stopPrompt : predict->stopprompts()) {
// if (!stopPrompt.empty())
// {
// llama.params.antiprompt.push_back(stopPrompt);
// }
// }
// }
const std::vector<ggml_type> kv_cache_types = { const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32, GGML_TYPE_F32,
@@ -2091,15 +2314,15 @@ static std::string get_all_kv_cache_types() {
} }
static void params_parse(const backend::ModelOptions* request, static void params_parse(const backend::ModelOptions* request,
common_params & params, llama_server_context &llama) { common_params & params) {
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809 // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
params.model.path = request->modelfile(); params.model = request->modelfile();
if (!request->mmproj().empty()) { if (!request->mmproj().empty()) {
// get the directory of modelfile // get the directory of modelfile
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\")); std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
params.mmproj.path = model_dir + "/"+ request->mmproj(); params.mmproj = model_dir + "/"+ request->mmproj();
} }
// params.model_alias ?? // params.model_alias ??
params.model_alias = request->modelfile(); params.model_alias = request->modelfile();
@@ -2129,20 +2352,6 @@ static void params_parse(const backend::ModelOptions* request,
add_rpc_devices(std::string(llama_grpc_servers)); add_rpc_devices(std::string(llama_grpc_servers));
} }
// decode options. Options are in form optname:optvale, or if booleans only optname.
for (int i = 0; i < request->options_size(); i++) {
std::string opt = request->options(i);
char *optname = strtok(&opt[0], ":");
char *optval = strtok(NULL, ":");
if (optval == NULL) {
optval = "true";
}
if (!strcmp(optname, "gpu")) {
llama.has_gpu = true;
}
}
// TODO: Add yarn // TODO: Add yarn
if (!request->tensorsplit().empty()) { if (!request->tensorsplit().empty()) {
@@ -2174,7 +2383,7 @@ static void params_parse(const backend::ModelOptions* request,
scale_factor = request->lorascale(); scale_factor = request->lorascale();
} }
// get the directory of modelfile // get the directory of modelfile
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\")); std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor }); params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
} }
params.use_mlock = request->mlock(); params.use_mlock = request->mlock();
@@ -2236,7 +2445,7 @@ public:
grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) { grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
// Implement LoadModel RPC // Implement LoadModel RPC
common_params params; common_params params;
params_parse(request, params, llama); params_parse(request, params);
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
@@ -2372,10 +2581,10 @@ public:
grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){ grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
json data = parse_options(false, request, llama); json data = parse_options(false, request, llama);
std::vector<server_tokens> tokens = llama.tokenize(data, data["prompt"],false); std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
for (int i=0 ; i< tokens.size(); i++){ for (int i=0 ; i< tokens.size(); i++){
response->add_tokens(tokens[i].llama_token); response->add_tokens(tokens[i]);
} }
return grpc::Status::OK; return grpc::Status::OK;
@@ -2413,9 +2622,7 @@ void RunServer(const std::string& server_address) {
ServerBuilder builder; ServerBuilder builder;
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
builder.RegisterService(&service); builder.RegisterService(&service);
builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
std::unique_ptr<Server> server(builder.BuildAndStart()); std::unique_ptr<Server> server(builder.BuildAndStart());
std::cout << "Server listening on " << server_address << std::endl; std::cout << "Server listening on " << server_address << std::endl;
server->Wait(); server->Wait();
@@ -2424,20 +2631,6 @@ void RunServer(const std::string& server_address) {
int main(int argc, char** argv) { int main(int argc, char** argv) {
std::string server_address("localhost:50051"); std::string server_address("localhost:50051");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
// Define long and short options // Define long and short options
struct option long_options[] = { struct option long_options[] = {
{"addr", required_argument, nullptr, 'a'}, {"addr", required_argument, nullptr, 'a'},

View File

@@ -1,13 +1,13 @@
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3cd0d2fa..6c5e811a 100644 index 7f892beb..0517e529 100644
--- a/tools/mtmd/clip.cpp --- a/examples/llava/clip.cpp
+++ b/tools/mtmd/clip.cpp +++ b/examples/llava/clip.cpp
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima @@ -2766,7 +2766,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); int patch_offset = ctx->has_class_embedding ? 1 : 0;
int* patches_data = (int*)malloc(ggml_nbytes(patches)); int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) { for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1; - patches_data[i] = i + patch_offset;
+ patches_data[i] = i; + patches_data[i] = i + 1;
} }
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data); free(patches_data);

View File

@@ -1,5 +1,7 @@
#!/bin/bash #!/bin/bash
set -e
## Patches ## Patches
## Apply patches from the `patches` directory ## Apply patches from the `patches` directory
for patch in $(ls patches); do for patch in $(ls patches); do
@@ -7,22 +9,21 @@ for patch in $(ls patches); do
patch -d llama.cpp/ -p1 < patches/$patch patch -d llama.cpp/ -p1 < patches/$patch
done done
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/ cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv json.hpp llama.cpp/tools/grpc-server/ cp -rfv json.hpp llama.cpp/examples/grpc-server/
cp -rfv utils.hpp llama.cpp/tools/grpc-server/ cp -rfv utils.hpp llama.cpp/examples/grpc-server/
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
echo "grpc-server already added" echo "grpc-server already added"
else else
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
fi fi
## XXX: In some versions of CMake clip wasn't being built before llama. ## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future. ## This is an hack for now, but it should be fixed in the future.
# cp -rfv llama.cpp/tools/mtmd/clip.h llama.cpp/tools/grpc-server/clip.h cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
# cp -rfv llama.cpp/tools/mtmd/clip-impl.h llama.cpp/tools/grpc-server/clip-impl.h cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
# cp -rfv llama.cpp/tools/mtmd/llava.cpp llama.cpp/tools/grpc-server/llava.cpp echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
# echo '#include "llama.h"' > llama.cpp/tools/grpc-server/llava.h cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
# cat llama.cpp/tools/mtmd/llava.h >> llama.cpp/tools/grpc-server/llava.h cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
# cp -rfv llama.cpp/tools/mtmd/clip.cpp llama.cpp/tools/grpc-server/clip.cpp

View File

@@ -1,4 +1,4 @@
// https://github.com/ggerganov/llama.cpp/blob/master/tools/server/utils.hpp // https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
#pragma once #pragma once
@@ -11,7 +11,7 @@
#include "json.hpp" #include "json.hpp"
#include "../mtmd/clip.h" #include "../llava/clip.h"
using json = nlohmann::json; using json = nlohmann::json;
@@ -480,431 +480,4 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
} }
return ret; return ret;
}
//
// tokenizer and input processing utils
//
// Returns true when `data` is a JSON array whose elements are all integers.
// An empty array also yields true; any non-array value yields false.
static bool json_is_array_of_numbers(const json & data) {
    if (!data.is_array()) {
        return false;
    }
    for (const auto & item : data) {
        if (!item.is_number_integer()) {
            return false;
        }
    }
    return true;
}
// Returns true when `data` is an array that contains BOTH at least one string
// and at least one integer element. Non-array values yield false.
static bool json_is_array_of_mixed_numbers_strings(const json & data) {
    if (!data.is_array()) {
        return false;
    }
    bool has_string = false;
    bool has_number = false;
    for (const auto & item : data) {
        if (item.is_string()) {
            has_string = true;
        }
        if (item.is_number_integer()) {
            has_number = true;
        }
        // stop scanning as soon as both kinds have been observed
        if (has_string && has_number) {
            return true;
        }
    }
    return false;
}
// Looks up each '/'-separated path (key1/key2/...) inside `js`.
// Returns an object mapping every path that could be fully resolved to the
// value found there; paths that hit a missing key or a non-object are omitted.
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
    json result = json::object();
    for (const std::string & path : paths) {
        json node = js;
        bool resolved = true;
        for (const std::string & key : string_split<std::string>(path, /*separator*/ '/')) {
            if (!node.is_object() || !node.contains(key)) {
                resolved = false;
                break; // nothing further down this path can match
            }
            node = node[key];
        }
        if (resolved) {
            result[path] = node;
        }
    }
    return result;
}
/**
 * Tokenizes a prompt given as JSON. Two shapes are handled:
 * - a plain string, example: "string"
 * - an array mixing strings and token ids, example: [12, 34, "string", 56, 78]
 *
 * `add_special` is only honoured for a plain string prompt, or for a string that
 * is the very first element of the array; every later string piece is tokenized
 * with add_special = false. Numeric elements are appended as token ids verbatim.
 */
static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
    if (!json_prompt.is_array()) {
        const auto text = json_prompt.template get<std::string>();
        return common_tokenize(vocab, text, add_special, parse_special);
    }

    llama_tokens prompt_tokens;
    bool is_first = true;
    for (const auto & item : json_prompt) {
        if (item.is_string()) {
            const auto text = item.template get<std::string>();
            // special tokens may only be added in front of the leading element
            const llama_tokens piece = common_tokenize(vocab, text, add_special && is_first, parse_special);
            prompt_tokens.insert(prompt_tokens.end(), piece.begin(), piece.end());
        } else {
            prompt_tokens.push_back(item.template get<llama_token>());
        }
        is_first = false;
    }
    return prompt_tokens;
}
/**
 * Splits the input "prompt" value into one or more prompts and tokenizes each.
 * Single-prompt forms:
 * - "prompt": "string"
 * - "prompt": [12, 34, 56]
 * - "prompt": [12, 34, "string", 56, 78]
 * Multi-prompt (multi-task) forms:
 * - "prompt": ["string1", "string2"]
 * - "prompt": ["string1", [12, 34, 56]]
 * - "prompt": [[12, 34, 56], [78, 90, 12]]
 * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
 *
 * Throws std::runtime_error when the value (or one of its elements) has an
 * unsupported shape, or when no prompt was produced.
 */
static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
    std::vector<llama_tokens> result;

    const bool is_text_or_mixed = json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt);
    if (is_text_or_mixed) {
        // one prompt: a string, or strings interleaved with token ids
        result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
    } else if (json_is_array_of_numbers(json_prompt)) {
        // one prompt: already tokenized
        result.push_back(json_prompt.get<llama_tokens>());
    } else if (json_prompt.is_array()) {
        // several prompts, each in one of the single-prompt forms
        result.reserve(json_prompt.size());
        for (const auto & entry : json_prompt) {
            const bool entry_is_text_or_mixed = entry.is_string() || json_is_array_of_mixed_numbers_strings(entry);
            if (entry_is_text_or_mixed) {
                result.push_back(tokenize_mixed(vocab, entry, add_special, parse_special));
                continue;
            }
            if (json_is_array_of_numbers(entry)) {
                result.push_back(entry.get<llama_tokens>());
                continue;
            }
            throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
        }
    } else {
        throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
    }

    if (result.empty()) {
        throw std::runtime_error("\"prompt\" must not be empty");
    }
    return result;
}
//
// utils for interacting with libmtmd
// (may need to refactor in near future)
//
/**
 * server_tokens is a helper to manage the input tokens and image for the server.
 * it is made this way to simplify the logic of KV cache management.
 *
 * Text tokens are stored as-is; every position occupied by an image chunk is
 * stored as LLAMA_TOKEN_NULL, and the chunk itself is kept in a side map keyed
 * by the position at which it starts.
 */
struct server_tokens {
    bool has_mtmd = false;

private: // disallow accessing these members directly, risking out-of-sync

    // map a **start** position in tokens to the image chunk
    std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;

    // list of tokens
    // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
    // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
    // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
    llama_tokens tokens;

    // for ex. with input of 5 text tokens and 2 images:
    //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
    // pos  0   1   2   3   4   5      6      7      8      9
    // map_pos_to_image will contain: {5, img0}, {8, img1}

public:
    server_tokens() = default;
    ~server_tokens() = default;

    // Prevent copying
    server_tokens(const server_tokens&) = delete;
    server_tokens& operator=(const server_tokens&) = delete;

    // Allow moving (usually implicitly generated if members are movable)
    server_tokens(server_tokens&&) = default;
    server_tokens& operator=(server_tokens&&) = default;

    // Allow accessing elements using [] operator (no bounds checking)
    llama_token operator[](size_t index) { return tokens[index]; }
    const llama_token& operator[](size_t index) const { return tokens[index]; }

    // Builds the token list from a sequence of mtmd chunks (text and/or image).
    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
        for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
            push_back(mtmd_chunks[i]);
        }
    }

    // Builds the token list from plain text tokens (copied).
    server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}

    // for debugging: dumps the token ids (image positions shown as <embd>)
    // followed by the start position of every stored image chunk
    std::string str() const {
        std::ostringstream oss;
        oss << "tokens: ";
        for (const auto & t : tokens) {
            if (t == LLAMA_TOKEN_NULL) {
                oss << "<embd> ";
            } else {
                oss << t << " ";
            }
        }
        oss << "\n";
        oss << "image pos: ";
        for (const auto & it : map_pos_to_image) {
            oss << it.first << ", ";
        }
        return oss.str();
    }

    // Returns the image chunk that STARTS at `pos`.
    // Throws std::runtime_error when no chunk starts at that position.
    const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
        auto it = map_pos_to_image.find(pos);
        if (it != map_pos_to_image.end()) {
            return it->second;
        } else {
            throw std::runtime_error("Chunk not found");
        }
    }

    // Appends one text token. LLAMA_TOKEN_NULL is reserved for image positions
    // and is rejected with std::runtime_error.
    void push_back(llama_token tok) {
        if (tok == LLAMA_TOKEN_NULL) {
            throw std::runtime_error("Invalid token");
        }
        tokens.emplace_back(tok);
    }

    // Appends a chunk: text chunks are appended token by token; image chunks
    // append one LLAMA_TOKEN_NULL per position and store a copy of the chunk.
    // will create a copy of the chunk if it contains non-text data
    void push_back(const mtmd_input_chunk * chunk) {
        auto type = mtmd_input_chunk_get_type(chunk);
        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
            GGML_ASSERT(has_mtmd);
            auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
            const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
            llama_pos start_pos = tokens.size();
            for (int i = 0; i < n_pos; ++i) {
                tokens.emplace_back(LLAMA_TOKEN_NULL);
            }
            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
            map_pos_to_image[start_pos] = std::move(new_chunk);
        } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            size_t n_tokens;
            auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
            for (size_t i = 0; i < n_tokens; ++i) {
                push_back(text_tokens[i]);
            }
        } else {
            GGML_ABORT("Invalid chunk type");
        }
    }

    // for compatibility with context shift and prompt truncation
    void insert(const llama_tokens & inp_tokens) {
        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
        tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
    }

    // for compatibility with speculative decoding, ctx shift, slot save/load
    const llama_tokens & get_text_tokens() const {
        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
        return tokens;
    }

    // for compatibility with speculative decoding
    void set_token(llama_pos pos, llama_token id) {
        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
        tokens[pos] = id;
    }

    size_t size() const {
        return tokens.size();
    }

    bool empty() const {
        return tokens.empty();
    }

    // Removes every token AND every stored image chunk.
    // The chunk map must be emptied together with the token list: a stale entry
    // would otherwise make find_chunk() succeed for a position that no longer
    // holds that image once new tokens are pushed.
    void clear() {
        map_pos_to_image.clear();
        tokens.clear();
    }

    // Truncates the list to its first `n` entries (n must not exceed size()).
    // With mtmd enabled, throws std::runtime_error (via find_chunk) when the
    // last kept entry is an image position that is not the start of a chunk.
    void resize(size_t n) {
        GGML_ASSERT(n <= tokens.size());
        if (has_mtmd) {
            // we throw an error if we try to remove a token in the middle of an image
            // for ex. with input of 5 text tokens and 2 images:
            //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
            // n  1   2   3   4   5   6      7      8      9      10
            // allowed to resize      ^                    ^
            // disallowed to resize          ^      ^             ^
            if (n > 0) {
                llama_token last_token = tokens[n - 1];
                // make sure we never remove tokens in the middle of an image
                if (last_token == LLAMA_TOKEN_NULL) {
                    find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
                }
            }
            // remove all image chunks that are not used anymore
            for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
                llama_pos pos = it->first;
                if (pos >= (llama_pos)n) {
                    it = map_pos_to_image.erase(it);
                } else {
                    ++it;
                }
            }
        }
        tokens.resize(n);
    }

    // Detokenizes the text tokens only; image positions are skipped.
    std::string detokenize(const llama_context * ctx, bool special) const {
        llama_tokens text_tokens;
        text_tokens.reserve(tokens.size());
        for (const auto & t : tokens) {
            if (t != LLAMA_TOKEN_NULL) {
                text_tokens.push_back(t);
            }
        }
        return common_detokenize(ctx, text_tokens, special);
    }

    // Returns the length of the longest common prefix of *this and `b`.
    // Two image chunks match when both their id and their position count match;
    // a matching image is skipped as a whole.
    size_t get_common_prefix(const server_tokens & b) const {
        size_t max_idx = std::min(tokens.size(), b.tokens.size());
        for (size_t i = 0; i < max_idx; ++i) {
            auto & ai = tokens[i];
            auto & bi = b.tokens[i];

            if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
                GGML_ASSERT(has_mtmd);
                const auto & a_chunk = find_chunk(i);
                const auto & b_chunk = b.find_chunk(i);
                GGML_ASSERT(a_chunk && b_chunk);
                const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
                const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
                std::string ai_id = mtmd_image_tokens_get_id(a_img);
                std::string bi_id = mtmd_image_tokens_get_id(b_img);
                size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
                size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
                if (ai_id == bi_id && a_pos == b_pos) {
                    GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
                    i += a_pos - 1; // will be +1 by the for loop
                    continue;
                } else {
                    return i;
                }
            } else if (ai == bi) {
                continue;
            } else {
                return i;
            }
        }
        return max_idx; // all tokens are equal
    }

    // make sure all text tokens are within the vocab range
    // (an image position without a chunk starting there also fails validation)
    bool validate(const struct llama_context * ctx) const {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const int32_t n_vocab = llama_vocab_n_tokens(vocab);

        for (size_t i = 0; i < tokens.size(); ++i) {
            auto & t = tokens[i];
            if (t == LLAMA_TOKEN_NULL) {
                try {
                    const auto & chunk = find_chunk(i);
                    const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
                    size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
                    i += n_pos - 1; // will be +1 by the for loop
                } catch (const std::exception & e) {
                    return false;
                }
            } else if (t < 0 || t >= n_vocab) {
                return false;
            }
        }
        return true;
    }

    // encode and decode the image chunk that starts at position `n_past`.
    // On success returns 0 and stores the new past position in `n_pos_out`;
    // on failure returns the helper's status and sets `n_pos_out` to `n_past`.
    // Throws std::runtime_error when no chunk starts at `n_past`.
    int32_t process_chunk(
                llama_context * ctx,
                mtmd_context * mctx,
                llama_pos n_past,
                int32_t seq_id,
                llama_pos & n_pos_out) {
        auto it = map_pos_to_image.find(n_past);
        if (it == map_pos_to_image.end()) {
            throw std::runtime_error("Chunk not found");
        }
        // SRV_INF("%s\n", "processing image...");
        int32_t n_batch = llama_n_batch(ctx);
        int64_t t0 = ggml_time_ms(); // only consumed by the disabled timing log below
        llama_pos new_n_past = n_past;
        int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
            it->second.get(), // chunk
            n_past,
            seq_id,
            n_batch,
            true, // logits last
            &new_n_past);
        //SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
        if (result != 0) {
            LOG_ERR("mtmd_helper_eval failed with status %d", result);
            n_pos_out = n_past;
            return result;
        }
        n_pos_out = new_n_past;
        return 0;
    }
};
// Computes FNV-1a hash of the data
// Processes `len` bytes starting at `data` with the 64-bit variant: each byte is
// XOR-ed into the running hash, which is then multiplied by the FNV prime
// (unsigned 64-bit arithmetic, so the product wraps).
// Returns the final hash rendered as a decimal string.
static std::string fnv_hash(const uint8_t * data, size_t len) {
// 64-bit FNV prime
const uint64_t fnv_prime = 0x100000001b3ULL;
// starting value of the running hash
uint64_t hash = 0xcbf29ce484222325ULL;
for (size_t i = 0; i < len; ++i) {
hash ^= data[i];
hash *= fnv_prime;
}
return std::to_string(hash);
} }

View File

@@ -8,19 +8,12 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
# keep standard at C11 and C++11 # keep standard at C11 and C++11
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
GOCMD?=go
CGO_LDFLAGS?=
# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
CGO_LDFLAGS_SYCL=
GO_TAGS?=
LD_FLAGS?=
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas) ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DSD_CUDA=ON CMAKE_ARGS+=-DGGML_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically # to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas) else ifeq ($(BUILD_TYPE),openblas)
@@ -30,48 +23,29 @@ else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas) else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DSD_HIPBLAS=ON CMAKE_ARGS+=-DGGML_HIP=ON
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here # But if it's OSX without metal, disable it here
else ifeq ($(OS),Darwin) else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal) ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DSD_METAL=OFF CMAKE_ARGS+=-DGGML_METAL=OFF
else else
CMAKE_ARGS+=-DSD_METAL=ON CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
TARGET+=--target ggml-metal TARGET+=--target ggml-metal
endif endif
endif endif
ifeq ($(BUILD_TYPE),sycl_f16) # ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \ # CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
-DCMAKE_C_COMPILER=icx \ # endif
-DCMAKE_CXX_COMPILER=icpx \
-DSD_SYCL=ON \
-DGGML_SYCL_F16=ON
CC=icx
CXX=icpx
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
endif
ifeq ($(BUILD_TYPE),sycl_f32) # ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \ # CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
-DCMAKE_C_COMPILER=icx \ # endif
-DCMAKE_CXX_COMPILER=icpx \
-DSD_SYCL=ON
CC=icx
CXX=icpx
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
endif
# warnings # warnings
# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
# Find all .a archives in ARCHIVE_DIR # Find all .a archives in ARCHIVE_DIR
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive) # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
@@ -112,24 +86,11 @@ endif
$(MAKE) $(COMBINED_LIB) $(MAKE) $(COMBINED_LIB)
gosd.o: gosd.o:
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
else
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c $(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
endif
libsd.a: gosd.o libsd.a: gosd.o
cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
$(AR) rcs libsd.a gosd.o $(AR) rcs libsd.a gosd.o
stablediffusion-ggml:
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
ifneq ($(UPX),)
$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
endif
clean: clean:
rm -rf gosd.o libsd.a build $(COMBINED_LIB) rm -rf gosd.o libsd.a build $(COMBINED_LIB)

View File

@@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
context.SetTranslate(true) context.SetTranslate(true)
} }
if err := context.Process(data, nil, nil, nil); err != nil { if err := context.Process(data, nil, nil); err != nil {
return pb.TranscriptResult{}, err return pb.TranscriptResult{}, err
} }

View File

@@ -0,0 +1,17 @@
# Default entry point: generates the gRPC stubs, then runs install.sh.
.PHONY: autogptq
autogptq: protogen
bash install.sh
# Generates the Python gRPC/protobuf stubs if they are missing.
.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py
# Deletes the generated stubs.
.PHONY: protogen-clean
protogen-clean:
$(RM) backend_pb2_grpc.py backend_pb2.py
# Both stubs are produced by one protoc run over backend.proto, looked up in ../..
backend_pb2_grpc.py backend_pb2.py:
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
# Removes the generated stubs, the virtualenv and the Python bytecode cache.
.PHONY: clean
clean: protogen-clean
rm -rf venv __pycache__

View File

@@ -0,0 +1,5 @@
# Creating a separate environment for the autogptq project
```
make autogptq
```

View File

@@ -0,0 +1,153 @@
#!/usr/bin/env python3
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import base64
import grpc
import backend_pb2
import backend_pb2_grpc
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextGenerationPipeline
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """gRPC servicer exposing an AutoGPTQ (or Qwen-VL) model for text generation."""

    # Name of the loaded model. Defaults to an empty string so that
    # recompile_vl_prompt() can inspect it even when the loaded model is not
    # Qwen-VL (previously the attribute was only ever set for Qwen-VL, and any
    # other model raised AttributeError on the first Predict call).
    model_name = ""

    def Health(self, request, context):
        """Liveness probe: always replies with the bytes b"OK"."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """Load the tokenizer and model named by ``request.Model``.

        The model path is resolved relative to the ``MODELS_PATH`` environment
        variable (default ``./``). Models whose name contains "qwen-vl" are
        loaded through ``AutoModelForCausalLM``; everything else is loaded as
        a quantized AutoGPTQ model on ``request.Device`` (default "cuda:0").

        Returns a ``Result`` with ``success=False`` and the error text when
        loading raises, otherwise a success ``Result``.
        """
        try:
            device = "cuda:0"
            if request.Device != "":
                device = request.Device

            # support loading local model files
            model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)

            # support model `Qwen/Qwen-VL-Chat-Int4`
            if "qwen-vl" in request.Model.lower():
                model_name = "Qwen-VL-Chat"
                model = AutoModelForCausalLM.from_pretrained(model_path,
                                                             trust_remote_code=request.TrustRemoteCode,
                                                             device_map="auto").eval()
            else:
                model_name = request.Model
                model = AutoGPTQForCausalLM.from_quantized(model_path,
                                                           model_basename=request.ModelBaseName,
                                                           use_safetensors=True,
                                                           trust_remote_code=request.TrustRemoteCode,
                                                           device=device,
                                                           use_triton=request.UseTriton,
                                                           quantize_config=None)

            # Publish the new state only once everything loaded successfully.
            self.model_name = model_name
            self.model = model
            self.tokenizer = tokenizer
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Predict(self, request, context):
        """Generate text for ``request.Prompt`` and return it in a ``Result``.

        Zero-valued ``Penalty``, ``Tokens`` and ``TopP`` fall back to 1.0, 512
        and 0.95 respectively. The prompt itself is stripped from the generated
        text. Image files written for the prompt are always removed afterwards,
        including when generation raises.
        """
        penalty = 1.0
        if request.Penalty != 0.0:
            penalty = request.Penalty
        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens
        top_p = 0.95
        if request.TopP != 0.0:
            top_p = request.TopP

        compiled_prompt, image_paths = self.recompile_vl_prompt(request)
        try:
            print(f"Prompt: {compiled_prompt}", file=sys.stderr)

            # Implement Predict RPC
            pipeline = TextGenerationPipeline(
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=tokens,
                temperature=request.Temperature,
                top_p=top_p,
                repetition_penalty=penalty,
            )
            t = pipeline(compiled_prompt)[0]["generated_text"]
            print(f"generated_text: {t}", file=sys.stderr)

            if compiled_prompt in t:
                t = t.replace(compiled_prompt, "")
        finally:
            # house keeping. Remove the temporary image files, even on failure.
            for img_path in image_paths:
                try:
                    os.remove(img_path)
                except Exception as e:
                    print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)

        # NOTE(review): Health replies with Reply while this returns Result with
        # a bytes payload - confirm against backend.proto which type Predict expects.
        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))

    def PredictStream(self, request, context):
        """Streaming variant of Predict.

        Token-by-token streaming is not implemented yet, so the complete
        prediction is emitted as the single element of the response stream.
        A streaming handler has to produce an iterator of messages, hence the
        ``yield`` rather than returning the message itself.
        """
        yield self.Predict(request, context)

    def recompile_vl_prompt(self, request):
        """Return ``(prompt, image_paths)`` for the request.

        For Qwen-VL models every base64 entry of ``request.Images`` is decoded
        into its own temporary ``.jpg`` file, and the ``[img-<i>]`` placeholder
        in the prompt is replaced by ``<img>path</img>,``. For every other model
        the prompt is returned unchanged with an empty path list.
        """
        # Local import keeps this block self-contained.
        import tempfile

        prompt = request.Prompt
        image_paths = []

        if "qwen-vl" in self.model_name.lower():
            for i, img in enumerate(request.Images):
                # Decode first so that invalid input does not leave a file behind.
                img_data = base64.b64decode(img)
                # A unique name per image: a millisecond timestamp is not enough,
                # several images of one request are written within the same tick.
                with tempfile.NamedTemporaryFile(prefix="vl-", suffix=".jpg", delete=False) as f:
                    f.write(img_data)
                    img_path = f.name
                image_paths.append(img_path)
                prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")

        return (prompt, image_paths)
def serve(address):
    """Start the gRPC backend on ``address`` and block until it is terminated.

    SIGINT and SIGTERM stop the server immediately and exit the process with
    status 0.
    """
    executor = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = grpc.server(executor)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    def _shutdown(signum, frame):
        # Stop without a grace period, then leave the process.
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Same handler for both termination signals.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _shutdown)

    # The server runs on worker threads; keep the main thread parked.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
if __name__ == "__main__":
    # Command-line entry point: the only option is the address to listen on.
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    serve(cli.parse_args().addr)

View File

@@ -0,0 +1,14 @@
#!/bin/bash
set -e

# Load the shared backend helpers. The path is quoted so that a checkout living
# in a directory whose name contains spaces still resolves correctly.
source "$(dirname "$0")/../common/libbackend.sh"

# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

installRequirements

View File

@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118

View File

@@ -0,0 +1 @@
torch==2.4.1

View File

@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0

View File

@@ -0,0 +1,6 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools

View File

@@ -0,0 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.71.0
protobuf
certifi
transformers

4
backend/python/autogptq/run.sh Executable file
View File

@@ -0,0 +1,4 @@
#!/bin/bash
# Load the shared backend helpers (path quoted so directories with spaces work).
source "$(dirname "$0")/../common/libbackend.sh"

# Forward the arguments untouched: "$@" keeps each argument as one word, whereas
# an unquoted $@ would re-split arguments containing whitespace or glob characters.
startBackend "$@"

View File

@@ -0,0 +1,6 @@
#!/bin/bash
set -e

# Load the shared backend helpers (path quoted so directories with spaces work).
source "$(dirname "$0")/../common/libbackend.sh"

runUnittests

View File

@@ -61,12 +61,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True) return backend_pb2.Result(success=True)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View File

@@ -1,4 +1,4 @@
bark==0.1.5 bark==0.1.5
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
certifi certifi

View File

@@ -1,3 +1,3 @@
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
grpcio-tools grpcio-tools

View File

@@ -86,12 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True) return backend_pb2.Result(success=True)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View File

@@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
certifi certifi
packaging==24.1 packaging==24.1

View File

@@ -19,7 +19,7 @@ import grpc
from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \ from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
from diffusers.pipelines.stable_diffusion import safety_checker from diffusers.pipelines.stable_diffusion import safety_checker
from diffusers.utils import load_image, export_to_video from diffusers.utils import load_image, export_to_video
from compel import Compel, ReturnedEmbeddingsType from compel import Compel, ReturnedEmbeddingsType
@@ -168,13 +168,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# We are storing all the options in a dict so we can use it later when # We are storing all the options in a dict so we can use it later when
# generating the images # generating the images
for opt in options: for opt in options:
if ":" not in opt:
continue
key, value = opt.split(":") key, value = opt.split(":")
self.options[key] = value self.options[key] = value
print(f"Options: {self.options}", file=sys.stderr)
local = False local = False
modelFile = request.Model modelFile = request.Model
@@ -291,12 +287,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.LowVRAM: if request.LowVRAM:
self.pipe.enable_model_cpu_offload() self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "Lumina2Text2ImgPipeline":
self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
request.Model,
torch_dtype=torch.bfloat16)
if request.LowVRAM:
self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "SanaPipeline": elif request.PipelineType == "SanaPipeline":
self.pipe = SanaPipeline.from_pretrained( self.pipe = SanaPipeline.from_pretrained(
request.Model, request.Model,
@@ -526,12 +516,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View File

@@ -1,5 +1,5 @@
setuptools setuptools
grpcio==1.72.0 grpcio==1.71.0
pillow pillow
protobuf protobuf
certifi certifi

View File

@@ -105,12 +105,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View File

@@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
certifi certifi
wheel wheel

View File

@@ -62,12 +62,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.TranscriptResult(segments=resultSegments, text=text) return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View File

@@ -1,3 +1,3 @@
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
grpcio-tools grpcio-tools

View File

@@ -99,12 +99,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True) return backend_pb2.Result(success=True)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View File

@@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
phonemizer phonemizer
scipy scipy

View File

@@ -91,12 +91,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.RerankResult(usage=usage, results=results) return backend_pb2.RerankResult(usage=usage, results=results)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View File

@@ -1,3 +1,3 @@
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
certifi certifi

View File

@@ -559,12 +559,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
async def serve(address): async def serve(address):
# Start asyncio gRPC server # Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
# Add the servicer to the server # Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address # Bind the server to the address

View File

@@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
certifi certifi
setuptools setuptools

View File

@@ -194,40 +194,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
await iterations.aclose() await iterations.aclose()
async def _predict(self, request, context, streaming=False): async def _predict(self, request, context, streaming=False):
# Build the sampling parameters
# NOTE: this must stay in sync with the vllm backend
request_to_sampling_params = {
"N": "n",
"PresencePenalty": "presence_penalty",
"FrequencyPenalty": "frequency_penalty",
"RepetitionPenalty": "repetition_penalty",
"Temperature": "temperature",
"TopP": "top_p",
"TopK": "top_k",
"MinP": "min_p",
"Seed": "seed",
"StopPrompts": "stop",
"StopTokenIds": "stop_token_ids",
"BadWords": "bad_words",
"IncludeStopStrInOutput": "include_stop_str_in_output",
"IgnoreEOS": "ignore_eos",
"Tokens": "max_tokens",
"MinTokens": "min_tokens",
"Logprobs": "logprobs",
"PromptLogprobs": "prompt_logprobs",
"SkipSpecialTokens": "skip_special_tokens",
"SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
"TruncatePromptTokens": "truncate_prompt_tokens",
"GuidedDecoding": "guided_decoding",
}
# Build sampling parameters
sampling_params = SamplingParams(top_p=0.9, max_tokens=200) sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
if request.TopP != 0:
for request_field, param_field in request_to_sampling_params.items(): sampling_params.top_p = request.TopP
if hasattr(request, request_field): if request.Tokens > 0:
value = getattr(request, request_field) sampling_params.max_tokens = request.Tokens
if value not in (None, 0, [], False): if request.Temperature != 0:
setattr(sampling_params, param_field, value) sampling_params.temperature = request.Temperature
if request.TopK != 0:
sampling_params.top_k = request.TopK
if request.PresencePenalty != 0:
sampling_params.presence_penalty = request.PresencePenalty
if request.FrequencyPenalty != 0:
sampling_params.frequency_penalty = request.FrequencyPenalty
if request.StopPrompts:
sampling_params.stop = request.StopPrompts
if request.IgnoreEOS:
sampling_params.ignore_eos = request.IgnoreEOS
if request.Seed != 0:
sampling_params.seed = request.Seed
# Extract image paths and process images # Extract image paths and process images
prompt = request.Prompt prompt = request.Prompt
@@ -333,12 +320,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
async def serve(address): async def serve(address):
# Start asyncio gRPC server # Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
# Add the servicer to the server # Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address # Bind the server to the address

View File

@@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.71.0
protobuf protobuf
certifi certifi
setuptools setuptools

View File

@@ -75,53 +75,6 @@ class TestBackendServicer(unittest.TestCase):
finally: finally:
self.tearDown() self.tearDown()
def test_sampling_params(self):
"""
This method tests if all sampling parameters are correctly processed
NOTE: this does NOT test for correctness, just that we received a compatible response
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
req = backend_pb2.PredictOptions(
Prompt="The capital of France is",
TopP=0.8,
Tokens=50,
Temperature=0.7,
TopK=40,
PresencePenalty=0.1,
FrequencyPenalty=0.2,
RepetitionPenalty=1.1,
MinP=0.05,
Seed=42,
StopPrompts=["\n"],
StopTokenIds=[50256],
BadWords=["badword"],
IncludeStopStrInOutput=True,
IgnoreEOS=True,
MinTokens=5,
Logprobs=5,
PromptLogprobs=5,
SkipSpecialTokens=True,
SpacesBetweenSpecialTokens=True,
TruncatePromptTokens=10,
GuidedDecoding=True,
N=2,
)
resp = stub.Predict(req)
self.assertIsNotNone(resp.message)
self.assertIsNotNone(resp.logprobs)
except Exception as err:
print(err)
self.fail("sampling params service failed")
finally:
self.tearDown()
def test_embedding(self): def test_embedding(self):
""" """
This method tests if the embeddings are generated successfully This method tests if the embeddings are generated successfully

View File

@@ -16,7 +16,7 @@ type Application struct {
func newApplication(appConfig *config.ApplicationConfig) *Application { func newApplication(appConfig *config.ApplicationConfig) *Application {
return &Application{ return &Application{
backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath), backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
modelLoader: model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend), modelLoader: model.NewModelLoader(appConfig.ModelPath),
applicationConfig: appConfig, applicationConfig: appConfig,
templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath), templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
} }

View File

@@ -43,12 +43,18 @@ func New(opts ...config.AppOption) (*Application, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to create ModelPath: %q", err) return nil, fmt.Errorf("unable to create ModelPath: %q", err)
} }
if options.GeneratedContentDir != "" { if options.ImageDir != "" {
err := os.MkdirAll(options.GeneratedContentDir, 0750) err := os.MkdirAll(options.ImageDir, 0750)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to create ImageDir: %q", err) return nil, fmt.Errorf("unable to create ImageDir: %q", err)
} }
} }
if options.AudioDir != "" {
err := os.MkdirAll(options.AudioDir, 0750)
if err != nil {
return nil, fmt.Errorf("unable to create AudioDir: %q", err)
}
}
if options.UploadDir != "" { if options.UploadDir != "" {
err := os.MkdirAll(options.UploadDir, 0750) err := os.MkdirAll(options.UploadDir, 0750)
if err != nil { if err != nil {
@@ -137,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
}() }()
} }
if options.LoadToMemory != nil && !options.SingleBackend { if options.LoadToMemory != nil {
for _, m := range options.LoadToMemory { for _, m := range options.LoadToMemory {
cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options) cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
if err != nil { if err != nil {

View File

@@ -17,7 +17,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
var fn func() ([]float32, error) var fn func() ([]float32, error)
switch model := inferenceModel.(type) { switch model := inferenceModel.(type) {

View File

@@ -16,7 +16,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
fn := func() error { fn := func() error {
_, err := inferenceModel.GenerateImage( _, err := inferenceModel.GenerateImage(

View File

@@ -53,7 +53,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
var protoMessages []*proto.Message var protoMessages []*proto.Message
// if we are using the tokenizer template, we need to convert the messages to proto messages // if we are using the tokenizer template, we need to convert the messages to proto messages

View File

@@ -40,6 +40,10 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
grpcOpts := grpcModelOpts(c) grpcOpts := grpcModelOpts(c)
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts)) defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
if so.SingleBackend {
defOpts = append(defOpts, model.WithSingleActiveBackend())
}
if so.ParallelBackendRequests { if so.ParallelBackendRequests {
defOpts = append(defOpts, model.EnableParallelRequests) defOpts = append(defOpts, model.EnableParallelRequests)
} }
@@ -99,7 +103,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
mmap = *c.MMap mmap = *c.MMap
} }
ctxSize := 4096 ctxSize := 1024
if c.ContextSize != nil { if c.ContextSize != nil {
ctxSize = *c.ContextSize ctxSize = *c.ContextSize
} }
@@ -117,7 +121,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
triggers := make([]*pb.GrammarTrigger, 0) triggers := make([]*pb.GrammarTrigger, 0)
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers { for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
triggers = append(triggers, &pb.GrammarTrigger{ triggers = append(triggers, &pb.GrammarTrigger{
Word: t.Word, Word: t.Word,
}) })
} }
@@ -157,33 +161,38 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
DisableLogStatus: c.DisableLogStatus, DisableLogStatus: c.DisableLogStatus,
DType: c.DType, DType: c.DType,
// LimitMMPerPrompt vLLM // LimitMMPerPrompt vLLM
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt), LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt), LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt), LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj, MMProj: c.MMProj,
FlashAttention: c.FlashAttention, FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK, CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV, CacheTypeValue: c.CacheTypeV,
NoKVOffload: c.NoKVOffloading, NoKVOffload: c.NoKVOffloading,
YarnExtFactor: c.YarnExtFactor, YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor, YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast, YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow, YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA, NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps, RMSNormEps: c.RMSNormEps,
MLock: mmlock, MLock: mmlock,
RopeFreqBase: c.RopeFreqBase, RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling, RopeScaling: c.RopeScaling,
Type: c.ModelType, Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale, RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA, NUMA: c.NUMA,
Embeddings: embeddings, Embeddings: embeddings,
LowVRAM: lowVRAM, LowVRAM: lowVRAM,
NGPULayers: int32(nGPULayers), NGPULayers: int32(nGPULayers),
MMap: mmap, MMap: mmap,
MainGPU: c.MainGPU, MainGPU: c.MainGPU,
Threads: int32(*c.Threads), Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit, TensorSplit: c.TensorSplit,
// AutoGPTQ
ModelBaseName: c.AutoGPTQ.ModelBaseName,
Device: c.AutoGPTQ.Device,
UseTriton: c.AutoGPTQ.Triton,
UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
// RWKV // RWKV
Tokenizer: c.Tokenizer, Tokenizer: c.Tokenizer,
} }

View File

@@ -12,10 +12,10 @@ import (
func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) { func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
opts := ModelOptions(backendConfig, appConfig) opts := ModelOptions(backendConfig, appConfig)
rerankModel, err := loader.Load(opts...) rerankModel, err := loader.Load(opts...)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
if rerankModel == nil { if rerankModel == nil {
return nil, fmt.Errorf("could not load rerank model") return nil, fmt.Errorf("could not load rerank model")

View File

@@ -26,26 +26,21 @@ func SoundGeneration(
opts := ModelOptions(backendConfig, appConfig) opts := ModelOptions(backendConfig, appConfig)
soundGenModel, err := loader.Load(opts...) soundGenModel, err := loader.Load(opts...)
if err != nil { if err != nil {
return "", nil, err return "", nil, err
} }
defer loader.Close()
if soundGenModel == nil { if soundGenModel == nil {
return "", nil, fmt.Errorf("could not load sound generation model") return "", nil, fmt.Errorf("could not load sound generation model")
} }
if err := os.MkdirAll(appConfig.GeneratedContentDir, 0750); err != nil { if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
return "", nil, fmt.Errorf("failed creating audio directory: %s", err) return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
} }
audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio") fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
if err := os.MkdirAll(audioDir, 0750); err != nil { filePath := filepath.Join(appConfig.AudioDir, fileName)
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
}
fileName := utils.GenerateUniqueFileName(audioDir, "sound_generation", ".wav")
filePath := filepath.Join(audioDir, fileName)
res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{ res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
Text: text, Text: text,

View File

@@ -20,7 +20,6 @@ func TokenMetrics(
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
if model == nil { if model == nil {
return nil, fmt.Errorf("could not loadmodel model") return nil, fmt.Errorf("could not loadmodel model")

View File

@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
opts := ModelOptions(backendConfig, appConfig) opts := ModelOptions(backendConfig, appConfig)
inferenceModel, err = loader.Load(opts...) inferenceModel, err = loader.Load(opts...)
if err != nil { if err != nil {
return schema.TokenizeResponse{}, err return schema.TokenizeResponse{}, err
} }
defer loader.Close()
predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath) predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
predictOptions.Prompt = s predictOptions.Prompt = s

View File

@@ -24,7 +24,6 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer ml.Close()
if transcriptionModel == nil { if transcriptionModel == nil {
return nil, fmt.Errorf("could not load transcription model") return nil, fmt.Errorf("could not load transcription model")

View File

@@ -23,22 +23,21 @@ func ModelTTS(
) (string, *proto.Result, error) { ) (string, *proto.Result, error) {
opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend)) opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
ttsModel, err := loader.Load(opts...) ttsModel, err := loader.Load(opts...)
if err != nil { if err != nil {
return "", nil, err return "", nil, err
} }
defer loader.Close()
if ttsModel == nil { if ttsModel == nil {
return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model) return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
} }
audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio") if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
if err := os.MkdirAll(audioDir, 0750); err != nil {
return "", nil, fmt.Errorf("failed creating audio directory: %s", err) return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
} }
fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav") fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
filePath := filepath.Join(audioDir, fileName) filePath := filepath.Join(appConfig.AudioDir, fileName)
// We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect. // We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
// This should be addressed in a follow up PR soon. // This should be addressed in a follow up PR soon.

View File

@@ -19,8 +19,6 @@ func VAD(request *schema.VADRequest,
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer ml.Close()
req := proto.VADRequest{ req := proto.VADRequest{
Audio: request.Audio, Audio: request.Audio,
} }

View File

@@ -1,36 +0,0 @@
package backend
import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
)
func VideoGeneration(height, width int32, prompt, startImage, endImage, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
opts := ModelOptions(backendConfig, appConfig)
inferenceModel, err := loader.Load(
opts...,
)
if err != nil {
return nil, err
}
defer loader.Close()
fn := func() error {
_, err := inferenceModel.GenerateVideo(
appConfig.Context,
&proto.GenerateVideoRequest{
Height: height,
Width: width,
Prompt: prompt,
StartImage: startImage,
EndImage: endImage,
Dst: dst,
})
return err
}
return fn, nil
}

View File

@@ -1,13 +1,11 @@
package cliContext package cliContext
import ( import "embed"
rice "github.com/GeertJohan/go.rice"
)
type Context struct { type Context struct {
Debug bool `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"` Debug bool `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"` LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`
// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI // This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
BackendAssets *rice.Box `kong:"-"` BackendAssets embed.FS `kong:"-"`
} }

View File

@@ -21,7 +21,8 @@ type RunCMD struct {
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"` ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"` BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
GeneratedContentPath string `env:"LOCALAI_GENERATED_CONTENT_PATH,GENERATED_CONTENT_PATH" type:"path" default:"/tmp/generated/content" help:"Location for generated content (e.g. images, audio, videos)" group:"storage"` ImagePath string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
AudioPath string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"` UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"` ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"` LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
@@ -37,7 +38,7 @@ type RunCMD struct {
F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"` F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"` Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"` ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"` Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"` CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
@@ -46,7 +47,7 @@ type RunCMD struct {
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"` CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"` UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"` APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disables the web user interface. When set to true, the server will only expose API endpoints without serving the web interface" group:"api"` DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"` DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"` OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
UseSubtleKeyComparison bool `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"` UseSubtleKeyComparison bool `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
@@ -80,7 +81,8 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
config.WithModelPath(r.ModelsPath), config.WithModelPath(r.ModelsPath),
config.WithContextSize(r.ContextSize), config.WithContextSize(r.ContextSize),
config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel), config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
config.WithGeneratedContentDir(r.GeneratedContentPath), config.WithImageDir(r.ImagePath),
config.WithAudioDir(r.AudioPath),
config.WithUploadDir(r.UploadPath), config.WithUploadDir(r.UploadPath),
config.WithConfigsDir(r.ConfigPath), config.WithConfigsDir(r.ConfigPath),
config.WithDynamicConfigDir(r.LocalaiConfigDir), config.WithDynamicConfigDir(r.LocalaiConfigDir),

View File

@@ -70,11 +70,11 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
opts := &config.ApplicationConfig{ opts := &config.ApplicationConfig{
ModelPath: t.ModelsPath, ModelPath: t.ModelsPath,
Context: context.Background(), Context: context.Background(),
GeneratedContentDir: outputDir, AudioDir: outputDir,
AssetsDestination: t.BackendAssetsPath, AssetsDestination: t.BackendAssetsPath,
ExternalGRPCBackends: externalBackends, ExternalGRPCBackends: externalBackends,
} }
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) ml := model.NewModelLoader(opts.ModelPath)
defer func() { defer func() {
err := ml.StopAllGRPC() err := ml.StopAllGRPC()

View File

@@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
} }
cl := config.NewBackendConfigLoader(t.ModelsPath) cl := config.NewBackendConfigLoader(t.ModelsPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) ml := model.NewModelLoader(opts.ModelPath)
if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil { if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
return err return err
} }

View File

@@ -36,12 +36,12 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
text := strings.Join(t.Text, " ") text := strings.Join(t.Text, " ")
opts := &config.ApplicationConfig{ opts := &config.ApplicationConfig{
ModelPath: t.ModelsPath, ModelPath: t.ModelsPath,
Context: context.Background(), Context: context.Background(),
GeneratedContentDir: outputDir, AudioDir: outputDir,
AssetsDestination: t.BackendAssetsPath, AssetsDestination: t.BackendAssetsPath,
} }
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) ml := model.NewModelLoader(opts.ModelPath)
defer func() { defer func() {
err := ml.StopAllGRPC() err := ml.StopAllGRPC()

View File

@@ -7,11 +7,11 @@ import (
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
gguf "github.com/gpustack/gguf-parser-go"
cliContext "github.com/mudler/LocalAI/core/cli/context" cliContext "github.com/mudler/LocalAI/core/cli/context"
"github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/pkg/downloader" "github.com/mudler/LocalAI/pkg/downloader"
gguf "github.com/thxcode/gguf-parser-go"
) )
type UtilCMD struct { type UtilCMD struct {
@@ -51,7 +51,7 @@ func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
log.Info(). log.Info().
Any("eosTokenID", f.Tokenizer().EOSTokenID). Any("eosTokenID", f.Tokenizer().EOSTokenID).
Any("bosTokenID", f.Tokenizer().BOSTokenID). Any("bosTokenID", f.Tokenizer().BOSTokenID).
Any("modelName", f.Metadata().Name). Any("modelName", f.Model().Name).
Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0]) Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])
log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer") log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")

View File

@@ -2,11 +2,11 @@ package config
import ( import (
"context" "context"
"embed"
"encoding/json" "encoding/json"
"regexp" "regexp"
"time" "time"
rice "github.com/GeertJohan/go.rice"
"github.com/mudler/LocalAI/pkg/xsysinfo" "github.com/mudler/LocalAI/pkg/xsysinfo"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
) )
@@ -19,21 +19,20 @@ type ApplicationConfig struct {
UploadLimitMB, Threads, ContextSize int UploadLimitMB, Threads, ContextSize int
F16 bool F16 bool
Debug bool Debug bool
GeneratedContentDir string ImageDir string
AudioDir string
ConfigsDir string UploadDir string
UploadDir string ConfigsDir string
DynamicConfigsDir string
DynamicConfigsDir string DynamicConfigsDirPollInterval time.Duration
DynamicConfigsDirPollInterval time.Duration CORS bool
CORS bool CSRF bool
CSRF bool PreloadJSONModels string
PreloadJSONModels string PreloadModelsFromPath string
PreloadModelsFromPath string CORSAllowOrigins string
CORSAllowOrigins string ApiKeys []string
ApiKeys []string P2PToken string
P2PToken string P2PNetworkID string
P2PNetworkID string
DisableWebUI bool DisableWebUI bool
EnforcePredownloadScans bool EnforcePredownloadScans bool
@@ -47,7 +46,7 @@ type ApplicationConfig struct {
Galleries []Gallery Galleries []Gallery
BackendAssets *rice.Box BackendAssets embed.FS
AssetsDestination string AssetsDestination string
ExternalGRPCBackends map[string]string ExternalGRPCBackends map[string]string
@@ -198,7 +197,7 @@ func WithBackendAssetsOutput(out string) AppOption {
} }
} }
func WithBackendAssets(f *rice.Box) AppOption { func WithBackendAssets(f embed.FS) AppOption {
return func(o *ApplicationConfig) { return func(o *ApplicationConfig) {
o.BackendAssets = f o.BackendAssets = f
} }
@@ -280,9 +279,15 @@ func WithDebug(debug bool) AppOption {
} }
} }
func WithGeneratedContentDir(generatedContentDir string) AppOption { func WithAudioDir(audioDir string) AppOption {
return func(o *ApplicationConfig) { return func(o *ApplicationConfig) {
o.GeneratedContentDir = generatedContentDir o.AudioDir = audioDir
}
}
func WithImageDir(imageDir string) AppOption {
return func(o *ApplicationConfig) {
o.ImageDir = imageDir
} }
} }

View File

@@ -50,6 +50,9 @@ type BackendConfig struct {
// LLM configs (GPT4ALL, Llama.cpp, ...) // LLM configs (GPT4ALL, Llama.cpp, ...)
LLMConfig `yaml:",inline"` LLMConfig `yaml:",inline"`
// AutoGPTQ specifics
AutoGPTQ AutoGPTQ `yaml:"autogptq"`
// Diffusers // Diffusers
Diffusers Diffusers `yaml:"diffusers"` Diffusers Diffusers `yaml:"diffusers"`
Step int `yaml:"step"` Step int `yaml:"step"`
@@ -173,6 +176,14 @@ type LimitMMPerPrompt struct {
LimitAudioPerPrompt int `yaml:"audio"` LimitAudioPerPrompt int `yaml:"audio"`
} }
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
type AutoGPTQ struct {
ModelBaseName string `yaml:"model_base_name"`
Device string `yaml:"device"`
Triton bool `yaml:"triton"`
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
}
// TemplateConfig is a struct that holds the configuration of the templating system // TemplateConfig is a struct that holds the configuration of the templating system
type TemplateConfig struct { type TemplateConfig struct {
// Chat is the template used in the chat completion endpoint // Chat is the template used in the chat completion endpoint
@@ -304,6 +315,9 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
defaultTFZ := 1.0 defaultTFZ := 1.0
defaultZero := 0 defaultZero := 0
// Try to offload all GPU layers (if GPU is found)
defaultHigh := 99999999
trueV := true trueV := true
falseV := false falseV := false
@@ -363,6 +377,9 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
if cfg.MirostatTAU == nil { if cfg.MirostatTAU == nil {
cfg.MirostatTAU = &defaultMirostatTAU cfg.MirostatTAU = &defaultMirostatTAU
} }
if cfg.NGPULayers == nil {
cfg.NGPULayers = &defaultHigh
}
if cfg.LowVRAM == nil { if cfg.LowVRAM == nil {
cfg.LowVRAM = &falseV cfg.LowVRAM = &falseV
@@ -372,6 +389,16 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Embeddings = &falseV cfg.Embeddings = &falseV
} }
// Value passed by the top level are treated as default (no implicit defaults)
// defaults are set by the user
if ctx == 0 {
ctx = 1024
}
if cfg.ContextSize == nil {
cfg.ContextSize = &ctx
}
if threads == 0 { if threads == 0 {
// Threads can't be 0 // Threads can't be 0
threads = 4 threads = 4
@@ -393,7 +420,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Debug = &trueV cfg.Debug = &trueV
} }
guessDefaultsFromFile(cfg, lo.modelPath, ctx) guessDefaultsFromFile(cfg, lo.modelPath)
} }
func (c *BackendConfig) Validate() bool { func (c *BackendConfig) Validate() bool {
@@ -430,19 +457,18 @@ func (c *BackendConfig) HasTemplate() bool {
type BackendConfigUsecases int type BackendConfigUsecases int
const ( const (
FLAG_ANY BackendConfigUsecases = 0b000000000000 FLAG_ANY BackendConfigUsecases = 0b00000000000
FLAG_CHAT BackendConfigUsecases = 0b000000000001 FLAG_CHAT BackendConfigUsecases = 0b00000000001
FLAG_COMPLETION BackendConfigUsecases = 0b000000000010 FLAG_COMPLETION BackendConfigUsecases = 0b00000000010
FLAG_EDIT BackendConfigUsecases = 0b000000000100 FLAG_EDIT BackendConfigUsecases = 0b00000000100
FLAG_EMBEDDINGS BackendConfigUsecases = 0b000000001000 FLAG_EMBEDDINGS BackendConfigUsecases = 0b00000001000
FLAG_RERANK BackendConfigUsecases = 0b000000010000 FLAG_RERANK BackendConfigUsecases = 0b00000010000
FLAG_IMAGE BackendConfigUsecases = 0b000000100000 FLAG_IMAGE BackendConfigUsecases = 0b00000100000
FLAG_TRANSCRIPT BackendConfigUsecases = 0b000001000000 FLAG_TRANSCRIPT BackendConfigUsecases = 0b00001000000
FLAG_TTS BackendConfigUsecases = 0b000010000000 FLAG_TTS BackendConfigUsecases = 0b00010000000
FLAG_SOUND_GENERATION BackendConfigUsecases = 0b000100000000 FLAG_SOUND_GENERATION BackendConfigUsecases = 0b00100000000
FLAG_TOKENIZE BackendConfigUsecases = 0b001000000000 FLAG_TOKENIZE BackendConfigUsecases = 0b01000000000
FLAG_VAD BackendConfigUsecases = 0b010000000000 FLAG_VAD BackendConfigUsecases = 0b10000000000
FLAG_VIDEO BackendConfigUsecases = 0b100000000000
// Common Subsets // Common Subsets
FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
@@ -463,7 +489,6 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
"FLAG_TOKENIZE": FLAG_TOKENIZE, "FLAG_TOKENIZE": FLAG_TOKENIZE,
"FLAG_VAD": FLAG_VAD, "FLAG_VAD": FLAG_VAD,
"FLAG_LLM": FLAG_LLM, "FLAG_LLM": FLAG_LLM,
"FLAG_VIDEO": FLAG_VIDEO,
} }
} }
@@ -528,17 +553,6 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
return false return false
} }
}
if (u & FLAG_VIDEO) == FLAG_VIDEO {
videoBackends := []string{"diffusers", "stablediffusion"}
if !slices.Contains(videoBackends, c.Backend) {
return false
}
if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
return false
}
} }
if (u & FLAG_RERANK) == FLAG_RERANK { if (u & FLAG_RERANK) == FLAG_RERANK {
if c.Backend != "rerankers" { if c.Backend != "rerankers" {
@@ -551,7 +565,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
} }
} }
if (u & FLAG_TTS) == FLAG_TTS { if (u & FLAG_TTS) == FLAG_TTS {
ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"} ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
if !slices.Contains(ttsBackends, c.Backend) { if !slices.Contains(ttsBackends, c.Backend) {
return false return false
} }

View File

@@ -1,296 +0,0 @@
package config
import (
"strings"
"github.com/mudler/LocalAI/pkg/xsysinfo"
"github.com/rs/zerolog/log"
gguf "github.com/gpustack/gguf-parser-go"
)
// familyType identifies a model family for which default settings are known.
type familyType uint8
// Model families that identifyFamily can report. Unknown is the zero value
// and is returned when no family could be determined.
const (
Unknown familyType = iota
LLaMa3
CommandR
Phi3
ChatML
Mistral03
Gemma
DeepSeek2
)
// Fallback values used when the configuration does not provide its own.
const (
// defaultContextSize is the fallback context size.
defaultContextSize = 1024
// defaultNGPULayers is a very high layer count, used to request that all
// layers are offloaded when no explicit value is set.
defaultNGPULayers = 99999999
)
// settingsConfig groups the defaults that are applied to a model of a given
// family.
type settingsConfig struct {
// StopWords is applied only when the model configuration has no stop words.
StopWords []string
// TemplateConfig replaces the template configuration of the model.
TemplateConfig TemplateConfig
// RepeatPenalty is applied only when the configured repeat penalty is 0.
RepeatPenalty float64
}
// defaultsSettings holds the default settings to adopt with a given model
// family: prompt templates, stop words and, where set, the repeat penalty.
// guessGGUFFromFile looks a family up here once identifyFamily has recognised
// the model.
//
// The templates are Go text/template sources. In the CommandR ChatMessage
// template the function call is rendered as JSON immediately followed by the
// end-of-turn token, with no extra closing brace after the action.
//
// NOTE(review): DeepSeek's end-of-sentence token is usually written with
// fullwidth bars around the text; confirm that the literal below matches the
// token the model actually emits.
var defaultsSettings = map[familyType]settingsConfig{
	Gemma: {
		RepeatPenalty: 1.0,
		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
		TemplateConfig: TemplateConfig{
			Chat:        "{{.Input }}\n<start_of_turn>model\n",
			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
			Completion:  "{{.Input}}",
		},
	},
	DeepSeek2: {
		StopWords: []string{"<end▁of▁sentence>"},
		TemplateConfig: TemplateConfig{
			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
{{ end -}}
{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<end▁of▁sentence>{{end}}
{{if eq .RoleName "system" -}}{{.Content}}
{{end -}}`,
			Chat: "{{.Input -}}\nAssistant: ",
		},
	},
	LLaMa3: {
		StopWords: []string{"<|eot_id|>"},
		TemplateConfig: TemplateConfig{
			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
		},
	},
	CommandR: {
		TemplateConfig: TemplateConfig{
			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a function calling AI model, you can call the following functions:
## Available Tools
{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
			ChatMessage: `{{if eq .RoleName "user" -}}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "system" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "assistant" -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "tool" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if .FunctionCall -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}<|END_OF_TURN_TOKEN|>
{{- end -}}`,
		},
		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
	},
	Phi3: {
		TemplateConfig: TemplateConfig{
			Chat:        "{{.Input}}\n<|assistant|>",
			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
			Completion:  "{{.Input}}",
		},
		StopWords: []string{"<|end|>", "<|endoftext|>"},
	},
	ChatML: {
		TemplateConfig: TemplateConfig{
			Chat: "{{.Input -}}\n<|im_start|>assistant",
			Functions: `<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant`,
			ChatMessage: `<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>`,
		},
		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
	},
	Mistral03: {
		TemplateConfig: TemplateConfig{
			Chat:      "{{.Input -}}",
			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
			ChatMessage: `{{if eq .RoleName "user" -}}
[INST] {{.Content }} [/INST]
{{- else if .FunctionCall -}}
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
{{- else if eq .RoleName "tool" -}}
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
{{- else -}}
{{ .Content -}}
{{ end -}}`,
		},
		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
	},
}
// knownTemplates maps well-known chat templates used in HF to the model
// families defined above. identifyFamily looks up the value of the
// "tokenizer.chat_template" metadata key in this map, so a key only matches
// when the template text is identical to it.
var knownTemplates = map[string]familyType{
`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
}
// guessGGUFFromFile fills the settings that cfg leaves unset with values
// derived from the parsed GGUF file f and from the detected hardware.
//
// In order it sets the context size, the "gpu" option, the number of GPU
// layers and, unless cfg already has a template, the model name plus the
// templates, stop words and repeat penalty of the identified model family.
//
// defaultCtx is the context size requested by the caller and is used whenever
// cfg has no context size of its own; 0 means "not requested", in which case
// the size is estimated from the file and defaultContextSize is the last
// resort.
func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
	if cfg.ContextSize == nil {
		// Precedence: caller-provided default, then the estimate computed
		// from the file, then the built-in fallback. This guarantees that the
		// context size is always set when the function returns.
		ctxSize := defaultCtx
		if ctxSize == 0 {
			ctxSize = int(f.EstimateLLaMACppRun().ContextSize)
		}
		if ctxSize <= 0 {
			ctxSize = defaultContextSize
		}
		cfg.ContextSize = &ctxSize
	}

	// GPU options
	if cfg.Options == nil {
		if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
			cfg.Options = []string{"gpu"}
		}
	}

	// vram estimation; failures are logged and do not stop the guessing
	vram, err := xsysinfo.TotalAvailableVRAM()
	if err != nil {
		log.Error().Msgf("guessDefaultsFromFile(TotalAvailableVRAM): %s", err)
	} else if vram > 0 {
		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
		if err != nil {
			log.Error().Msgf("guessDefaultsFromFile(EstimateGGUFVRAMUsage): %s", err)
		} else {
			if estimate.IsFullOffload {
				log.Warn().Msgf("guessDefaultsFromFile: %s", "full offload is recommended")
			}

			if estimate.EstimatedVRAM > vram {
				log.Warn().Msgf("guessDefaultsFromFile: %s", "estimated VRAM usage is greater than available VRAM")
			}

			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
				log.Debug().Msgf("guessDefaultsFromFile: %d layers estimated", estimate.EstimatedLayers)
				cfg.NGPULayers = &estimate.EstimatedLayers
			}
		}
	}

	if cfg.NGPULayers == nil {
		// we assume we want to offload all layers
		defaultHigh := defaultNGPULayers
		cfg.NGPULayers = &defaultHigh
	}

	log.Debug().Any("NGPULayers", cfg.NGPULayers).Msgf("guessDefaultsFromFile: %s", "NGPULayers set")

	// template estimations
	if cfg.HasTemplate() {
		// nothing to guess here
		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
		return
	}

	log.Debug().
		Any("eosTokenID", f.Tokenizer().EOSTokenID).
		Any("bosTokenID", f.Tokenizer().BOSTokenID).
		Any("modelName", f.Metadata().Name).
		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())

	// guess the name
	if cfg.Name == "" {
		cfg.Name = f.Metadata().Name
	}

	family := identifyFamily(f)

	if family == Unknown {
		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
		return
	}

	// identify template
	settings, ok := defaultsSettings[family]
	if ok {
		cfg.TemplateConfig = settings.TemplateConfig
		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
		// stop words and repeat penalty are only defaults: explicit values win
		if len(cfg.StopWords) == 0 {
			cfg.StopWords = settings.StopWords
		}
		if cfg.RepeatPenalty == 0.0 {
			cfg.RepeatPenalty = settings.RepeatPenalty
		}
	} else {
		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
	}

	if cfg.HasTemplate() {
		return
	}

	// the family gave us no usable template: fall back to the raw jinja
	// template stored in the file, if any
	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
	if found {
		// try to use the jinja template
		cfg.TemplateConfig.JinjaTemplate = true
		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
	}
}
// identifyFamily returns the model family of the GGUF file f, or Unknown when
// it cannot be determined.
//
// A verbatim match of the file's "tokenizer.chat_template" metadata against
// knownTemplates takes precedence. Otherwise the family is derived from the
// architecture name, the model name and the BOS/EOS token IDs.
func identifyFamily(f *gguf.GGUFFile) familyType {
	// identify from well known templates first
	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
	if found {
		if tmpl := chatTemplate.ValueString(); tmpl != "" {
			if family, ok := knownTemplates[tmpl]; ok {
				return family
			}
		}
	}

	// otherwise try to identify from the model properties
	arch := f.Architecture().Architecture
	eosTokenID := f.Tokenizer().EOSTokenID
	bosTokenID := f.Tokenizer().BOSTokenID

	// Mistral 0.3 and Yi share the same BOS/EOS token IDs, so the two cannot
	// be told apart here: such models are reported as ChatML, and Mistral03
	// is only reachable through knownTemplates.
	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2

	llama3 := arch == "llama" && eosTokenID == 128009
	commandR := arch == "command-r" && eosTokenID == 255001
	qwen2 := arch == "qwen2"
	// GGUF files written by llama.cpp use "phi3" as architecture name; the
	// hyphenated spelling is still accepted for backward compatibility.
	phi3 := arch == "phi3" || arch == "phi-3"
	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")
	deepseek2 := arch == "deepseek2"

	// the order of the cases defines the precedence between the checks
	switch {
	case deepseek2:
		return DeepSeek2
	case gemma:
		return Gemma
	case llama3:
		return LLaMa3
	case commandR:
		return CommandR
	case phi3:
		return Phi3
	case qwen2, isYI:
		return ChatML
	default:
		return Unknown
	}
}

View File

@@ -3,12 +3,147 @@ package config
import ( import (
"os" "os"
"path/filepath" "path/filepath"
"strings"
gguf "github.com/gpustack/gguf-parser-go"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
gguf "github.com/thxcode/gguf-parser-go"
) )
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) { type familyType uint8
// Model families that identifyFamily can report. Unknown is the zero value
// and is returned when no family could be determined.
const (
Unknown familyType = iota
LLaMa3
CommandR
Phi3
ChatML
Mistral03
Gemma
DeepSeek2
)
// settingsConfig groups the defaults that are applied to a model of a given
// family.
type settingsConfig struct {
// StopWords is applied only when the model configuration has no stop words.
StopWords []string
// TemplateConfig replaces the template configuration of the model.
TemplateConfig TemplateConfig
// RepeatPenalty is applied only when the configured repeat penalty is 0.
RepeatPenalty float64
}
// defaultsSettings holds the default settings to adopt with a given model
// family: prompt templates, stop words and, where set, the repeat penalty.
// guessDefaultsFromFile looks a family up here once identifyFamily has
// recognised the model.
//
// The templates are Go text/template sources. In the CommandR ChatMessage
// template the function call is rendered as JSON immediately followed by the
// end-of-turn token, with no extra closing brace after the action.
//
// NOTE(review): DeepSeek's end-of-sentence token is usually written with
// fullwidth bars around the text; confirm that the literal below matches the
// token the model actually emits.
var defaultsSettings = map[familyType]settingsConfig{
	Gemma: {
		RepeatPenalty: 1.0,
		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
		TemplateConfig: TemplateConfig{
			Chat:        "{{.Input }}\n<start_of_turn>model\n",
			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
			Completion:  "{{.Input}}",
		},
	},
	DeepSeek2: {
		StopWords: []string{"<end▁of▁sentence>"},
		TemplateConfig: TemplateConfig{
			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
{{ end -}}
{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<end▁of▁sentence>{{end}}
{{if eq .RoleName "system" -}}{{.Content}}
{{end -}}`,
			Chat: "{{.Input -}}\nAssistant: ",
		},
	},
	LLaMa3: {
		StopWords: []string{"<|eot_id|>"},
		TemplateConfig: TemplateConfig{
			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
		},
	},
	CommandR: {
		TemplateConfig: TemplateConfig{
			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a function calling AI model, you can call the following functions:
## Available Tools
{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
			ChatMessage: `{{if eq .RoleName "user" -}}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "system" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "assistant" -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "tool" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if .FunctionCall -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}<|END_OF_TURN_TOKEN|>
{{- end -}}`,
		},
		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
	},
	Phi3: {
		TemplateConfig: TemplateConfig{
			Chat:        "{{.Input}}\n<|assistant|>",
			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
			Completion:  "{{.Input}}",
		},
		StopWords: []string{"<|end|>", "<|endoftext|>"},
	},
	ChatML: {
		TemplateConfig: TemplateConfig{
			Chat: "{{.Input -}}\n<|im_start|>assistant",
			Functions: `<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant`,
			ChatMessage: `<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>`,
		},
		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
	},
	Mistral03: {
		TemplateConfig: TemplateConfig{
			Chat:      "{{.Input -}}",
			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
			ChatMessage: `{{if eq .RoleName "user" -}}
[INST] {{.Content }} [/INST]
{{- else if .FunctionCall -}}
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
{{- else if eq .RoleName "tool" -}}
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
{{- else -}}
{{ .Content -}}
{{ end -}}`,
		},
		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
	},
}
// knownTemplates maps well-known chat templates used in HF to the model
// families defined above. identifyFamily looks up the value of the
// "tokenizer.chat_template" metadata key in this map, so a key only matches
// when the template text is identical to it.
var knownTemplates = map[string]familyType{
`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
}
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" { if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING") log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
return return
@@ -19,20 +154,106 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int)
return return
} }
// We try to guess only if we don't have a template defined already if cfg.HasTemplate() {
guessPath := filepath.Join(modelPath, cfg.ModelFileName()) // nothing to guess here
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
// try to parse the gguf file
f, err := gguf.ParseGGUFFile(guessPath)
if err == nil {
guessGGUFFromFile(cfg, f, defaultCtx)
return return
} }
if cfg.ContextSize == nil { // We try to guess only if we don't have a template defined already
if defaultCtx == 0 { guessPath := filepath.Join(modelPath, cfg.ModelFileName())
defaultCtx = defaultContextSize f, err := gguf.ParseGGUFFile(guessPath)
if err != nil {
// Only valid for gguf files
log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
return
}
log.Debug().
Any("eosTokenID", f.Tokenizer().EOSTokenID).
Any("bosTokenID", f.Tokenizer().BOSTokenID).
Any("modelName", f.Model().Name).
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
// guess the name
if cfg.Name == "" {
cfg.Name = f.Model().Name
}
family := identifyFamily(f)
if family == Unknown {
log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
return
}
// identify template
settings, ok := defaultsSettings[family]
if ok {
cfg.TemplateConfig = settings.TemplateConfig
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
if len(cfg.StopWords) == 0 {
cfg.StopWords = settings.StopWords
} }
cfg.ContextSize = &defaultCtx if cfg.RepeatPenalty == 0.0 {
cfg.RepeatPenalty = settings.RepeatPenalty
}
} else {
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
}
if cfg.HasTemplate() {
return
}
// identify from well known templates first, otherwise use the raw jinja template
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found {
// try to use the jinja template
cfg.TemplateConfig.JinjaTemplate = true
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
}
}
func identifyFamily(f *gguf.GGUFFile) familyType {
// identify from well known templates first
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found && chatTemplate.ValueString() != "" {
if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
return family
}
}
// otherwise try to identify from the model properties
arch := f.Architecture().Architecture
eosTokenID := f.Tokenizer().EOSTokenID
bosTokenID := f.Tokenizer().BOSTokenID
isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
llama3 := arch == "llama" && eosTokenID == 128009
commandR := arch == "command-r" && eosTokenID == 255001
qwen2 := arch == "qwen2"
phi3 := arch == "phi-3"
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
deepseek2 := arch == "deepseek2"
switch {
case deepseek2:
return DeepSeek2
case gemma:
return Gemma
case llama3:
return LLaMa3
case commandR:
return CommandR
case phi3:
return Phi3
case qwen2, isYI:
return ChatML
default:
return Unknown
} }
} }

View File

@@ -5,8 +5,6 @@ import (
"errors" "errors"
"fmt" "fmt"
"net/http" "net/http"
"os"
"path/filepath"
"github.com/dave-gray101/v2keyauth" "github.com/dave-gray101/v2keyauth"
"github.com/mudler/LocalAI/pkg/utils" "github.com/mudler/LocalAI/pkg/utils"
@@ -144,9 +142,9 @@ func API(application *application.Application) (*fiber.App, error) {
httpFS := http.FS(embedDirStatic) httpFS := http.FS(embedDirStatic)
router.Use(favicon.New(favicon.Config{ router.Use(favicon.New(favicon.Config{
URL: "/favicon.svg", URL: "/favicon.ico",
FileSystem: httpFS, FileSystem: httpFS,
File: "static/favicon.svg", File: "static/favicon.ico",
})) }))
router.Use("/static", filesystem.New(filesystem.Config{ router.Use("/static", filesystem.New(filesystem.Config{
@@ -155,19 +153,12 @@ func API(application *application.Application) (*fiber.App, error) {
Browse: true, Browse: true,
})) }))
if application.ApplicationConfig().GeneratedContentDir != "" { if application.ApplicationConfig().ImageDir != "" {
os.MkdirAll(application.ApplicationConfig().GeneratedContentDir, 0750) router.Static("/generated-images", application.ApplicationConfig().ImageDir)
audioPath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "audio") }
imagePath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "images")
videoPath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "videos")
os.MkdirAll(audioPath, 0750) if application.ApplicationConfig().AudioDir != "" {
os.MkdirAll(imagePath, 0750) router.Static("/generated-audio", application.ApplicationConfig().AudioDir)
os.MkdirAll(videoPath, 0750)
router.Static("/generated-audio", audioPath)
router.Static("/generated-images", imagePath)
router.Static("/generated-videos", videoPath)
} }
// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration // Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration

View File

@@ -3,6 +3,7 @@ package http_test
import ( import (
"bytes" "bytes"
"context" "context"
"embed"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
@@ -23,7 +24,6 @@ import (
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
rice "github.com/GeertJohan/go.rice"
openaigo "github.com/otiai10/openaigo" openaigo "github.com/otiai10/openaigo"
"github.com/sashabaranov/go-openai" "github.com/sashabaranov/go-openai"
"github.com/sashabaranov/go-openai/jsonschema" "github.com/sashabaranov/go-openai/jsonschema"
@@ -264,15 +264,8 @@ func getRequest(url string, header http.Header) (error, int, []byte) {
const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml` const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml`
var backendAssets *rice.Box //go:embed backend-assets/*
var backendAssets embed.FS
func init() {
var err error
backendAssets, err = rice.FindBox("backend-assets")
if err != nil {
panic(err)
}
}
var _ = Describe("API test", func() { var _ = Describe("API test", func() {
@@ -636,7 +629,8 @@ var _ = Describe("API test", func() {
application, err := application.New( application, err := application.New(
append(commonOpts, append(commonOpts,
config.WithContext(c), config.WithContext(c),
config.WithGeneratedContentDir(tmpdir), config.WithAudioDir(tmpdir),
config.WithImageDir(tmpdir),
config.WithGalleries(galleries), config.WithGalleries(galleries),
config.WithModelPath(modelDir), config.WithModelPath(modelDir),
config.WithBackendAssets(backendAssets), config.WithBackendAssets(backendAssets),

View File

@@ -122,15 +122,15 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
"id": modalName(m), "id": modalName(m),
"tabindex": "-1", "tabindex": "-1",
"aria-hidden": "true", "aria-hidden": "true",
"class": "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50", "class": "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full",
}, },
elem.Div( elem.Div(
attrs.Props{ attrs.Props{
"class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]", "class": "relative p-4 w-full max-w-2xl max-h-full",
}, },
elem.Div( elem.Div(
attrs.Props{ attrs.Props{
"class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col", "class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700",
}, },
// header // header
elem.Div( elem.Div(
@@ -164,13 +164,14 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
// body // body
elem.Div( elem.Div(
attrs.Props{ attrs.Props{
"class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0", "class": "p-4 md:p-5 space-y-4",
}, },
elem.Div( elem.Div(
attrs.Props{ attrs.Props{
"class": "flex justify-center items-center", "class": "flex justify-center items-center",
}, },
elem.Img(attrs.Props{ elem.Img(attrs.Props{
// "class": "rounded-t-lg object-fit object-center h-96",
"class": "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded", "class": "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
"src": m.Icon, "src": m.Icon,
"loading": "lazy", "loading": "lazy",
@@ -231,6 +232,7 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
), ),
), ),
) )
} }
func modelDescription(m *gallery.GalleryModel) elem.Node { func modelDescription(m *gallery.GalleryModel) elem.Node {

View File

@@ -21,7 +21,6 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
if err != nil { if err != nil {
return err return err
} }
defer sl.Close()
vals := make([][]byte, len(input.Values)) vals := make([][]byte, len(input.Values))
for i, v := range input.Values { for i, v := range input.Values {
@@ -49,7 +48,6 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
if err != nil { if err != nil {
return err return err
} }
defer sl.Close()
if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil { if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
return err return err
@@ -71,7 +69,6 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
if err != nil { if err != nil {
return err return err
} }
defer sl.Close()
keys, vals, err := store.GetCols(c.Context(), sb, input.Keys) keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
if err != nil { if err != nil {
@@ -103,7 +100,6 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
if err != nil { if err != nil {
return err return err
} }
defer sl.Close()
keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk) keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
if err != nil { if err != nil {

View File

@@ -1,205 +0,0 @@
package localai
import (
"bufio"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/google/uuid"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/backend"
"github.com/gofiber/fiber/v2"
model "github.com/mudler/LocalAI/pkg/model"
"github.com/rs/zerolog/log"
)
// downloadFile fetches url with an HTTP GET and stores the response body in a
// newly created temporary file (default temp directory, "video" name prefix).
// On success it returns the path of that file; the caller owns the file and
// is responsible for removing it.
//
// A response with a non-2xx status is reported as an error rather than being
// saved, so an HTTP error page is never mistaken for the requested payload.
// On any failure the temporary file (if one was created) is removed and the
// returned path is empty, so nothing is left behind when the caller bails out
// on the error.
func downloadFile(url string) (string, error) {
	// Get the data
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return "", fmt.Errorf("downloading %q: unexpected status %s", url, resp.Status)
	}

	// Create the file
	out, err := os.CreateTemp("", "video")
	if err != nil {
		return "", err
	}

	// Write the body to file; on failure do not leave a partial file around.
	if _, err := io.Copy(out, resp.Body); err != nil {
		out.Close()
		os.Remove(out.Name())
		return "", err
	}

	// Close explicitly (not deferred) so a failed close is reported instead
	// of silently handing back a possibly truncated file.
	if err := out.Close(); err != nil {
		os.Remove(out.Name())
		return "", err
	}

	return out.Name(), nil
}
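//
/*
*
NOTE: the example below was carried over from the image generation endpoint
and still targets /v1/images/generations. The handler documented underneath is
annotated with "@Router /video [post]" and reads a video request, so adjust the
URL and the payload accordingly before using it.

curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "A cute baby sea otter",
"n": 1,
"size": "512x512"
}'
*
*/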
// VideoEndpoint returns the fiber handler that generates a video from a
// prompt and, optionally, a start image.
//
// The handler expects the request middleware to have stored a
// *schema.VideoRequest and a *config.BackendConfig in the fiber locals; if
// either is missing (or the request has no model) it answers with
// fiber.ErrBadRequest. The start image may be given as an http(s) URL, which
// is downloaded, or as a base64 string; in both cases the bytes are written
// to a temporary file that is removed when the handler returns. Width and
// height default to 512 when unset, and an empty or "stablediffusion" backend
// is mapped to model.StableDiffusionGGMLBackend.
//
// With response_format "b64_json" the generated file is returned inline
// (base64 encoded) and deleted afterwards; otherwise it is kept under
// <GeneratedContentDir>/videos and a "/generated-videos/<name>" URL is
// returned. If generation fails, the output file is removed.
//
// @Summary Creates a video given a prompt.
// @Param request body schema.VideoRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /video [post]
func VideoEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {
		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.VideoRequest)
		if !ok || input.Model == "" {
			log.Error().Msg("Video Endpoint - Invalid Input")
			return fiber.ErrBadRequest
		}

		cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
		if !ok || cfg == nil {
			log.Error().Msg("Video Endpoint - Invalid Config")
			return fiber.ErrBadRequest
		}

		src := ""
		if input.StartImage != "" {
			var fileData []byte

			// check if input.StartImage is an URL, if so download it and
			// save it to a temporary file
			if strings.HasPrefix(input.StartImage, "http://") || strings.HasPrefix(input.StartImage, "https://") {
				downloaded, err := downloadFile(input.StartImage)
				if err != nil {
					return fmt.Errorf("failed downloading file:%w", err)
				}
				defer os.RemoveAll(downloaded)

				fileData, err = os.ReadFile(downloaded)
				if err != nil {
					return fmt.Errorf("failed reading file:%w", err)
				}
			} else {
				// base 64 decode the file and write it somewhere
				// that we will cleanup
				decoded, err := base64.StdEncoding.DecodeString(input.StartImage)
				if err != nil {
					return err
				}
				fileData = decoded
			}

			// Create a temporary file holding the start image
			startFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64")
			if err != nil {
				return err
			}
			// Register the cleanup right away so the file is also removed
			// when one of the write steps below fails.
			src = startFile.Name()
			defer os.RemoveAll(src)

			// write the decoded image
			writer := bufio.NewWriter(startFile)
			if _, err := writer.Write(fileData); err != nil {
				startFile.Close()
				return err
			}
			// Without the flush, data still sitting in the bufio buffer is
			// never written and the backend would read a truncated (or
			// empty) start image.
			if err := writer.Flush(); err != nil {
				startFile.Close()
				return err
			}
			if err := startFile.Close(); err != nil {
				return err
			}
		}

		log.Debug().Msgf("Parameter Config: %+v", cfg)

		switch cfg.Backend {
		case "stablediffusion", "":
			cfg.Backend = model.StableDiffusionGGMLBackend
		}

		width := input.Width
		height := input.Height
		if width == 0 {
			width = 512
		}
		if height == 0 {
			height = 512
		}

		b64JSON := input.ResponseFormat == "b64_json"

		// An empty tempDir makes os.CreateTemp use the default temp
		// directory; that is fine for b64_json because the file is deleted
		// once it has been encoded into the response.
		tempDir := ""
		if !b64JSON {
			tempDir = filepath.Join(appConfig.GeneratedContentDir, "videos")
		}

		// Create a temporary file to reserve a unique output name
		outputFile, err := os.CreateTemp(tempDir, "b64")
		if err != nil {
			return err
		}
		outputFile.Close()

		// TODO: use mime type to determine the extension
		output := outputFile.Name() + ".mp4"

		// Rename the temporary file
		if err := os.Rename(outputFile.Name(), output); err != nil {
			os.Remove(outputFile.Name())
			return err
		}

		// The output only outlives the request when it is going to be served
		// by URL; on any error path, and always in b64_json mode, remove it.
		keepOutput := false
		defer func() {
			if !keepOutput {
				os.RemoveAll(output)
			}
		}()

		baseURL := c.BaseURL()

		fn, err := backend.VideoGeneration(height, width, input.Prompt, src, input.EndImage, output, ml, *cfg, appConfig)
		if err != nil {
			return err
		}
		if err := fn(); err != nil {
			return err
		}

		item := &schema.Item{}

		if b64JSON {
			data, err := os.ReadFile(output)
			if err != nil {
				return err
			}
			item.B64JSON = base64.StdEncoding.EncodeToString(data)
		} else {
			item.URL = baseURL + "/generated-videos/" + filepath.Base(output)
			keepOutput = true
		}

		id := uuid.New().String()
		created := int(time.Now().Unix())
		resp := &schema.OpenAIResponse{
			ID:      id,
			Created: created,
			Data:    []schema.Item{*item},
		}

		// The marshalled form is only used for the debug log line, so a
		// marshalling error is deliberately ignored here.
		jsonResult, _ := json.Marshal(resp)
		log.Debug().Msgf("Response: %s", jsonResult)

		// Return the prediction in the response body
		return c.JSON(resp)
	}
}

View File

@@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
cl := &config.BackendConfigLoader{} cl := &config.BackendConfigLoader{}
//configsDir := "/tmp/localai/configs" //configsDir := "/tmp/localai/configs"
modelPath := "/tmp/localai/model" modelPath := "/tmp/localai/model"
var ml = model.NewModelLoader(modelPath, false) var ml = model.NewModelLoader(modelPath)
appConfig := &config.ApplicationConfig{ appConfig := &config.ApplicationConfig{
ConfigsDir: configsDir, ConfigsDir: configsDir,

View File

@@ -108,7 +108,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
} }
// Create a temporary file // Create a temporary file
outputFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64") outputFile, err := os.CreateTemp(appConfig.ImageDir, "b64")
if err != nil { if err != nil {
return err return err
} }
@@ -184,7 +184,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
tempDir := "" tempDir := ""
if !b64JSON { if !b64JSON {
tempDir = filepath.Join(appConfig.GeneratedContentDir, "images") tempDir = appConfig.ImageDir
} }
// Create a temporary file // Create a temporary file
outputFile, err := os.CreateTemp(tempDir, "b64") outputFile, err := os.CreateTemp(tempDir, "b64")
@@ -192,7 +192,6 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
return err return err
} }
outputFile.Close() outputFile.Close()
output := outputFile.Name() + ".png" output := outputFile.Name() + ".png"
// Rename the temporary file // Rename the temporary file
err = os.Rename(outputFile.Name(), output) err = os.Rename(outputFile.Name(), output)

View File

@@ -29,9 +29,9 @@ func Explorer(db *explorer.Database) *fiber.App {
httpFS := http.FS(embedDirStatic) httpFS := http.FS(embedDirStatic)
app.Use(favicon.New(favicon.Config{ app.Use(favicon.New(favicon.Config{
URL: "/favicon.svg", URL: "/favicon.ico",
FileSystem: httpFS, FileSystem: httpFS,
File: "static/favicon.svg", File: "static/favicon.ico",
})) }))
app.Use("/static", filesystem.New(filesystem.Config{ app.Use("/static", filesystem.New(filesystem.Config{

View File

@@ -203,10 +203,18 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
config.Diffusers.ClipSkip = input.ClipSkip config.Diffusers.ClipSkip = input.ClipSkip
} }
if input.ModelBaseName != "" {
config.AutoGPTQ.ModelBaseName = input.ModelBaseName
}
if input.NegativePromptScale != 0 { if input.NegativePromptScale != 0 {
config.NegativePromptScale = input.NegativePromptScale config.NegativePromptScale = input.NegativePromptScale
} }
if input.UseFastTokenizer {
config.UseFastTokenizer = input.UseFastTokenizer
}
if input.NegativePrompt != "" { if input.NegativePrompt != "" {
config.NegativePrompt = input.NegativePrompt config.NegativePrompt = input.NegativePrompt
} }

Some files were not shown because too many files have changed in this diff Show More