Compare commits

..

3 Commits

Author SHA1 Message Date
Ettore Di Giacinto
a8e91345e2 fixup: create piper libdir also when not built
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-11-12 22:17:11 +01:00
Ettore Di Giacinto
ea4ade6b60 Merge branch 'master' into enable_gpu 2023-11-11 19:20:36 +01:00
Ettore Di Giacinto
796d0c99aa tests: re-enable GPU tests
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2023-11-06 19:10:35 +01:00
259 changed files with 1632 additions and 14686 deletions

19
.env
View File

@@ -69,21 +69,4 @@ MODELS_PATH=/models
# PYTHON_GRPC_MAX_WORKERS=1
### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
# LLAMACPP_PARALLEL=1
### Enable to run parallel requests
# PARALLEL_REQUESTS=true
### Watchdog settings
###
# Enables watchdog to kill backends that are inactive for too much time
# WATCHDOG_IDLE=true
#
# Enables watchdog to kill backends that are busy for too much time
# WATCHDOG_BUSY=true
#
# Time in duration format (e.g. 1h30m) after which a backend is considered idle
# WATCHDOG_IDLE_TIMEOUT=5m
#
# Time in duration format (e.g. 1h30m) after which a backend is considered busy
# WATCHDOG_BUSY_TIMEOUT=5m
# LLAMACPP_PARALLEL=1

View File

@@ -14,25 +14,8 @@ concurrency:
cancel-in-progress: true
jobs:
extras-image-build:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
secrets:
dockerUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
dockerPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
image-build:
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
matrix:
include:
- build-type: ''
@@ -41,117 +24,130 @@ jobs:
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
cuda-major-version: 11
cuda-minor-version: 7
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
cuda-major-version: 12
cuda-minor-version: 1
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
cuda-major-version: 11
cuda-minor-version: 7
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
cuda-major-version: 12
cuda-minor-version: 1
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
core-image-build:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
secrets:
dockerUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
dockerPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-core'
ffmpeg: ''
image-type: 'core'
runs-on: 'ubuntu-latest'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-core'
ffmpeg: ''
image-type: 'core'
runs-on: 'ubuntu-latest'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "1"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
runs-on: arc-runner-set
steps:
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
# - name: Release space from worker
# run: |
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# df -h
# echo
# sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
# sudo apt-get remove --auto-remove android-sdk-platform-tools || true
# sudo apt-get purge --auto-remove android-sdk-platform-tools || true
# sudo rm -rf /usr/local/lib/android
# sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
# sudo rm -rf /usr/share/dotnet
# sudo apt-get remove -y '^mono-.*' || true
# sudo apt-get remove -y '^ghc-.*' || true
# sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
# sudo apt-get remove -y 'php.*' || true
# sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
# sudo apt-get remove -y '^google-.*' || true
# sudo apt-get remove -y azure-cli || true
# sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
# sudo apt-get remove -y '^gfortran-.*' || true
# sudo apt-get remove -y microsoft-edge-stable || true
# sudo apt-get remove -y firefox || true
# sudo apt-get remove -y powershell || true
# sudo apt-get remove -y r-base-core || true
# sudo apt-get autoremove -y
# sudo apt-get clean
# echo
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# sudo rm -rfv build || true
# df -h
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: quay.io/go-skynet/local-ai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=${{ matrix.tag-latest }}
suffix=${{ matrix.tag-suffix }}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
- name: Build and push
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ matrix.build-type }}
CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
FFMPEG=${{ matrix.ffmpeg }}
context: .
file: ./Dockerfile
platforms: ${{ matrix.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}

View File

@@ -1,147 +0,0 @@
---
name: 'build container images (reusable)'
on:
workflow_call:
inputs:
build-type:
description: 'Build type'
default: ''
type: string
cuda-major-version:
description: 'CUDA major version'
default: "11"
type: string
cuda-minor-version:
description: 'CUDA minor version'
default: "7"
type: string
platforms:
description: 'Platforms'
default: ''
type: string
tag-latest:
description: 'Tag latest'
default: ''
type: string
tag-suffix:
description: 'Tag suffix'
default: ''
type: string
ffmpeg:
description: 'FFMPEG'
default: ''
type: string
image-type:
description: 'Image type'
default: ''
type: string
runs-on:
description: 'Runs on'
required: true
default: ''
type: string
secrets:
dockerUsername:
required: true
dockerPassword:
required: true
jobs:
reusable_image-build:
runs-on: ${{ inputs.runs-on }}
steps:
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
# - name: Release space from worker
# run: |
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# df -h
# echo
# sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
# sudo apt-get remove --auto-remove android-sdk-platform-tools || true
# sudo apt-get purge --auto-remove android-sdk-platform-tools || true
# sudo rm -rf /usr/local/lib/android
# sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
# sudo rm -rf /usr/share/dotnet
# sudo apt-get remove -y '^mono-.*' || true
# sudo apt-get remove -y '^ghc-.*' || true
# sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
# sudo apt-get remove -y 'php.*' || true
# sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
# sudo apt-get remove -y '^google-.*' || true
# sudo apt-get remove -y azure-cli || true
# sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
# sudo apt-get remove -y '^gfortran-.*' || true
# sudo apt-get remove -y microsoft-edge-stable || true
# sudo apt-get remove -y firefox || true
# sudo apt-get remove -y powershell || true
# sudo apt-get remove -y r-base-core || true
# sudo apt-get autoremove -y
# sudo apt-get clean
# echo
# echo "Listing top largest packages"
# pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
# head -n 30 <<< "${pkgs}"
# echo
# sudo rm -rfv build || true
# df -h
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: quay.io/go-skynet/local-ai
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }}
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.dockerUsername }}
password: ${{ secrets.dockerPassword }}
- name: Build and push
uses: docker/build-push-action@v5
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
context: .
file: ./Dockerfile
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: job summary
run: |
echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY

View File

@@ -1,219 +0,0 @@
---
name: 'Tests extras backends'
on:
pull_request:
push:
branches:
- master
tags:
- '*'
concurrency:
group: ci-tests-extra-${{ github.head_ref || github.ref }}-${{ github.repository }}
cancel-in-progress: true
jobs:
tests-transformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo rm -rfv /usr/bin/conda || true
- name: Test transformers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/transformers
make -C backend/python/transformers test
tests-sentencetransformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo rm -rfv /usr/bin/conda || true
- name: Test sentencetransformers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/sentencetransformers
make -C backend/python/sentencetransformers test
tests-diffusers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo rm -rfv /usr/bin/conda || true
- name: Test diffusers
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/diffusers
make -C backend/python/diffusers test
tests-transformers-musicgen:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo rm -rfv /usr/bin/conda || true
- name: Test transformers-musicgen
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/transformers-musicgen
make -C backend/python/transformers-musicgen test
tests-bark:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
sudo apt-get install -y ca-certificates cmake curl patch
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo rm -rfv /usr/bin/conda || true
- name: Test bark
run: |
export PATH=$PATH:/opt/conda/bin
make -C backend/python/bark
make -C backend/python/bark test
# Below tests needs GPU. Commented out for now
# TODO: Re-enable as soon as we have GPU nodes
# tests-vllm:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
# sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
# gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# sudo rm -rfv /usr/bin/conda || true
# - name: Test vllm
# run: |
# export PATH=$PATH:/opt/conda/bin
# make -C backend/python/vllm
# make -C backend/python/vllm test
# tests-vallex:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
# sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
# gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
# sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
# sudo apt-get update && \
# sudo apt-get install -y conda
# sudo apt-get install -y ca-certificates cmake curl patch
# sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
# sudo rm -rfv /usr/bin/conda || true
# - name: Test vall-e-x
# run: |
# export PATH=$PATH:/opt/conda/bin
# make -C backend/python/vall-e-x
# make -C backend/python/vall-e-x test

View File

@@ -78,12 +78,13 @@ jobs:
sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
sudo rm -rfv /usr/bin/conda || true
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface
# Pre-build piper before we start tests in order to have shared libraries in place
make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
make go-piper && \
GO_TAGS="tts" make -C go-piper piper.o && \
sudo cp -rfv go-piper/piper/build/pi/lib/. /usr/lib/ && \
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

10
.gitignore vendored
View File

@@ -1,9 +1,15 @@
# go-llama build artifacts
/sources/
go-llama
go-llama-stable
/gpt4all
go-stable-diffusion
go-piper
/go-bert
go-ggllm
/piper
__pycache__/
*.a
get-sources
prepare-sources
/backend/cpp/llama/grpc-server
/backend/cpp/llama/llama.cpp

3
.gitmodules vendored
View File

@@ -1,3 +0,0 @@
[submodule "docs/themes/hugo-theme-relearn"]
path = docs/themes/hugo-theme-relearn
url = https://github.com/McShelby/hugo-theme-relearn.git

View File

@@ -12,9 +12,7 @@ ARG TARGETARCH
ARG TARGETVARIANT
ENV BUILD_TYPE=${BUILD_TYPE}
ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/run.sh,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/run.sh,vall-e-x:/build/extra/grpc/vall-e-x/run.sh,vllm:/build/extra/grpc/vllm/run.sh"
ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
ARG GO_TAGS="stablediffusion tts"
@@ -66,10 +64,20 @@ RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmo
apt-get update && \
apt-get install -y conda
COPY extra/requirements.txt /build/extra/requirements.txt
ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install --upgrade pip
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
#RUN if [ "${TARGETARCH}" = "amd64" ]; then \
# pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
# fi
#RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
# pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
# fi
#RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
# Vall-e-X
RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
# \
# ; fi
@@ -90,9 +98,12 @@ ENV NVIDIA_VISIBLE_DEVICES=all
WORKDIR /build
COPY Makefile .
RUN make get-sources
COPY go.mod .
RUN make prepare
COPY . .
COPY .git .
RUN make prepare
# stablediffusion does not tolerate a newer version of abseil, build it first
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
@@ -101,15 +112,15 @@ RUN if [ "${BUILD_GRPC}" = "true" ]; then \
git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && make -j12 install \
../.. && make -j12 install && rm -rf grpc \
; fi
# Rebuild with defaults backends
RUN make build
RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
RUN if [ ! -d "/build/go-piper/piper/build/pi/lib/" ]; then \
mkdir -p /build/go-piper/piper/build/pi/lib/ \
touch /build/go-piper/piper/build/pi/lib/keep \
; fi
###################################
@@ -143,56 +154,50 @@ WORKDIR /build
# see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
# https://github.com/go-skynet/LocalAI/pull/434
COPY . .
COPY --from=builder /build/sources ./sources/
COPY --from=builder /build/grpc ./grpc/
RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
RUN make prepare-sources
# Copy the binary
COPY --from=builder /build/local-ai ./
# Copy shared libraries for piper
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
COPY --from=builder /build/go-piper/piper/build/pi/lib/* /usr/lib/
# do not let stablediffusion rebuild (requires an older version of absl)
COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
## Duplicated from Makefile to avoid having a big layer that's hard to push
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/autogptq \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/autogptq \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/bark \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/bark \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/diffusers \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/diffusers \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/vllm \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/vllm \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/transformers \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/vall-e-x \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/vall-e-x \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/exllama2 \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/petals \
; fi
RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
PATH=$PATH:/opt/conda/bin make -C backend/python/transformers-musicgen \
PATH=$PATH:/opt/conda/bin make -C extra/grpc/exllama \
; fi
# Copy VALLE-X as it's not a real "lib"
RUN if [ -d /usr/lib/vall-e-x ]; then \
cp -rfv /usr/lib/vall-e-x/* ./ ; \
fi
# we also copy exllama libs over to resolve exllama import error
RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
fi
# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1

View File

@@ -1,10 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.security.network.client</key>
<true/>
<key>com.apple.security.network.server</key>
<true/>
</dict>
</plist>

354
Makefile
View File

@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
CPPLLAMA_VERSION?=88ae8952b65cbf32eb1f5703681ea592e510e570
CPPLLAMA_VERSION?=a75fa576abba9d37f463580c379e4bbf1e1ad03c
# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -22,21 +22,20 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=c898cd0f62df8f2a7830e53d1d513bef4f6f792b
# whisper.cpp version
WHISPER_CPP_VERSION?=940de9dbe9c90624dc99521cb34c8a97b86d543c
WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273
# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
# go-piper version
PIPER_VERSION?=d6b6275ba037dabdba4a8b65dfdf6b2a73a67f07
PIPER_VERSION?=736f6fb639ab8e3397356e48eeb6bdcb9da88a78
# stablediffusion version
STABLEDIFFUSION_VERSION?=902db5f066fd137697e3b69d0fa10d4782bd2c2f
STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=
CGO_LDFLAGS?=
CUDA_LIBPATH?=/usr/local/cuda/lib64/
GO_TAGS?=
@@ -69,39 +68,29 @@ ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
ifeq ($(OS),Darwin)
ifeq ($(UNAME_S),Darwin)
CGO_LDFLAGS += -lcblas -framework Accelerate
ifeq ($(OSX_SIGNING_IDENTITY),)
OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
endif
# on OSX, if BUILD_TYPE is blank, we should default to use Metal
ifeq ($(BUILD_TYPE),)
BUILD_TYPE=metal
# disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
endif
ifneq ($(BUILD_TYPE),metal)
# explicit disable metal if on Darwin and metal is disabled
CMAKE_ARGS+=-DLLAMA_METAL=OFF
endif
endif
ifeq ($(BUILD_TYPE),openblas)
CGO_LDFLAGS+=-lopenblas
export WHISPER_OPENBLAS=1
endif
ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1
export WHISPER_CUBLAS=1
endif
ifeq ($(BUILD_TYPE),hipblas)
ROCM_HOME ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# llama-ggml has no hipblas support, so override it here.
# Llama-stable has no hipblas support, so override it here.
export STABLE_BUILD_TYPE=
export WHISPER_HIPBLAS=1
GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
@@ -111,12 +100,10 @@ endif
ifeq ($(BUILD_TYPE),metal)
CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
export LLAMA_METAL=1
export WHISPER_METAL=1
endif
ifeq ($(BUILD_TYPE),clblas)
CGO_LDFLAGS+=-lOpenCL -lclblast
export WHISPER_CLBLAST=1
endif
# glibc-static or glibc-devel-static required
@@ -132,12 +119,12 @@ endif
ifeq ($(findstring tts,$(GO_TAGS)),tts)
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/sources/go-piper/piper/src/cpp -I$(shell pwd)/sources/go-piper/piper/build/fi/include -I$(shell pwd)/sources/go-piper/piper/build/pi/include -I$(shell pwd)/sources/go-piper/piper/build/si/include
PIPER_CGO_LDFLAGS+=-L$(shell pwd)/sources/go-piper/piper/build/fi/lib -L$(shell pwd)/sources/go-piper/piper/build/pi/lib -L$(shell pwd)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/go-piper/piper/src/cpp -I$(shell pwd)/go-piper/piper/build/fi/include -I$(shell pwd)/go-piper/piper/build/pi/include -I$(shell pwd)/go-piper/piper/build/si/include
PIPER_CGO_LDFLAGS+=-L$(shell pwd)/go-piper/piper/build/fi/lib -L$(shell pwd)/go-piper/piper/build/pi/lib -L$(shell pwd)/go-piper/piper/build/si/lib -lfmt -lspdlog
OPTIONAL_GRPC+=backend-assets/grpc/piper
endif
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-ggml backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
# If empty, then we build all
@@ -150,117 +137,112 @@ endif
all: help
## GPT4ALL
sources/gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
gpt4all:
git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
## go-piper
sources/go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
go-piper:
git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
cd go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
## BERT embeddings
sources/go-bert:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
go-bert:
git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp go-bert
cd go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
## stable diffusion
sources/go-stable-diffusion:
git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
go-stable-diffusion:
git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion go-stable-diffusion
cd go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
sources/go-stable-diffusion/libstablediffusion.a:
$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
go-stable-diffusion/libstablediffusion.a:
$(MAKE) -C go-stable-diffusion libstablediffusion.a
## RWKV
sources/go-rwkv:
git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
go-rwkv:
git clone --recurse-submodules $(RWKV_REPO) go-rwkv
cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
sources/go-rwkv/librwkv.a: sources/go-rwkv
cd sources/go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
go-rwkv/librwkv.a: go-rwkv
cd go-rwkv && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
sources/go-bert/libgobert.a: sources/go-bert
$(MAKE) -C sources/go-bert libgobert.a
go-bert/libgobert.a: go-bert
$(MAKE) -C go-bert libgobert.a
backend-assets/gpt4all: sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
mkdir -p backend-assets/gpt4all
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
@cp gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
backend-assets/espeak-ng-data: sources/go-piper
backend-assets/espeak-ng-data: go-piper
mkdir -p backend-assets/espeak-ng-data
$(MAKE) -C sources/go-piper piper.o
@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
$(MAKE) -C go-piper piper.o
@cp -rf go-piper/piper/build/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a
## CEREBRAS GPT
sources/go-ggml-transformers:
git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp sources/go-ggml-transformers
cd sources/go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
go-ggml-transformers:
git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp go-ggml-transformers
cd go-ggml-transformers && git checkout -b build $(GOGPT2_VERSION) && git submodule update --init --recursive --depth 1
sources/go-ggml-transformers/libtransformers.a: sources/go-ggml-transformers
$(MAKE) -C sources/go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a
go-ggml-transformers/libtransformers.a: go-ggml-transformers
$(MAKE) -C go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a
sources/whisper.cpp:
git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
whisper.cpp:
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && make libwhisper.a
whisper.cpp/libwhisper.a: whisper.cpp
cd whisper.cpp && make libwhisper.a
sources/go-llama:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama
cd sources/go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
go-llama:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama-ggml:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
go-llama-stable:
git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
sources/go-llama/libbinding.a: sources/go-llama
$(MAKE) -C sources/go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
go-llama/libbinding.a: go-llama
$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
go-llama-stable/libbinding.a: go-llama-stable
$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main
go-piper/libpiper_binding.a: go-piper
$(MAKE) -C go-piper libpiper_binding.a example/main
backend/cpp/llama/llama.cpp:
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/go-ggml-transformers sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion
get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
touch $@
replace:
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/sources/go-ggml-transformers
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/sources/go-rwkv
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(shell pwd)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/sources/go-bert
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/sources/go-piper
$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper
prepare-sources: get-sources replace
$(GOCMD) mod download
touch $@
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama clean
$(MAKE) -C sources/go-llama-ggml clean
$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C sources/go-ggml-transformers clean
$(MAKE) -C sources/go-rwkv clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-stable-diffusion clean
$(MAKE) -C sources/go-bert clean
$(MAKE) -C sources/go-piper clean
$(MAKE) -C go-llama clean
$(MAKE) -C go-llama-stable clean
$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
$(MAKE) -C go-ggml-transformers clean
$(MAKE) -C go-rwkv clean
$(MAKE) -C whisper.cpp clean
$(MAKE) -C go-stable-diffusion clean
$(MAKE) -C go-bert clean
$(MAKE) -C go-piper clean
$(MAKE) build
prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -269,7 +251,17 @@ prepare: prepare-sources $(OPTIONAL_TARGETS)
clean: ## Remove build related file
$(GOCMD) clean -cache
rm -f prepare
rm -rf ./sources
rm -rf ./go-llama
rm -rf ./gpt4all
rm -rf ./go-llama-stable
rm -rf ./go-gpt2
rm -rf ./go-stable-diffusion
rm -rf ./go-ggml-transformers
rm -rf ./backend-assets
rm -rf ./go-rwkv
rm -rf ./go-bert
rm -rf ./whisper.cpp
rm -rf ./go-piper
rm -rf $(BINARY_NAME)
rm -rf release/
rm -rf ./backend/cpp/grpc/grpc_repo
@@ -291,9 +283,6 @@ dist: build
mkdir -p release
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
osx-signed: build
codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"
## Run
run: prepare ## run local-ai
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
@@ -317,7 +306,7 @@ test: prepare test-models/testmodel grpcs
@echo 'Running tests'
export GO_TAGS="tts stablediffusion"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 --fail-fast -v -r ./api ./pkg
$(MAKE) test-gpt4all
$(MAKE) test-llama
@@ -384,55 +373,40 @@ help: ## Show this help.
protogen: protogen-go protogen-python
protogen-go:
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto
protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative \
pkg/grpc/proto/backend.proto
protogen-python:
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/autogptq/ --grpc_python_out=extra/grpc/autogptq/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto
## GRPC
# Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments:
$(MAKE) -C backend/python/autogptq
$(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/vllm
$(MAKE) -C backend/python/sentencetransformers
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/transformers-musicgen
$(MAKE) -C backend/python/vall-e-x
$(MAKE) -C backend/python/exllama
$(MAKE) -C backend/python/petals
$(MAKE) -C backend/python/exllama2
$(MAKE) -C extra/grpc/autogptq
$(MAKE) -C extra/grpc/bark
$(MAKE) -C extra/grpc/diffusers
$(MAKE) -C extra/grpc/vllm
$(MAKE) -C extra/grpc/huggingface
$(MAKE) -C extra/grpc/vall-e-x
$(MAKE) -C extra/grpc/exllama
prepare-test-extra:
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/diffusers
test-extra: prepare-test-extra
$(MAKE) -C backend/python/transformers test
$(MAKE) -C backend/python/diffusers test
backend-assets/grpc:
mkdir -p backend-assets/grpc
backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/sources/go-llama
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-llama LIBRARY_PATH=$(shell pwd)/sources/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
# TODO: every binary should have its own folder instead, so can have different implementations
backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
# TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/ggml-metal.metal backend-assets/grpc/
cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
endif
## BACKEND CPP LLAMA START
@@ -451,7 +425,7 @@ ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
export PATH=${PATH}:${INSTALLED_PACKAGES}/bin && \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
CMAKE_ARGS="${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
else
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
@@ -466,71 +440,71 @@ ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
endif
backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/sources/go-llama-ggml
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-llama-ggml LIBRARY_PATH=$(shell pwd)/sources/go-llama-ggml \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/
backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/sources/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
backend-assets/grpc/dolly: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./backend/go/llm/dolly/
backend-assets/grpc/dolly: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./cmd/grpc/dolly/
backend-assets/grpc/gpt2: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./backend/go/llm/gpt2/
backend-assets/grpc/gpt2: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./cmd/grpc/gpt2/
backend-assets/grpc/gptj: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./backend/go/llm/gptj/
backend-assets/grpc/gptj: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./cmd/grpc/gptj/
backend-assets/grpc/gptneox: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./backend/go/llm/gptneox/
backend-assets/grpc/gptneox: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./cmd/grpc/gptneox/
backend-assets/grpc/mpt: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./backend/go/llm/mpt/
backend-assets/grpc/mpt: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./cmd/grpc/mpt/
backend-assets/grpc/replit: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./backend/go/llm/replit/
backend-assets/grpc/replit: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./cmd/grpc/replit/
backend-assets/grpc/falcon-ggml: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./backend/go/llm/falcon-ggml/
backend-assets/grpc/falcon-ggml: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./cmd/grpc/falcon-ggml/
backend-assets/grpc/starcoder: backend-assets/grpc sources/go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/sources/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./backend/go/llm/starcoder/
backend-assets/grpc/starcoder: backend-assets/grpc go-ggml-transformers/libtransformers.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./cmd/grpc/starcoder/
backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-rwkv LIBRARY_PATH=$(shell pwd)/sources/go-rwkv \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/
backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-bert LIBRARY_PATH=$(shell pwd)/sources/go-bert \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/
backend-assets/grpc/langchain-huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/
backend-assets/grpc/stablediffusion: backend-assets/grpc
if [ ! -f backend-assets/grpc/stablediffusion ]; then \
$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/; \
$(MAKE) go-stable-diffusion/libstablediffusion.a; \
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
fi
backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/sources/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/
backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/sources/whisper.cpp LIBRARY_PATH=$(shell pwd)/sources/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/
grpcs: prepare $(GRPC_BACKENDS)

View File

@@ -21,14 +21,15 @@
</p>
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API thats compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format, pytorch and more. Does not require GPU.
<p align="center"><b>Follow LocalAI </b></p>
<p align="center"><b>Follow LocalAI </b></p>
<p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank">
@@ -38,7 +39,7 @@
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
</a>
<p align="center"><b>Connect with the Creator </b></p>
<p align="center"><b>Connect with the Creator </b></p>
<p align="center">
<a href="https://twitter.com/mudler_it" target="blank">
@@ -49,12 +50,12 @@
</a>
</p>
<p align="center"><b>Share LocalAI Repository</b></p>
<p align="center"><b>Share LocalAI Repository</b></p>
<p align="center">
<a href="https://twitter.com/intent/tweet?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI&hashtags=LocalAI,AI" target="blank">
<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a>
<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a>
<a href="https://t.me/share/url?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Telegram&logo=Telegram&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Telegram"/></a>
<a href="https://api.whatsapp.com/send?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%20https://github.com/go-skynet/LocalAI"><img src="https://img.shields.io/twitter/url?label=whatsapp&logo=whatsapp&style=social&url=https://github.com/go-skynet/LocalAI" /></a> <a href="https://www.reddit.com/submit?url=https://github.com/go-skynet/LocalAI&title=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.
" target="blank">
@@ -63,37 +64,23 @@
</p>
## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)
## 🔥🔥 Hot topics / Roadmap
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
🆕 New! [LLM finetuning guide](https://localai.io/advanced/fine-tuning/)
Hot topics (looking for contributors):
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
<hr>
In a nutshell:
- Local, OpenAI drop-in alternative REST API. You own your data.
- NO GPU required. NO Internet access is required either
- Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
- Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
- Supports multiple models
- 🏃 Once loaded the first time, it keep models loaded in memory for faster inference
- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance.
LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!
LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!
Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!
## 🔥🔥 [Hot topics / Roadmap](https://localai.io/#-hot-topics--roadmap)
## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -104,34 +91,7 @@ Note that this started just as a [fun weekend project](https://localai.io/#backs
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
- 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
- 🆕 [Vision API](https://localai.io/features/gpt-vision/)
## 💻 Usage
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
### 🔗 Community and integrations
Build and deploy custom containers:
- https://github.com/sozercan/aikit
WebUIs:
- https://github.com/Jirubizu/localai-admin
- https://github.com/go-skynet/LocalAI-frontend
Model galleries
- https://github.com/go-skynet/model-gallery
Other:
- Helm chart https://github.com/go-skynet/helm-charts
### 🔗 Resources
- 🆕 New! [LLM finetuning guide](https://localai.io/advanced/fine-tuning/)
- [How to build locally](https://localai.io/basics/build/index.html)
- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
- [Projects integrating LocalAI](https://localai.io/integrations/)
- [How tos section](https://localai.io/howtos/) (curated by our community)
## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)
@@ -140,6 +100,21 @@ Other:
- [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
- [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)
## 💻 Usage
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
### 💡 Example: Use Luna-AI Llama model
See the [documentation](https://localai.io/basics/getting_started)
### 🔗 Resources
- [How to build locally](https://localai.io/basics/build/index.html)
- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
- [Projects integrating LocalAI](https://localai.io/integrations/)
- [How tos section](https://localai.io/howtos/) (curated by our community)
## Citation
If you utilize this repository, data in a downstream project, please consider citing it with:
@@ -162,12 +137,12 @@ Support the project by becoming [a backer or sponsor](https://github.com/sponsor
A huge thank you to our generous sponsors who support this project:
| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) |
| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) |
|:-----------------------------------------------:|
| [Spectro Cloud](https://www.spectrocloud.com/) |
| [Spectro Cloud](https://www.spectrocloud.com/) |
| Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs! |
And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.
And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.
- [Sponsor list](https://github.com/sponsors/mudler)
- JDAM00 (donating HW for the CI)

View File

@@ -1,10 +1,8 @@
package api
import (
"encoding/json"
"errors"
"fmt"
"os"
"strings"
config "github.com/go-skynet/LocalAI/api/config"
@@ -15,7 +13,6 @@ import (
"github.com/go-skynet/LocalAI/internal"
"github.com/go-skynet/LocalAI/metrics"
"github.com/go-skynet/LocalAI/pkg/assets"
"github.com/go-skynet/LocalAI/pkg/model"
"github.com/gofiber/fiber/v2"
"github.com/gofiber/fiber/v2/middleware/cors"
@@ -82,22 +79,6 @@ func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader,
options.Loader.StopAllGRPC()
}()
if options.WatchDog {
wd := model.NewWatchDog(
options.Loader,
options.WatchDogBusyTimeout,
options.WatchDogIdleTimeout,
options.WatchDogBusy,
options.WatchDogIdle)
options.Loader.SetWatchDog(wd)
go wd.Run()
go func() {
<-options.Context.Done()
log.Debug().Msgf("Context canceled, shutting down")
wd.Shutdown()
}()
}
return options, cl, nil
}
@@ -146,46 +127,28 @@ func App(opts ...options.AppOption) (*fiber.App, error) {
// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
auth := func(c *fiber.Ctx) error {
if len(options.ApiKeys) == 0 {
return c.Next()
}
// Check for api_keys.json file
fileContent, err := os.ReadFile("api_keys.json")
if err == nil {
// Parse JSON content from the file
var fileKeys []string
err := json.Unmarshal(fileContent, &fileKeys)
if err != nil {
return c.Status(fiber.StatusInternalServerError).JSON(fiber.Map{"message": "Error parsing api_keys.json"})
if len(options.ApiKeys) > 0 {
authHeader := c.Get("Authorization")
if authHeader == "" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
}
authHeaderParts := strings.Split(authHeader, " ")
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
}
// Add file keys to options.ApiKeys
options.ApiKeys = append(options.ApiKeys, fileKeys...)
}
if len(options.ApiKeys) == 0 {
return c.Next()
}
authHeader := c.Get("Authorization")
if authHeader == "" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
}
authHeaderParts := strings.Split(authHeader, " ")
if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
}
apiKey := authHeaderParts[1]
for _, key := range options.ApiKeys {
if apiKey == key {
return c.Next()
apiKey := authHeaderParts[1]
validApiKey := false
for _, key := range options.ApiKeys {
if apiKey == key {
validApiKey = true
}
}
if !validApiKey {
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
}
}
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
return c.Next()
}
if options.CORS {

View File

@@ -301,7 +301,7 @@ var _ = Describe("API test", func() {
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
Name: "openllama_3b",
Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -704,7 +704,7 @@ var _ = Describe("API test", func() {
})
Context("External gRPC calls", func() {
It("calculate embeddings with sentencetransformers", func() {
It("calculate embeddings with huggingface", func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}

View File

@@ -16,7 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
model.WithContext(o.Context),
model.WithModel(c.Model),
model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
CUDA: c.CUDA || c.Diffusers.CUDA,
CUDA: c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,
PipelineType: c.Diffusers.PipelineType,
CFGScale: c.Diffusers.CFGScale,
@@ -27,7 +27,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
CLIPModel: c.Diffusers.ClipModel,
CLIPSubfolder: c.Diffusers.ClipSubFolder,
CLIPSkip: int32(c.Diffusers.ClipSkip),
ControlNet: c.Diffusers.ControlNet,
}),
})

View File

@@ -16,10 +16,6 @@ func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.
opts = append(opts, model.WithSingleActiveBackend())
}
if o.ParallelBackendRequests {
opts = append(opts, model.EnableParallelRequests)
}
if c.GRPC.Attempts != 0 {
opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
}
@@ -46,7 +42,6 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
Seed: int32(c.Seed),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
CUDA: c.CUDA, // diffusers, transformers
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
Quantization: c.Quantization,

View File

@@ -59,13 +59,9 @@ func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *opt
// If the model file is not empty, we pass it joined with the model path
modelPath := ""
if modelFile != "" {
if bb != model.TransformersMusicGen {
modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
return "", nil, err
}
} else {
modelPath = modelFile
modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
return "", nil, err
}
}

View File

@@ -38,17 +38,14 @@ type Config struct {
// Diffusers
Diffusers Diffusers `yaml:"diffusers"`
Step int `yaml:"step"`
Step int `yaml:"step"`
// GRPC Options
GRPC GRPC `yaml:"grpc"`
// Vall-e-x
VallE VallE `yaml:"vall-e"`
// CUDA
// Explicitly enable CUDA or not (some backends might need it)
CUDA bool `yaml:"cuda"`
}
type VallE struct {
@@ -68,16 +65,15 @@ type GRPC struct {
}
type Diffusers struct {
CUDA bool `yaml:"cuda"`
PipelineType string `yaml:"pipeline_type"`
SchedulerType string `yaml:"scheduler_type"`
CUDA bool `yaml:"cuda"`
EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser
ClipSkip int `yaml:"clip_skip"` // Skip every N frames
ClipModel string `yaml:"clip_model"` // Clip model to use
ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model
ControlNet string `yaml:"control_net"`
}
type LLMConfig struct {
@@ -281,7 +277,7 @@ func (cm *ConfigLoader) LoadConfigs(path string) error {
}
for _, file := range files {
// Skip templates, YAML and .keep files
if !strings.Contains(file.Name(), ".yaml") && !strings.Contains(file.Name(), ".yml") {
if !strings.Contains(file.Name(), ".yaml") {
continue
}
c, err := ReadConfig(filepath.Join(path, file.Name()))

View File

@@ -123,12 +123,13 @@ func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
return err
}
model := bm.options.Loader.CheckIsLoaded(backendId)
if model == "" {
client := bm.options.Loader.CheckIsLoaded(backendId)
if client == nil {
return fmt.Errorf("backend %s is not currently loaded", backendId)
}
status, rpcErr := model.GRPC(false, nil).Status(context.TODO())
status, rpcErr := client.Status(context.TODO())
if rpcErr != nil {
log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
val, slbErr := bm.SampleLocalBackendProcess(backendId)

View File

@@ -81,7 +81,7 @@ func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx)
noActionDescription = config.FunctionsConfig.NoActionDescriptionName
}
if input.ResponseFormat.Type == "json_object" {
if input.ResponseFormat == "json_object" {
input.Grammar = grammar.JSONBNF
}

View File

@@ -65,7 +65,7 @@ func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fibe
return fmt.Errorf("failed reading parameters from request:%w", err)
}
if input.ResponseFormat.Type == "json_object" {
if input.ResponseFormat == "json_object" {
input.Grammar = grammar.JSONBNF
}

View File

@@ -5,8 +5,6 @@ import (
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strconv"
@@ -24,26 +22,6 @@ import (
"github.com/rs/zerolog/log"
)
func downloadFile(url string) (string, error) {
// Get the data
resp, err := http.Get(url)
if err != nil {
return "", err
}
defer resp.Body.Close()
// Create the file
out, err := os.CreateTemp("", "image")
if err != nil {
return "", err
}
defer out.Close()
// Write the body to file
_, err = io.Copy(out, resp.Body)
return out.Name(), err
}
// https://platform.openai.com/docs/api-reference/images/create
/*
@@ -78,31 +56,12 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
src := ""
if input.File != "" {
fileData := []byte{}
// check if input.File is an URL, if so download it and save it
// to a temporary file
if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") {
out, err := downloadFile(input.File)
if err != nil {
return fmt.Errorf("failed downloading file:%w", err)
}
defer os.RemoveAll(out)
fileData, err = os.ReadFile(out)
if err != nil {
return fmt.Errorf("failed reading file:%w", err)
}
} else {
// base 64 decode the file and write it somewhere
// that we will cleanup
fileData, err = base64.StdEncoding.DecodeString(input.File)
if err != nil {
return err
}
//base 64 decode the file and write it somewhere
// that we will cleanup
decoded, err := base64.StdEncoding.DecodeString(input.File)
if err != nil {
return err
}
// Create a temporary file
outputFile, err := os.CreateTemp(o.ImageDir, "b64")
if err != nil {
@@ -110,7 +69,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
}
// write the base64 result
writer := bufio.NewWriter(outputFile)
_, err = writer.Write(fileData)
_, err = writer.Write(decoded)
if err != nil {
outputFile.Close()
return err
@@ -141,7 +100,7 @@ func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx
}
b64JSON := false
if input.ResponseFormat.Type == "b64_json" {
if input.ResponseFormat == "b64_json" {
b64JSON = true
}
// src and clip_skip

View File

@@ -4,11 +4,10 @@ import (
"context"
"embed"
"encoding/json"
"time"
"github.com/go-skynet/LocalAI/metrics"
"github.com/go-skynet/LocalAI/pkg/gallery"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/metrics"
"github.com/rs/zerolog/log"
)
@@ -37,13 +36,7 @@ type Option struct {
AutoloadGalleries bool
SingleBackend bool
ParallelBackendRequests bool
WatchDogIdle bool
WatchDogBusy bool
WatchDog bool
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
SingleBackend bool
}
type AppOption func(*Option)
@@ -69,40 +62,10 @@ func WithCors(b bool) AppOption {
}
}
var EnableWatchDog = func(o *Option) {
o.WatchDog = true
}
var EnableWatchDogIdleCheck = func(o *Option) {
o.WatchDog = true
o.WatchDogIdle = true
}
var EnableWatchDogBusyCheck = func(o *Option) {
o.WatchDog = true
o.WatchDogBusy = true
}
func SetWatchDogBusyTimeout(t time.Duration) AppOption {
return func(o *Option) {
o.WatchDogBusyTimeout = t
}
}
func SetWatchDogIdleTimeout(t time.Duration) AppOption {
return func(o *Option) {
o.WatchDogIdleTimeout = t
}
}
var EnableSingleBackend = func(o *Option) {
o.SingleBackend = true
}
var EnableParallelBackendRequests = func(o *Option) {
o.ParallelBackendRequests = true
}
var EnableGalleriesAutoload = func(o *Option) {
o.AutoloadGalleries = true
}

View File

@@ -83,12 +83,6 @@ type OpenAIModel struct {
Object string `json:"object"`
}
type ChatCompletionResponseFormatType string
type ChatCompletionResponseFormat struct {
Type ChatCompletionResponseFormatType `json:"type,omitempty"`
}
type OpenAIRequest struct {
config.PredictionOptions
@@ -98,7 +92,7 @@ type OpenAIRequest struct {
// whisper
File string `json:"file" validate:"required"`
//whisper/image
ResponseFormat ChatCompletionResponseFormat `json:"response_format"`
ResponseFormat string `json:"response_format"`
// image
Size string `json:"size"`
// Prompt is read only by completion/image API calls

View File

@@ -36,7 +36,7 @@ include_directories(${Protobuf_INCLUDE_DIRS})
message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
# Proto file
get_filename_component(hw_proto "../../../../../../backend/backend.proto" ABSOLUTE)
get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
get_filename_component(hw_proto_path "${hw_proto}" PATH)
# Generated sources

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=
LLAMA_VERSION?=d9b33fe95bd257b36c84ee5769cc048230067d6f
CMAKE_ARGS?=
BUILD_TYPE?=
@@ -21,9 +21,6 @@ endif
llama.cpp:
git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
if [ -z "$(LLAMA_VERSION)" ]; then \
exit 1; \
fi
cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
llama.cpp/examples/grpc-server:

View File

@@ -40,17 +40,8 @@ using backend::HealthMessage;
///// LLAMA.CPP server code below
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json;
struct server_params
{
std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
};
using json = nlohmann::json;
static bool server_verbose = false;
@@ -71,10 +62,6 @@ static bool server_verbose = false;
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
json oaicompat_completion_params_parse(const json &body);
std::string format_chatml(std::vector<json> messages);
//
// base64 utils (TODO: move to common in the future)
//
@@ -165,23 +152,15 @@ struct task_server {
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// TODO: can become bool if we can't find use of more states
enum slot_state
{
@@ -386,6 +365,7 @@ struct llama_client_slot
int32_t num_prompt_tokens = 0;
int32_t num_prompt_tokens_processed = 0;
int32_t multibyte_pending = 0;
json prompt;
std::string generated_text;
@@ -401,9 +381,6 @@ struct llama_client_slot
bool stopped_word = false;
bool stopped_limit = false;
bool oaicompat = false;
std::string oaicompat_model;
std::string stopping_word;
// sampling
@@ -423,9 +400,6 @@ struct llama_client_slot
double t_prompt_processing; // ms
double t_token_generation; // ms
// multitasks
int multitask_id = -1;
void reset() {
num_prompt_tokens = 0;
generated_text = "";
@@ -434,6 +408,7 @@ struct llama_client_slot
stopped_word = false;
stopped_limit = false;
stopping_word = "";
multibyte_pending = 0;
n_past = 0;
sent_count = 0;
sent_token_probs_index = 0;
@@ -505,7 +480,7 @@ struct llama_client_slot
};
}
void print_timings() const {
void print_timings() {
LOG_TEE("\n");
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
@@ -529,7 +504,6 @@ struct llama_server_context
bool multimodal = false;
bool clean_kv_cache = true;
bool all_slots_are_idle = false;
bool add_bos_token = true;
int32_t id_gen;
int32_t n_ctx; // total context for all clients / slots
@@ -548,8 +522,7 @@ struct llama_server_context
std::vector<task_server> queue_tasks;
std::vector<task_result> queue_results;
std::vector<task_multi> queue_multitasks;
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
std::mutex mutex_tasks;
std::mutex mutex_results;
~llama_server_context()
@@ -603,8 +576,6 @@ struct llama_server_context
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_should_add_bos_token(model);
return true;
}
@@ -638,11 +609,6 @@ struct llama_server_context
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
{
// TODO: currently, we tokenize using special tokens by default
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
// but it's better compared to completely ignoring ChatML and other chat templates
const bool TMP_FORCE_SPECIAL = true;
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
@@ -658,12 +624,12 @@ struct llama_server_context
std::vector<llama_token> p;
if (first)
{
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
p = ::llama_tokenize(ctx, s, add_bos);
first = false;
}
else
{
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
p = ::llama_tokenize(ctx, s, false);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}
@@ -680,7 +646,7 @@ struct llama_server_context
else
{
auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
}
return prompt_tokens;
@@ -711,20 +677,11 @@ struct llama_server_context
slot_params default_params;
llama_sampling_params default_sparams;
if (data.count("__oaicompat") != 0) {
slot->oaicompat = true;
slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
} else {
slot->oaicompat = false;
slot->oaicompat_model = "";
}
slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
@@ -909,7 +866,7 @@ struct llama_server_context
}
void update_system_prompt() {
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
llama_batch_clear(batch);
@@ -1000,36 +957,35 @@ struct llama_server_context
slot.generated_text += token_str;
slot.has_next_token = true;
// check if there is incomplete UTF-8 character at the end
bool incomplete = false;
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
if (slot.multibyte_pending > 0)
{
unsigned char c = slot.generated_text[slot.generated_text.size() - i];
if ((c & 0xC0) == 0x80)
{
// continuation byte: 10xxxxxx
continue;
}
slot.multibyte_pending -= token_str.size();
}
else if (token_str.size() == 1)
{
const char c = token_str[0];
// 2-byte characters: 110xxxxx 10xxxxxx
if ((c & 0xE0) == 0xC0)
{
// 2-byte character: 110xxxxx ...
incomplete = i < 2;
slot.multibyte_pending = 1;
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
}
else if ((c & 0xF0) == 0xE0)
{
// 3-byte character: 1110xxxx ...
incomplete = i < 3;
slot.multibyte_pending = 2;
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
}
else if ((c & 0xF8) == 0xF0)
{
// 4-byte character: 11110xxx ...
incomplete = i < 4;
slot.multibyte_pending = 3;
}
else
{
slot.multibyte_pending = 0;
}
// else 1-byte character or invalid byte
break;
}
if (!incomplete)
if (slot.multibyte_pending == 0)
{
size_t pos = std::min(slot.sent_count, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos);
@@ -1064,7 +1020,7 @@ struct llama_server_context
}
}
if (incomplete)
if (slot.multibyte_pending > 0 && !slot.has_next_token)
{
slot.has_next_token = true;
}
@@ -1133,40 +1089,16 @@ struct llama_server_context
return slot.images.size() > 0;
}
void send_error(task_server& task, std::string error)
void send_error(int id, std::string error)
{
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = task.id;
res.multitask_id = task.multitask_id;
res.stop = false;
res.id = id;
res.error = true;
res.result_json = { { "content", error } };
queue_results.push_back(res);
}
void add_multi_task(int id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
json get_model_props()
{
return get_formated_generation(slots[0]);
@@ -1184,7 +1116,6 @@ struct llama_server_context
{"temp", slot.sparams.temp},
{"top_k", slot.sparams.top_k},
{"top_p", slot.sparams.top_p},
{"min_p", slot.sparams.min_p},
{"tfs_z", slot.sparams.tfs_z},
{"typical_p", slot.sparams.typical_p},
{"repeat_last_n", slot.sparams.penalty_last_n},
@@ -1211,7 +1142,6 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
res.error = false;
res.stop = false;
@@ -1237,12 +1167,6 @@ struct llama_server_context
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
}
if (slot.oaicompat)
{
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
res.result_json["model"] = slot.oaicompat_model;
}
queue_results.push_back(res);
}
@@ -1251,7 +1175,6 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
res.error = false;
res.stop = true;
@@ -1291,18 +1214,6 @@ struct llama_server_context
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
}
if (slot.oaicompat)
{
res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
res.result_json["model"] = slot.oaicompat_model;
}
// parent multitask, if any, needs to be updated
if (slot.multitask_id != -1)
{
update_multi_task(slot.multitask_id, slot.task_id, res);
}
queue_results.push_back(res);
}
@@ -1311,7 +1222,6 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
res.error = false;
res.stop = true;
@@ -1338,26 +1248,15 @@ struct llama_server_context
queue_results.push_back(res);
}
int request_completion(json data, bool infill, bool embedding, int multitask_id)
int request_completion(json data, bool infill, bool embedding)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
std::lock_guard<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.target_id = 0;
task.data = std::move(data);
task.data = data;
task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = COMPLETION_TASK;
task.multitask_id = multitask_id;
// when a completion task's prompt array is not a singleton, we split it into multiple requests
if (task.data.at("prompt").size() > 1)
{
lock.unlock(); // entering new func scope
return split_multiprompt_task(task);
}
// otherwise, it's a single-prompt task, we actually queue it
queue_tasks.push_back(task);
return task.id;
}
@@ -1376,17 +1275,8 @@ struct llama_server_context
for (int i = 0; i < (int) queue_results.size(); i++)
{
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (queue_results[i].multitask_id == task_id)
{
update_multi_task(task_id, queue_results[i].id, queue_results[i]);
queue_results.erase(queue_results.begin() + i);
continue;
}
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
@@ -1476,27 +1366,6 @@ struct llama_server_context
queue_tasks.push_back(task);
}
int split_multiprompt_task(task_server& multiprompt_task)
{
int prompt_count = multiprompt_task.data.at("prompt").size();
assert(prompt_count > 1);
int multitask_id = id_gen++;
std::vector<int> subtask_ids(prompt_count);
for (int i = 0; i < prompt_count; i++)
{
json subtask_data = multiprompt_task.data;
subtask_data["prompt"] = subtask_data["prompt"][i];
// subtasks inherit everything else (infill mode, embedding mode, etc.)
subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
}
// queue up the multitask so we can track its subtask progression
add_multi_task(multitask_id, subtask_ids);
return multitask_id;
}
void process_tasks()
{
std::lock_guard<std::mutex> lock(mutex_tasks);
@@ -1512,7 +1381,7 @@ struct llama_server_context
{
LOG_TEE("slot unavailable\n");
// send error result
send_error(task, "slot unavailable");
send_error(task.id, "slot unavailable");
return;
}
@@ -1526,12 +1395,11 @@ struct llama_server_context
slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id;
slot->multitask_id = task.multitask_id;
if (!launch_slot_with_data(slot, task.data))
{
// send error result
send_error(task, "internal_error");
send_error(task.id, "internal_error");
break;
}
} break;
@@ -1547,38 +1415,6 @@ struct llama_server_context
} break;
}
}
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_result aggregate_result;
aggregate_result.id = queue_iterator->id;
aggregate_result.stop = true;
aggregate_result.error = false;
// collect json results into one json result
std::vector<json> result_jsons;
for (auto& subres : queue_iterator->results)
{
result_jsons.push_back(subres.result_json);
aggregate_result.error = aggregate_result.error && subres.error;
}
aggregate_result.result_json = json{ "results", result_jsons };
std::lock_guard<std::mutex> lock(mutex_results);
queue_results.push_back(aggregate_result);
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
}
bool update_slots() {
@@ -1717,40 +1553,11 @@ struct llama_server_context
}
else
{
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
}
slot.num_prompt_tokens = prompt_tokens.size();
if (slot.params.n_keep < 0)
{
slot.params.n_keep = slot.num_prompt_tokens;
}
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
// if input prompt is too big, truncate it
if (slot.num_prompt_tokens >= slot.n_ctx)
{
const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2;
const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
LOG_VERBOSE("input truncated", {
{"n_ctx", slot.n_ctx},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
});
slot.truncated = true;
prompt_tokens = new_tokens;
slot.num_prompt_tokens = prompt_tokens.size();
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
}
if (!slot.params.cache_prompt)
{
llama_sampling_reset(slot.ctx_sampling);
@@ -1760,6 +1567,35 @@ struct llama_server_context
}
else
{
if (slot.params.n_keep < 0)
{
slot.params.n_keep = slot.num_prompt_tokens;
}
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
// if input prompt is too big, truncate it
if (slot.num_prompt_tokens >= slot.n_ctx)
{
const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2;
const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
LOG_VERBOSE("input truncated", {
{"n_ctx", slot.n_ctx},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
});
slot.truncated = true;
prompt_tokens = new_tokens;
slot.num_prompt_tokens = prompt_tokens.size();
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
}
// push the prompt into the sampling context (do not apply grammar)
for (auto &token : prompt_tokens)
{
@@ -1794,7 +1630,7 @@ struct llama_server_context
const bool has_images = process_images(slot);
// process the prefix of first image
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
{
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
@@ -1914,231 +1750,6 @@ struct llama_server_context
};
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::string result(32, ' ');
for (int i = 0; i < 32; ++i) {
result[i] = str[generator() % str.size()];
}
return result;
}
static std::string gen_chatcmplid()
{
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}
std::string format_chatml(std::vector<json> messages)
{
std::ostringstream chatml_msgs;
for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
}
chatml_msgs << "<|im_start|>assistant" << '\n';
return chatml_msgs.str();
}
/* llama.cpp completion api semantics */
json oaicompat_completion_params_parse(
const json &body /* openai api json semantics */)
{
json llama_params;
llama_params["__oaicompat"] = true;
// Map OpenAI parameters to llama.cpp parameters
llama_params["model"] = json_value(body, "model", std::string("uknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.8);
llama_params["top_k"] = json_value(body, "top_k", 40);
llama_params["top_p"] = json_value(body, "top_p", 0.95);
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", 0);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["mirostat"] = json_value(body, "mirostat", false);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
if (llama_params.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}
// Handle 'stop' field
if (body.contains("stop") && body["stop"].is_string()) {
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Ensure there is ChatML-specific end sequence among stop words
llama_params["stop"].push_back("<|im_end|>");
return llama_params;
}
static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
json result = response.result_json;
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res =
json{{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage",
json{{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
{"id", gen_chatcmplid()}};
if (server_verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
return res;
}
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
json result = response.result_json;
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({response.result_json});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in task processing logic makes several trailing calls
// with empty content, we ignore these at the calee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json{{"choices", choices},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({ret});
}
static json format_partial_response(
llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
@@ -2171,6 +1782,8 @@ static json format_detokenized_response(std::string content)
{"content", content}};
}
struct token_translator
{
llama_context * ctx;
@@ -2366,7 +1979,7 @@ static void params_parse(const backend::ModelOptions* request,
// params.model_alias ??
params.model_alias = request->modelfile();
params.n_ctx = request->contextsize();
//params.memory_f16 = request->f16memory();
params.memory_f16 = request->f16memory();
params.n_threads = request->threads();
params.n_gpu_layers = request->ngpulayers();
params.n_batch = request->nbatch();
@@ -2473,7 +2086,7 @@ public:
}
grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
json data = parse_options(true, request, llama);
const int task_id = llama.request_completion(data, false, false, -1);
const int task_id = llama.request_completion(data, false, false);
while (true)
{
task_result result = llama.next_result(task_id);
@@ -2509,7 +2122,7 @@ public:
grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
json data = parse_options(false, request, llama);
const int task_id = llama.request_completion(data, false, false, -1);
const int task_id = llama.request_completion(data, false, false);
std::string completion_text;
task_result result = llama.next_result(task_id);
if (!result.error && result.stop) {

View File

@@ -1,21 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File

@@ -1,21 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Whisper{}); err != nil {
panic(err)
}
}

View File

@@ -1,21 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Piper{}); err != nil {
panic(err)
}
}

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,17 +0,0 @@
.PHONY: ttsbark
ttsbark:
@echo "Creating virtual environment..."
@conda env create --name ttsbark --file ttsbark.yml
@echo "Virtual environment created."
.PHONY: run
run:
@echo "Running bark..."
bash run.sh
@echo "bark run."
.PHONY: test
test:
@echo "Testing bark..."
bash test.sh
@echo "bark tested."

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,81 +0,0 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "ttsbark.py", "--addr", "localhost:50051"])
time.sleep(10)
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.terminate()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="v2/en_speaker_4"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_tts(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="v2/en_speaker_4"))
self.assertTrue(response.success)
tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
tts_response = stub.TTS(tts_request)
self.assertIsNotNone(tts_response)
except Exception as err:
print(err)
self.fail("TTS service failed")
finally:
self.tearDown()

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,84 +0,0 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "backend_diffusers.py", "--addr", "localhost:50051"])
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.kill()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
time.sleep(10)
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
time.sleep(10)
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test(self):
"""
This method tests if the backend can generate images
"""
time.sleep(10)
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
print(response.message)
self.assertTrue(response.success)
image_req = backend_pb2.GenerateImageRequest(positive_prompt="cat", width=16,height=16, dst="test.jpg")
re = stub.GenerateImage(image_req)
self.assertTrue(re.success)
except Exception as err:
print(err)
self.fail("Image gen service failed")
finally:
self.tearDown()

View File

@@ -1,14 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the diffusers server with conda
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate diffusers
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python -m unittest $DIR/test.py

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,15 +0,0 @@
#!/bin/bash
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate exllama
echo $CONDA_PREFIX
git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd
cp -rfv $CONDA_PREFIX/exllama/* ./

View File

@@ -1,12 +0,0 @@
.PHONY: exllama2
exllama2:
@echo "Creating virtual environment..."
@conda env create --name exllama2 --file exllama2.yml
@echo "Virtual environment created."
bash install.sh
.PHONY: run
run:
@echo "Running exllama2..."
bash run.sh
@echo "exllama2 run."

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,57 +0,0 @@
name: exllama2
channels:
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=5.1=1_gnu
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2023.08.22=h06a4308_0
- ld_impl_linux-64=2.38=h1181459_1
- libffi=3.4.4=h6a678d5_0
- libgcc-ng=11.2.0=h1234567_1
- libgomp=11.2.0=h1234567_1
- libstdcxx-ng=11.2.0=h1234567_1
- libuuid=1.41.5=h5eee18b_0
- ncurses=6.4=h6a678d5_0
- openssl=3.0.11=h7f8727e_2
- pip=23.2.1=py311h06a4308_0
- python=3.11.5=h955ad1f_0
- readline=8.2=h5eee18b_0
- setuptools=68.0.0=py311h06a4308_0
- sqlite=3.41.2=h5eee18b_0
- tk=8.6.12=h1ccaba5_0
- tzdata=2023c=h04d1e81_0
- wheel=0.41.2=py311h06a4308_0
- xz=5.4.2=h5eee18b_0
- zlib=1.2.13=h5eee18b_0
- pip:
- filelock==3.12.4
- fsspec==2023.9.2
- grpcio==1.59.0
- markupsafe==2.1.3
- mpmath==1.3.0
- networkx==3.1
- protobuf==4.24.4
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu12==8.9.2.26
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- pandas
- numpy
- ninja
- fastparquet
- torch>=2.1.0
- safetensors>=0.3.2
- sentencepiece>=0.1.97
- pygments
- websockets
- regex
prefix: /opt/conda/envs/exllama2

View File

@@ -1,134 +0,0 @@
#!/usr/bin/env python3
import grpc
from concurrent import futures
import time
import backend_pb2
import backend_pb2_grpc
import argparse
import signal
import sys
import os, glob
from pathlib import Path
import torch
import torch.nn.functional as F
from torch import version as torch_version
from exllamav2.generator import (
ExLlamaV2BaseGenerator,
ExLlamaV2Sampler
)
from exllamav2 import(
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Tokenizer,
model_init,
)
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
def Health(self, request, context):
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
try:
model_directory = request.ModelFile
config = ExLlamaV2Config()
config.model_dir = model_directory
config.prepare()
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy = True)
model.load_autosplit(cache)
tokenizer = ExLlamaV2Tokenizer(config)
# Initialize generator
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
self.generator= generator
generator.warmup()
self.model = model
self.tokenizer = tokenizer
self.cache = cache
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Predict(self, request, context):
penalty = 1.15
if request.Penalty != 0.0:
penalty = request.Penalty
settings = ExLlamaV2Sampler.Settings()
settings.temperature = request.Temperature
settings.top_k = request.TopK
settings.top_p = request.TopP
settings.token_repetition_penalty = penalty
settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
tokens = 512
if request.Tokens != 0:
tokens = request.Tokens
output = self.generator.generate_simple(request.Prompt, settings, tokens, seed = self.seed)
# Remove prompt from response if present
if request.Prompt in output:
output = output.replace(request.Prompt, "")
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
def PredictStream(self, request, context):
# Implement PredictStream RPC
#for reply in some_data_generator():
# yield reply
# Not implemented yet
return self.Predict(request, context)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)

View File

@@ -1,14 +0,0 @@
#!/bin/bash
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate exllama2
echo $CONDA_PREFIX
git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2 && pushd $CONDA_PREFIX/exllamav2 && pip install -r requirements.txt && popd
cp -rfv $CONDA_PREFIX/exllamav2/* ./

View File

@@ -1,11 +0,0 @@
.PHONY: petals
petals:
@echo "Creating virtual environment..."
@conda env create --name petals --file petals.yml
@echo "Virtual environment created."
.PHONY: run
run:
@echo "Running petals..."
bash run.sh
@echo "petals run."

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,140 +0,0 @@
#!/usr/bin/env python3
from concurrent import futures
import time
import argparse
import signal
import sys
import os
import backend_pb2
import backend_pb2_grpc
import grpc
import torch
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
A gRPC servicer that implements the Backend service defined in backend.proto.
"""
def Health(self, request, context):
"""
Returns a health check message.
Args:
request: The health check request.
context: The gRPC context.
Returns:
backend_pb2.Reply: The health check reply.
"""
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
"""
Loads a language model.
Args:
request: The load model request.
context: The gRPC context.
Returns:
backend_pb2.Result: The load model result.
"""
try:
self.tokenizer = AutoTokenizer.from_pretrained(request.Model, use_fast=False, add_bos_token=False)
self.model = AutoDistributedModelForCausalLM.from_pretrained(request.Model)
self.cuda = False
if request.CUDA:
self.model = self.model.cuda()
self.cuda = True
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Predict(self, request, context):
"""
Generates text based on the given prompt and sampling parameters.
Args:
request: The predict request.
context: The gRPC context.
Returns:
backend_pb2.Result: The predict result.
"""
inputs = self.tokenizer(request.Prompt, return_tensors="pt")["input_ids"]
if self.cuda:
inputs = inputs.cuda()
if request.Tokens == 0:
# Max to max value if tokens are not specified
request.Tokens = 8192
# TODO: kwargs and map all parameters
outputs = self.model.generate(inputs, max_new_tokens=request.Tokens)
generated_text = self.tokenizer.decode(outputs[0])
# Remove prompt from response if present
if request.Prompt in generated_text:
generated_text = generated_text.replace(request.Prompt, "")
return backend_pb2.Result(message=bytes(generated_text, encoding='utf-8'))
def PredictStream(self, request, context):
"""
Generates text based on the given prompt and sampling parameters, and streams the results.
Args:
request: The predict stream request.
context: The gRPC context.
Returns:
backend_pb2.Result: The predict stream result.
"""
# Implement PredictStream RPC
#for reply in some_data_generator():
# yield reply
# Not implemented yet
return self.Predict(request, context)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)

View File

@@ -1,30 +0,0 @@
name: petals
channels:
- defaults
dependencies:
# - _libgcc_mutex=0.1=main
# - _openmp_mutex=5.1=1_gnu
# - bzip2=1.0.8=h7b6447c_0
# - ca-certificates=2023.08.22=h06a4308_0
# - ld_impl_linux-64=2.38=h1181459_1
# - libffi=3.4.4=h6a678d5_0
# - libgcc-ng=11.2.0=h1234567_1
# - libgomp=11.2.0=h1234567_1
# - libstdcxx-ng=11.2.0=h1234567_1
# - libuuid=1.41.5=h5eee18b_0
# - ncurses=6.4=h6a678d5_0
# - openssl=3.0.11=h7f8727e_2
# - pip=23.2.1=py311h06a4308_0
- python=3.11.5=h955ad1f_0
# - readline=8.2=h5eee18b_0
# - setuptools=68.0.0=py311h06a4308_0
# - sqlite=3.41.2=h5eee18b_0
# - tk=8.6.12=h1ccaba5_0
# - tzdata=2023c=h04d1e81_0
# - wheel=0.41.2=py311h06a4308_0
# - xz=5.4.2=h5eee18b_0
# - zlib=1.2.13=h5eee18b_0
- pip:
- torch==2.1.0
- git+https://github.com/bigscience-workshop/petals
prefix: /opt/conda/envs/petals

View File

@@ -1,21 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the exllama server with conda
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
# if source is available use it, or use conda
#
if [ -f /opt/conda/bin/activate ]; then
source activate petals
else
eval "$(conda shell.bash hook)"
conda activate petals
fi
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python $DIR/backend_petals.py $@

View File

@@ -1,18 +0,0 @@
.PHONY: sentencetransformers
sentencetransformers:
@echo "Creating virtual environment..."
@conda env create --name sentencetransformers --file sentencetransformers.yml
@echo "Virtual environment created."
.PHONY: run
run:
@echo "Running sentencetransformers..."
bash run.sh
@echo "sentencetransformers run."
# It is not working well by using command line. It only6 works with IDE like VSCode.
.PHONY: test
test:
@echo "Testing sentencetransformers..."
bash test.sh
@echo "sentencetransformers tested."

View File

@@ -1,5 +0,0 @@
# Creating a separate environment for the sentencetransformers project
```
make sentencetransformers
```

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,14 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the sentencetransformers server with conda
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate sentencetransformers
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python $DIR/sentencetransformers.py $@

View File

@@ -1,77 +0,0 @@
name: sentencetransformers
channels:
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=5.1=1_gnu
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2023.08.22=h06a4308_0
- ld_impl_linux-64=2.38=h1181459_1
- libffi=3.4.4=h6a678d5_0
- libgcc-ng=11.2.0=h1234567_1
- libgomp=11.2.0=h1234567_1
- libstdcxx-ng=11.2.0=h1234567_1
- libuuid=1.41.5=h5eee18b_0
- ncurses=6.4=h6a678d5_0
- openssl=3.0.11=h7f8727e_2
- pip=23.2.1=py311h06a4308_0
- python=3.11.5=h955ad1f_0
- readline=8.2=h5eee18b_0
- setuptools=68.0.0=py311h06a4308_0
- sqlite=3.41.2=h5eee18b_0
- tk=8.6.12=h1ccaba5_0
- tzdata=2023c=h04d1e81_0
- wheel=0.41.2=py311h06a4308_0
- xz=5.4.2=h5eee18b_0
- zlib=1.2.13=h5eee18b_0
- pip:
- certifi==2023.7.22
- charset-normalizer==3.3.0
- click==8.1.7
- filelock==3.12.4
- fsspec==2023.9.2
- grpcio==1.59.0
- huggingface-hub==0.17.3
- idna==3.4
- install==1.3.5
- jinja2==3.1.2
- joblib==1.3.2
- markupsafe==2.1.3
- mpmath==1.3.0
- networkx==3.1
- nltk==3.8.1
- numpy==1.26.0
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu12==8.9.2.26
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- packaging==23.2
- pillow==10.0.1
- protobuf==4.24.4
- pyyaml==6.0.1
- regex==2023.10.3
- requests==2.31.0
- safetensors==0.4.0
- scikit-learn==1.3.1
- scipy==1.11.3
- sentence-transformers==2.2.2
- sentencepiece==0.1.99
- sympy==1.12
- threadpoolctl==3.2.0
- tokenizers==0.14.1
- torch==2.1.0
- torchvision==0.16.0
- tqdm==4.66.1
- transformers==4.34.0
- triton==2.1.0
- typing-extensions==4.8.0
- urllib3==2.0.6
prefix: /opt/conda/envs/sentencetransformers

View File

@@ -1,11 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the sentencetransformers server with conda
# Activate conda environment
source activate sentencetransformers
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python -m unittest $DIR/test_sentencetransformers.py

View File

@@ -1,24 +0,0 @@
TRANSFORMERS_MUSICGEN_CONDA_PATH = "transformers-musicgen.yml"
ifeq ($(BUILD_TYPE), cublas)
TRANSFORMERS_MUSICGEN_CONDA_PATH = "transformers-musicgen-nvidia.yml"
endif
.PHONY: transformers-musicgen
transformers-musicgen:
@echo "Creating virtual environment..."
@conda env create --name transformers-musicgen --file $(TRANSFORMERS_MUSICGEN_CONDA_PATH)
@echo "Virtual environment created."
.PHONY: run
run:
@echo "Running transformers..."
bash run.sh
@echo "transformers run."
.PHONY: test
test:
@echo "Testing transformers..."
bash test.sh
@echo "transformers tested."

View File

@@ -1,5 +0,0 @@
# Creating a separate environment for the transformers project
```
make transformers-musicgen
```

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import backend_pb2 as backend__pb2
class BackendStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.Health = channel.unary_unary(
'/backend.Backend/Health',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Predict = channel.unary_unary(
'/backend.Backend/Predict',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.LoadModel = channel.unary_unary(
'/backend.Backend/LoadModel',
request_serializer=backend__pb2.ModelOptions.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.PredictStream = channel.unary_stream(
'/backend.Backend/PredictStream',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Embedding = channel.unary_unary(
'/backend.Backend/Embedding',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.EmbeddingResult.FromString,
)
self.GenerateImage = channel.unary_unary(
'/backend.Backend/GenerateImage',
request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.AudioTranscription = channel.unary_unary(
'/backend.Backend/AudioTranscription',
request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
response_deserializer=backend__pb2.TranscriptResult.FromString,
)
self.TTS = channel.unary_unary(
'/backend.Backend/TTS',
request_serializer=backend__pb2.TTSRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.TokenizeString = channel.unary_unary(
'/backend.Backend/TokenizeString',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.TokenizationResponse.FromString,
)
self.Status = channel.unary_unary(
'/backend.Backend/Status',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.StatusResponse.FromString,
)
class BackendServicer(object):
"""Missing associated documentation comment in .proto file."""
def Health(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Predict(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def LoadModel(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def PredictStream(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Embedding(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GenerateImage(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def AudioTranscription(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TTS(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TokenizeString(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Status(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_BackendServicer_to_server(servicer, server):
rpc_method_handlers = {
'Health': grpc.unary_unary_rpc_method_handler(
servicer.Health,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Predict': grpc.unary_unary_rpc_method_handler(
servicer.Predict,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'LoadModel': grpc.unary_unary_rpc_method_handler(
servicer.LoadModel,
request_deserializer=backend__pb2.ModelOptions.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'PredictStream': grpc.unary_stream_rpc_method_handler(
servicer.PredictStream,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Embedding': grpc.unary_unary_rpc_method_handler(
servicer.Embedding,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
),
'GenerateImage': grpc.unary_unary_rpc_method_handler(
servicer.GenerateImage,
request_deserializer=backend__pb2.GenerateImageRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'AudioTranscription': grpc.unary_unary_rpc_method_handler(
servicer.AudioTranscription,
request_deserializer=backend__pb2.TranscriptRequest.FromString,
response_serializer=backend__pb2.TranscriptResult.SerializeToString,
),
'TTS': grpc.unary_unary_rpc_method_handler(
servicer.TTS,
request_deserializer=backend__pb2.TTSRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'TokenizeString': grpc.unary_unary_rpc_method_handler(
servicer.TokenizeString,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
),
'Status': grpc.unary_unary_rpc_method_handler(
servicer.Status,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.StatusResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'backend.Backend', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class Backend(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def Health(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Predict(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def LoadModel(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
backend__pb2.ModelOptions.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def PredictStream(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Embedding(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.EmbeddingResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def GenerateImage(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
backend__pb2.GenerateImageRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def AudioTranscription(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
backend__pb2.TranscriptRequest.SerializeToString,
backend__pb2.TranscriptResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TTS(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
backend__pb2.TTSRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TokenizeString(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.TokenizationResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Status(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.StatusResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

View File

@@ -1,16 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the transformers-musicgen server with conda
echo "Launching gRPC server for transformers-musicgen"
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers-musicgen
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python $DIR/transformers_server.py $@

View File

@@ -1,11 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the transformers server with conda
# Activate conda environment
source activate transformers-musicgen
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python -m unittest $DIR/test_transformers.py

View File

@@ -1,81 +0,0 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "transformers_server.py", "--addr", "localhost:50051"])
time.sleep(10)
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.terminate()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_tts(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/musicgen-small"))
self.assertTrue(response.success)
tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
tts_response = stub.TTS(tts_request)
self.assertIsNotNone(tts_response)
except Exception as err:
print(err)
self.fail("TTS service failed")
finally:
self.tearDown()

View File

@@ -1,71 +0,0 @@
name: transformers-musicgen
channels:
- defaults
dependencies:
- bzip2=1.0.8
- ca-certificates=2023.08.22
- libffi=3.4.4
- libuuid=1.41.5
- ncurses=6.4
- openssl=3.0.11
- pip=23.2.1
- python=3.11.5
- readline=8.2
- setuptools=68.0.0
- sqlite=3.41.2
- tk=8.6.12
- tzdata=2023c
- wheel=0.41.2
- xz=5.4.2
- zlib=1.2.13
- pip:
- certifi==2023.7.22
- charset-normalizer==3.3.0
- click==8.1.7
- filelock==3.12.4
- fsspec==2023.9.2
- grpcio==1.59.0
- huggingface-hub==0.17.3
- idna==3.4
- install==1.3.5
- jinja2==3.1.2
- joblib==1.3.2
- markupsafe==2.1.3
- mpmath==1.3.0
- networkx==3.1
- nltk==3.8.1
- numpy==1.26.0
- nvidia-cublas-cu12==12.1.3.1
- nvidia-cuda-cupti-cu12==12.1.105
- nvidia-cuda-nvrtc-cu12==12.1.105
- nvidia-cuda-runtime-cu12==12.1.105
- nvidia-cudnn-cu12==8.9.2.26
- nvidia-cufft-cu12==11.0.2.54
- nvidia-curand-cu12==10.3.2.106
- nvidia-cusolver-cu12==11.4.5.107
- nvidia-cusparse-cu12==12.1.0.106
- nvidia-nccl-cu12==2.18.1
- nvidia-nvjitlink-cu12==12.2.140
- nvidia-nvtx-cu12==12.1.105
- packaging==23.2
- pillow==10.0.1
- protobuf==4.24.4
- pyyaml==6.0.1
- regex==2023.10.3
- requests==2.31.0
- safetensors==0.4.0
- scikit-learn==1.3.1
- scipy==1.11.3
- sentence-transformers==2.2.2
- sentencepiece==0.1.99
- sympy==1.12
- threadpoolctl==3.2.0
- tokenizers==0.14.1
- torch==2.1.0
- torchvision==0.16.0
- tqdm==4.66.1
- transformers==4.34.0
- triton==2.1.0
- typing-extensions==4.8.0
- urllib3==2.0.6
prefix: /opt/conda/envs/transformers-musicgen

View File

@@ -1,58 +0,0 @@
name: transformers-musicgen
channels:
- defaults
dependencies:
- bzip2=1.0.8
- ca-certificates=2023.08.22
- libffi=3.4.4
- libuuid=1.41.5
- ncurses=6.4
- openssl=3.0.11
- pip=23.2.1
- python=3.11.5
- readline=8.2
- setuptools=68.0.0
- sqlite=3.41.2
- tk=8.6.12
- tzdata=2023c
- wheel=0.41.2
- xz=5.4.2
- zlib=1.2.13
- pip:
- certifi==2023.7.22
- charset-normalizer==3.3.0
- click==8.1.7
- filelock==3.12.4
- fsspec==2023.9.2
- grpcio==1.59.0
- huggingface-hub==0.17.3
- idna==3.4
- install==1.3.5
- jinja2==3.1.2
- joblib==1.3.2
- markupsafe==2.1.3
- mpmath==1.3.0
- networkx==3.1
- nltk==3.8.1
- numpy==1.26.0
- packaging==23.2
- pillow==10.0.1
- protobuf==4.24.4
- pyyaml==6.0.1
- regex==2023.10.3
- requests==2.31.0
- safetensors==0.4.0
- scikit-learn==1.3.1
- scipy==1.11.3
- sentence-transformers==2.2.2
- sentencepiece==0.1.99
- sympy==1.12
- threadpoolctl==3.2.0
- tokenizers==0.14.1
- torch==2.1.0
- torchvision==0.16.0
- tqdm==4.66.1
- transformers==4.34.0
- typing-extensions==4.8.0
- urllib3==2.0.6
prefix: /opt/conda/envs/transformers-musicgen

View File

@@ -1,122 +0,0 @@
#!/usr/bin/env python3
"""
Extra gRPC server for MusicgenForConditionalGeneration models.
"""
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import backend_pb2
import backend_pb2_grpc
import grpc
from scipy.io.wavfile import write as write_wav
from transformers import AutoProcessor, MusicgenForConditionalGeneration
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
A gRPC servicer for the backend service.
This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
"""
def Health(self, request, context):
"""
A gRPC method that returns the health status of the backend service.
Args:
request: A HealthRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Reply object that contains the health status of the backend service.
"""
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
"""
A gRPC method that loads a model into memory.
Args:
request: A LoadModelRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Result object that contains the result of the LoadModel operation.
"""
model_name = request.Model
try:
self.processor = AutoProcessor.from_pretrained(model_name)
self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def TTS(self, request, context):
model_name = request.model
if model_name == "":
return backend_pb2.Result(success=False, message="request.model is required")
try:
self.processor = AutoProcessor.from_pretrained(model_name)
self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
inputs = self.processor(
text=[request.text],
padding=True,
return_tensors="pt",
)
tokens = 256
# TODO get tokens from request?
audio_values = self.model.generate(**inputs, max_new_tokens=tokens)
print("[transformers-musicgen] TTS generated!", file=sys.stderr)
sampling_rate = self.model.config.audio_encoder.sampling_rate
write_wav(request.dst, rate=sampling_rate, data=audio_values[0, 0].numpy())
print("[transformers-musicgen] TTS saved to", request.dst, file=sys.stderr)
print("[transformers-musicgen] TTS for", file=sys.stderr)
print(request, file=sys.stderr)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(success=True)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("[transformers-musicgen] Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("[transformers-musicgen] Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
print(f"[transformers-musicgen] startup: {args}", file=sys.stderr)
serve(args.addr)

View File

@@ -1,18 +0,0 @@
.PHONY: transformers
transformers:
@echo "Creating virtual environment..."
@conda env create --name transformers --file transformers.yml
@echo "Virtual environment created."
.PHONY: run
run:
@echo "Running transformers..."
bash run.sh
@echo "transformers run."
# It is not working well by using command line. It only6 works with IDE like VSCode.
.PHONY: test
test:
@echo "Testing transformers..."
bash test.sh
@echo "transformers tested."

View File

@@ -1,5 +0,0 @@
# Creating a separate environment for the transformers project
```
make transformers
```

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import backend_pb2 as backend__pb2
class BackendStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.Health = channel.unary_unary(
'/backend.Backend/Health',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Predict = channel.unary_unary(
'/backend.Backend/Predict',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.LoadModel = channel.unary_unary(
'/backend.Backend/LoadModel',
request_serializer=backend__pb2.ModelOptions.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.PredictStream = channel.unary_stream(
'/backend.Backend/PredictStream',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Embedding = channel.unary_unary(
'/backend.Backend/Embedding',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.EmbeddingResult.FromString,
)
self.GenerateImage = channel.unary_unary(
'/backend.Backend/GenerateImage',
request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.AudioTranscription = channel.unary_unary(
'/backend.Backend/AudioTranscription',
request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
response_deserializer=backend__pb2.TranscriptResult.FromString,
)
self.TTS = channel.unary_unary(
'/backend.Backend/TTS',
request_serializer=backend__pb2.TTSRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.TokenizeString = channel.unary_unary(
'/backend.Backend/TokenizeString',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.TokenizationResponse.FromString,
)
self.Status = channel.unary_unary(
'/backend.Backend/Status',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.StatusResponse.FromString,
)
class BackendServicer(object):
"""Missing associated documentation comment in .proto file."""
def Health(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Predict(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def LoadModel(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def PredictStream(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Embedding(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GenerateImage(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def AudioTranscription(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TTS(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TokenizeString(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Status(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_BackendServicer_to_server(servicer, server):
rpc_method_handlers = {
'Health': grpc.unary_unary_rpc_method_handler(
servicer.Health,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Predict': grpc.unary_unary_rpc_method_handler(
servicer.Predict,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'LoadModel': grpc.unary_unary_rpc_method_handler(
servicer.LoadModel,
request_deserializer=backend__pb2.ModelOptions.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'PredictStream': grpc.unary_stream_rpc_method_handler(
servicer.PredictStream,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Embedding': grpc.unary_unary_rpc_method_handler(
servicer.Embedding,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
),
'GenerateImage': grpc.unary_unary_rpc_method_handler(
servicer.GenerateImage,
request_deserializer=backend__pb2.GenerateImageRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'AudioTranscription': grpc.unary_unary_rpc_method_handler(
servicer.AudioTranscription,
request_deserializer=backend__pb2.TranscriptRequest.FromString,
response_serializer=backend__pb2.TranscriptResult.SerializeToString,
),
'TTS': grpc.unary_unary_rpc_method_handler(
servicer.TTS,
request_deserializer=backend__pb2.TTSRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'TokenizeString': grpc.unary_unary_rpc_method_handler(
servicer.TokenizeString,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
),
'Status': grpc.unary_unary_rpc_method_handler(
servicer.Status,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.StatusResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'backend.Backend', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class Backend(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def Health(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Predict(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def LoadModel(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
backend__pb2.ModelOptions.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def PredictStream(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Embedding(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.EmbeddingResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def GenerateImage(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
backend__pb2.GenerateImageRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def AudioTranscription(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
backend__pb2.TranscriptRequest.SerializeToString,
backend__pb2.TranscriptResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TTS(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
backend__pb2.TTSRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TokenizeString(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.TokenizationResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Status(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.StatusResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

View File

@@ -1,14 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the transformers server with conda
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate transformers
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python $DIR/transformers_server.py $@

View File

@@ -1,11 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the transformers server with conda
# Activate conda environment
source activate transformers
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python -m unittest $DIR/test_transformers_server.py

View File

@@ -1,84 +0,0 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "transformers_server.py", "--addr", "localhost:50051"])
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.kill()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
time.sleep(10)
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
time.sleep(10)
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_embedding(self):
"""
This method tests if the embeddings are generated successfully
"""
time.sleep(10)
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="bert-base-cased"))
print(response.message)
self.assertTrue(response.success)
embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
embedding_response = stub.Embedding(embedding_request)
self.assertIsNotNone(embedding_response.embeddings)
except Exception as err:
print(err)
self.fail("Embedding service failed")
finally:
self.tearDown()

View File

@@ -1,147 +0,0 @@
#!/usr/bin/env python3
"""
Extra gRPC server for HuggingFace AutoModel models.
"""
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import backend_pb2
import backend_pb2_grpc
import grpc
import torch
from transformers import AutoTokenizer, AutoModel
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
def mean_pooling(model_output, attention_mask):
"""
Mean pooling to get sentence embeddings. See:
https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
"""
token_embeddings = model_output[0]
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) # Sum columns
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
return sum_embeddings / sum_mask
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
A gRPC servicer for the backend service.
This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
"""
def Health(self, request, context):
"""
A gRPC method that returns the health status of the backend service.
Args:
request: A HealthRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Reply object that contains the health status of the backend service.
"""
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
"""
A gRPC method that loads a model into memory.
Args:
request: A LoadModelRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Result object that contains the result of the LoadModel operation.
"""
model_name = request.Model
try:
self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True) # trust_remote_code is needed to use the encode method with embeddings models like jinai-v2
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
if request.CUDA:
try:
# TODO: also tensorflow, make configurable
import torch.cuda
if torch.cuda.is_available():
print("Loading model", model_name, "to CUDA.", file=sys.stderr)
self.model = self.model.to("cuda")
except Exception as err:
print("Not using CUDA:", err, file=sys.stderr)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service
# Replace this with your desired response
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Embedding(self, request, context):
"""
A gRPC method that calculates embeddings for a given sentence.
Args:
request: An EmbeddingRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
An EmbeddingResult object that contains the calculated embeddings.
"""
# Tokenize input
max_length = 512
if request.Tokens != 0:
max_length = request.Tokens
encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
# Create word embeddings
model_output = self.model(**encoded_input)
# Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
print("Embeddings:", sentence_embeddings, file=sys.stderr)
return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import backend_pb2 as backend__pb2
class BackendStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.Health = channel.unary_unary(
'/backend.Backend/Health',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Predict = channel.unary_unary(
'/backend.Backend/Predict',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.LoadModel = channel.unary_unary(
'/backend.Backend/LoadModel',
request_serializer=backend__pb2.ModelOptions.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.PredictStream = channel.unary_stream(
'/backend.Backend/PredictStream',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Embedding = channel.unary_unary(
'/backend.Backend/Embedding',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.EmbeddingResult.FromString,
)
self.GenerateImage = channel.unary_unary(
'/backend.Backend/GenerateImage',
request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.AudioTranscription = channel.unary_unary(
'/backend.Backend/AudioTranscription',
request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
response_deserializer=backend__pb2.TranscriptResult.FromString,
)
self.TTS = channel.unary_unary(
'/backend.Backend/TTS',
request_serializer=backend__pb2.TTSRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.TokenizeString = channel.unary_unary(
'/backend.Backend/TokenizeString',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.TokenizationResponse.FromString,
)
self.Status = channel.unary_unary(
'/backend.Backend/Status',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.StatusResponse.FromString,
)
class BackendServicer(object):
"""Missing associated documentation comment in .proto file."""
def Health(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Predict(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def LoadModel(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def PredictStream(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Embedding(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GenerateImage(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def AudioTranscription(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TTS(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TokenizeString(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Status(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_BackendServicer_to_server(servicer, server):
rpc_method_handlers = {
'Health': grpc.unary_unary_rpc_method_handler(
servicer.Health,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Predict': grpc.unary_unary_rpc_method_handler(
servicer.Predict,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'LoadModel': grpc.unary_unary_rpc_method_handler(
servicer.LoadModel,
request_deserializer=backend__pb2.ModelOptions.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'PredictStream': grpc.unary_stream_rpc_method_handler(
servicer.PredictStream,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Embedding': grpc.unary_unary_rpc_method_handler(
servicer.Embedding,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
),
'GenerateImage': grpc.unary_unary_rpc_method_handler(
servicer.GenerateImage,
request_deserializer=backend__pb2.GenerateImageRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'AudioTranscription': grpc.unary_unary_rpc_method_handler(
servicer.AudioTranscription,
request_deserializer=backend__pb2.TranscriptRequest.FromString,
response_serializer=backend__pb2.TranscriptResult.SerializeToString,
),
'TTS': grpc.unary_unary_rpc_method_handler(
servicer.TTS,
request_deserializer=backend__pb2.TTSRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'TokenizeString': grpc.unary_unary_rpc_method_handler(
servicer.TokenizeString,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
),
'Status': grpc.unary_unary_rpc_method_handler(
servicer.Status,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.StatusResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'backend.Backend', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class Backend(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def Health(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Predict(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def LoadModel(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
backend__pb2.ModelOptions.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def PredictStream(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Embedding(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.EmbeddingResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def GenerateImage(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
backend__pb2.GenerateImageRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def AudioTranscription(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
backend__pb2.TranscriptRequest.SerializeToString,
backend__pb2.TranscriptResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TTS(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
backend__pb2.TTSRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TokenizeString(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.TokenizationResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Status(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.StatusResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

View File

@@ -1,14 +0,0 @@
#!/bin/bash
##
## A bash script installs the required dependencies of VALL-E-X and prepares the environment
export PATH=$PATH:/opt/conda/bin
# Activate conda environment
source activate ttsvalle
echo $CONDA_PREFIX
git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && pip install -r requirements.txt && popd
cp -rfv $CONDA_PREFIX/vall-e-x/* ./

View File

@@ -1,81 +0,0 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "ttsvalle.py", "--addr", "localhost:50051"])
time.sleep(10)
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.terminate()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="dingzhen"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_tts(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="dingzhen"))
self.assertTrue(response.success)
tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story")
tts_response = stub.TTS(tts_request)
self.assertIsNotNone(tts_response)
except Exception as err:
print(err)
self.fail("TTS service failed")
finally:
self.tearDown()

View File

@@ -1,11 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the ttsvalle server with conda
# Activate conda environment
source activate ttsvalle
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python -m unittest $DIR/test.py

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,363 +0,0 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import backend_pb2 as backend__pb2
class BackendStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.Health = channel.unary_unary(
'/backend.Backend/Health',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Predict = channel.unary_unary(
'/backend.Backend/Predict',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.LoadModel = channel.unary_unary(
'/backend.Backend/LoadModel',
request_serializer=backend__pb2.ModelOptions.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.PredictStream = channel.unary_stream(
'/backend.Backend/PredictStream',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.Reply.FromString,
)
self.Embedding = channel.unary_unary(
'/backend.Backend/Embedding',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.EmbeddingResult.FromString,
)
self.GenerateImage = channel.unary_unary(
'/backend.Backend/GenerateImage',
request_serializer=backend__pb2.GenerateImageRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.AudioTranscription = channel.unary_unary(
'/backend.Backend/AudioTranscription',
request_serializer=backend__pb2.TranscriptRequest.SerializeToString,
response_deserializer=backend__pb2.TranscriptResult.FromString,
)
self.TTS = channel.unary_unary(
'/backend.Backend/TTS',
request_serializer=backend__pb2.TTSRequest.SerializeToString,
response_deserializer=backend__pb2.Result.FromString,
)
self.TokenizeString = channel.unary_unary(
'/backend.Backend/TokenizeString',
request_serializer=backend__pb2.PredictOptions.SerializeToString,
response_deserializer=backend__pb2.TokenizationResponse.FromString,
)
self.Status = channel.unary_unary(
'/backend.Backend/Status',
request_serializer=backend__pb2.HealthMessage.SerializeToString,
response_deserializer=backend__pb2.StatusResponse.FromString,
)
class BackendServicer(object):
"""Missing associated documentation comment in .proto file."""
def Health(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Predict(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def LoadModel(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def PredictStream(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Embedding(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GenerateImage(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def AudioTranscription(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TTS(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def TokenizeString(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def Status(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_BackendServicer_to_server(servicer, server):
rpc_method_handlers = {
'Health': grpc.unary_unary_rpc_method_handler(
servicer.Health,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Predict': grpc.unary_unary_rpc_method_handler(
servicer.Predict,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'LoadModel': grpc.unary_unary_rpc_method_handler(
servicer.LoadModel,
request_deserializer=backend__pb2.ModelOptions.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'PredictStream': grpc.unary_stream_rpc_method_handler(
servicer.PredictStream,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.Reply.SerializeToString,
),
'Embedding': grpc.unary_unary_rpc_method_handler(
servicer.Embedding,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.EmbeddingResult.SerializeToString,
),
'GenerateImage': grpc.unary_unary_rpc_method_handler(
servicer.GenerateImage,
request_deserializer=backend__pb2.GenerateImageRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'AudioTranscription': grpc.unary_unary_rpc_method_handler(
servicer.AudioTranscription,
request_deserializer=backend__pb2.TranscriptRequest.FromString,
response_serializer=backend__pb2.TranscriptResult.SerializeToString,
),
'TTS': grpc.unary_unary_rpc_method_handler(
servicer.TTS,
request_deserializer=backend__pb2.TTSRequest.FromString,
response_serializer=backend__pb2.Result.SerializeToString,
),
'TokenizeString': grpc.unary_unary_rpc_method_handler(
servicer.TokenizeString,
request_deserializer=backend__pb2.PredictOptions.FromString,
response_serializer=backend__pb2.TokenizationResponse.SerializeToString,
),
'Status': grpc.unary_unary_rpc_method_handler(
servicer.Status,
request_deserializer=backend__pb2.HealthMessage.FromString,
response_serializer=backend__pb2.StatusResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'backend.Backend', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class Backend(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def Health(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Health',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Predict(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Predict',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def LoadModel(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/LoadModel',
backend__pb2.ModelOptions.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def PredictStream(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_stream(request, target, '/backend.Backend/PredictStream',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.Reply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Embedding(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Embedding',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.EmbeddingResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def GenerateImage(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/GenerateImage',
backend__pb2.GenerateImageRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def AudioTranscription(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/AudioTranscription',
backend__pb2.TranscriptRequest.SerializeToString,
backend__pb2.TranscriptResult.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TTS(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TTS',
backend__pb2.TTSRequest.SerializeToString,
backend__pb2.Result.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def TokenizeString(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/TokenizeString',
backend__pb2.PredictOptions.SerializeToString,
backend__pb2.TokenizationResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
@staticmethod
def Status(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(request, target, '/backend.Backend/Status',
backend__pb2.HealthMessage.SerializeToString,
backend__pb2.StatusResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

View File

@@ -1,11 +0,0 @@
#!/bin/bash
##
## A bash script wrapper that runs the transformers server with conda
# Activate conda environment
source activate vllm
# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
python -m unittest $DIR/test_backend_vllm.py

View File

@@ -1,76 +0,0 @@
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
import unittest
import subprocess
import time
import grpc
import backend_pb2_grpc
import backend_pb2
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service.
This class contains methods to test the startup and shutdown of the gRPC service.
"""
def setUp(self):
self.service = subprocess.Popen(["python", "backend_vllm.py", "--addr", "localhost:50051"])
time.sleep(10)
def tearDown(self) -> None:
self.service.terminate()
self.service.wait()
def test_server_startup(self):
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_text(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
req = backend_pb2.PredictOptions(prompt="The capital of France is")
resp = stub.Predict(req)
self.assertIsNotNone(resp.message)
except Exception as err:
print(err)
self.fail("text service failed")
finally:
self.tearDown()

View File

@@ -5,6 +5,7 @@ package main
import (
"flag"
bert "github.com/go-skynet/LocalAI/pkg/backend/llm/bert"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
@@ -15,7 +16,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &StableDiffusion{}); err != nil {
if err := grpc.StartServer(*addr, &bert.Embeddings{}); err != nil {
panic(err)
}
}

View File

@@ -5,7 +5,7 @@ package main
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

View File

@@ -5,7 +5,7 @@ package main
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

View File

@@ -5,7 +5,7 @@ package main
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

View File

@@ -5,6 +5,8 @@ package main
import (
"flag"
gpt4all "github.com/go-skynet/LocalAI/pkg/backend/llm/gpt4all"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
@@ -15,7 +17,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
if err := grpc.StartServer(*addr, &gpt4all.LLM{}); err != nil {
panic(err)
}
}

View File

@@ -5,7 +5,7 @@ package main
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

View File

@@ -5,7 +5,7 @@ package main
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

View File

@@ -0,0 +1,23 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
langchain "github.com/go-skynet/LocalAI/pkg/backend/llm/langchain"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &langchain.LLM{}); err != nil {
panic(err)
}
}

View File

@@ -3,6 +3,8 @@ package main
import (
"flag"
llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
@@ -13,7 +15,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
panic(err)
}
}

25
cmd/grpc/llama/main.go Normal file
View File

@@ -0,0 +1,25 @@
package main
// GRPC Falcon server
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
panic(err)
}
}

View File

@@ -5,7 +5,7 @@ package main
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

View File

@@ -1,12 +1,12 @@
package main
// GRPC Falcon server
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
tts "github.com/go-skynet/LocalAI/pkg/backend/tts"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
@@ -17,7 +17,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
if err := grpc.StartServer(*addr, &tts.Piper{}); err != nil {
panic(err)
}
}

View File

@@ -5,7 +5,7 @@ package main
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

View File

@@ -5,6 +5,8 @@ package main
import (
"flag"
rwkv "github.com/go-skynet/LocalAI/pkg/backend/llm/rwkv"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
@@ -15,7 +17,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
if err := grpc.StartServer(*addr, &rwkv.LLM{}); err != nil {
panic(err)
}
}

View File

@@ -5,6 +5,8 @@ package main
import (
"flag"
image "github.com/go-skynet/LocalAI/pkg/backend/image"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
@@ -15,7 +17,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
if err := grpc.StartServer(*addr, &image.StableDiffusion{}); err != nil {
panic(err)
}
}

View File

@@ -5,7 +5,7 @@ package main
import (
"flag"
transformers "github.com/go-skynet/LocalAI/backend/go/llm/transformers"
transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)

23
cmd/grpc/whisper/main.go Normal file
View File

@@ -0,0 +1,23 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
transcribe "github.com/go-skynet/LocalAI/pkg/backend/transcribe"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &transcribe.Whisper{}); err != nil {
panic(err)
}
}

View File

@@ -1,4 +0,0 @@
FROM klakegg/hugo:ext-alpine
RUN apk add git && \
git config --global --add safe.directory /src

Some files were not shown because too many files have changed in this diff Show More