Compare commits

..

2 Commits

Author SHA1 Message Date
Ettore Di Giacinto
63c5d843b6 chore(gosec): fix CI
downgrade to latest known version of the gosec action

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-13 19:17:27 +02:00
Ettore Di Giacinto
a9b0e264f2 chore(exllama): drop exllama backend
For polishing and cleaning up, it now makes sense to drop exllama, which
is completely unmaintained and only supported the llamav1
architecture (nowadays superseded by llamav2).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-09-13 19:09:43 +02:00
196 changed files with 1184 additions and 4335 deletions

View File

@@ -9,7 +9,6 @@
# Param 2: email # Param 2: email
# #
config_user() { config_user() {
echo "Configuring git for $1 <$2>"
local gcn=$(git config --global user.name) local gcn=$(git config --global user.name)
if [ -z "${gcn}" ]; then if [ -z "${gcn}" ]; then
echo "Setting up git user / remote" echo "Setting up git user / remote"
@@ -25,7 +24,6 @@ config_user() {
# Param 2: remote url # Param 2: remote url
# #
config_remote() { config_remote() {
echo "Adding git remote and fetching $2 as $1"
local gr=$(git remote -v | grep $1) local gr=$(git remote -v | grep $1)
if [ -z "${gr}" ]; then if [ -z "${gr}" ]; then
git remote add $1 $2 git remote add $1 $2

View File

@@ -29,14 +29,9 @@ def calculate_sha256(file_path):
def manual_safety_check_hf(repo_id): def manual_safety_check_hf(repo_id):
scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan") scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
scan = scanResponse.json() scan = scanResponse.json()
# Check if 'hasUnsafeFile' exists in the response if scan['hasUnsafeFile']:
if 'hasUnsafeFile' in scan: return scan
if scan['hasUnsafeFile']: return None
return scan
else:
return None
else:
return None
download_type, repo_id_or_url = parse_uri(uri) download_type, repo_id_or_url = parse_uri(uri)

View File

@@ -6,7 +6,6 @@ import (
"io/ioutil" "io/ioutil"
"os" "os"
"github.com/microcosm-cc/bluemonday"
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
) )
@@ -280,12 +279,6 @@ func main() {
return return
} }
// Ensure that all arbitrary text content is sanitized before display
for i, m := range models {
models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
}
// render the template // render the template
data := struct { data := struct {
Models []*GalleryModel Models []*GalleryModel

View File

@@ -9,8 +9,6 @@ updates:
directory: "/" directory: "/"
schedule: schedule:
interval: "weekly" interval: "weekly"
ignore:
- dependency-name: "github.com/mudler/LocalAI/pkg/grpc/proto"
- package-ecosystem: "github-actions" - package-ecosystem: "github-actions"
# Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.) # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
directory: "/" directory: "/"

View File

@@ -33,7 +33,7 @@ jobs:
run: | run: |
CGO_ENABLED=0 make build-api CGO_ENABLED=0 make build-api
- name: rm - name: rm
uses: appleboy/ssh-action@v1.1.0 uses: appleboy/ssh-action@v1.0.3
with: with:
host: ${{ secrets.EXPLORER_SSH_HOST }} host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }} username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
rm: true rm: true
target: ./local-ai target: ./local-ai
- name: restarting - name: restarting
uses: appleboy/ssh-action@v1.1.0 uses: appleboy/ssh-action@v1.0.3
with: with:
host: ${{ secrets.EXPLORER_SSH_HOST }} host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }} username: ${{ secrets.EXPLORER_SSH_USERNAME }}

View File

@@ -13,78 +13,6 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
hipblas-jobs:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: 2
matrix:
include:
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
self-hosted-jobs: self-hosted-jobs:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
with: with:
@@ -111,7 +39,7 @@ jobs:
strategy: strategy:
# Pushing with all jobs in parallel # Pushing with all jobs in parallel
# eats the bandwidth of all the nodes # eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }} max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
matrix: matrix:
include: include:
# Extra images # Extra images
@@ -194,6 +122,29 @@ jobs:
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16' - build-type: 'sycl_f16'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'auto' tag-latest: 'auto'
@@ -261,6 +212,26 @@ jobs:
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
core-image-build: core-image-build:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml

View File

@@ -79,7 +79,7 @@ jobs:
args: ${{ steps.summarize.outputs.message }} args: ${{ steps.summarize.outputs.message }}
- name: Setup tmate session if fails - name: Setup tmate session if fails
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -161,7 +161,7 @@ jobs:
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
- name: Setup tmate session if fails - name: Setup tmate session if fails
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180

View File

@@ -123,7 +123,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -232,7 +232,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -308,7 +308,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -350,7 +350,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180

View File

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }} if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner - name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }} if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.21.4 uses: securego/gosec@v2.21.0
with: with:
# we let the report trigger content trigger a failure using the GitHub Security features. # we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...' args: '-no-fail -fmt sarif -out results.sarif ./...'

View File

@@ -105,14 +105,6 @@ jobs:
tests-parler-tts: tests-parler-tts:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Clone - name: Clone
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:

View File

@@ -133,7 +133,7 @@ jobs:
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -178,26 +178,17 @@ jobs:
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
submodules: true submodules: true
- name: Dependencies
run: |
# Install protoc
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images - name: Build images
run: | run: |
docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile . docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
- name: Test - name: Test
run: | run: |
PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \ LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
make run-e2e-aio make run-e2e-aio
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -235,7 +226,7 @@ jobs:
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180

View File

@@ -15,6 +15,8 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
- [Documentation](#documentation) - [Documentation](#documentation)
- [Community and Communication](#community-and-communication) - [Community and Communication](#community-and-communication)
## Getting Started ## Getting Started
### Prerequisites ### Prerequisites
@@ -52,7 +54,7 @@ If you find a bug, have a feature request, or encounter any issues, please check
## Coding Guidelines ## Coding Guidelines
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here. - No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
## Testing ## Testing
@@ -82,3 +84,5 @@ We are welcome the contribution of the documents, please open new PR or create a
- You can reach out via the Github issue tracker. - You can reach out via the Github issue tracker.
- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions) - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy) - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
---

View File

@@ -9,8 +9,6 @@ FROM ${BASE_IMAGE} AS requirements-core
USER root USER root
ARG GO_VERSION=1.22.6 ARG GO_VERSION=1.22.6
ARG CMAKE_VERSION=3.26.4
ARG CMAKE_FROM_SOURCE=false
ARG TARGETARCH ARG TARGETARCH
ARG TARGETVARIANT ARG TARGETVARIANT
@@ -23,25 +21,13 @@ RUN apt-get update && \
build-essential \ build-essential \
ccache \ ccache \
ca-certificates \ ca-certificates \
curl libssl-dev \ cmake \
curl \
git \ git \
unzip upx-ucl && \ unzip upx-ucl && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# Install Go # Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
@@ -202,8 +188,6 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI # This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
ARG GRPC_MAKEFLAGS="-j4 -Otarget" ARG GRPC_MAKEFLAGS="-j4 -Otarget"
ARG GRPC_VERSION=v1.65.0 ARG GRPC_VERSION=v1.65.0
ARG CMAKE_FROM_SOURCE=false
ARG CMAKE_VERSION=3.26.4
ENV MAKEFLAGS=${GRPC_MAKEFLAGS} ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
@@ -212,24 +196,12 @@ WORKDIR /build
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
ca-certificates \ ca-certificates \
build-essential curl libssl-dev \ build-essential \
cmake \
git && \ git && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later # We install GRPC to a different prefix here so that we can copy in only the build artifacts later
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree # saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
# and running make install in the target container # and running make install in the target container
@@ -325,10 +297,10 @@ COPY .git .
RUN make prepare RUN make prepare
## Build the binary ## Build the binary
## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space ## If it's CUDA, we want to skip some of the llama-compat backends to save space
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build ## We only leave the most CPU-optimized variant and the fallback for the cublas build
## (both will use CUDA or hipblas for the actual computation) ## (both will use CUDA for the actual computation)
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \ RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \ SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \ else \
make build; \ make build; \
@@ -366,8 +338,9 @@ RUN if [ "${FFMPEG}" = "true" ]; then \
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
ssh less wget ssh less && \
# For the devcontainer, leave apt functional in case additional devtools are needed at runtime. apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN go install github.com/go-delve/delve/cmd/dlv@latest RUN go install github.com/go-delve/delve/cmd/dlv@latest

View File

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions # llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=0a1c750c80147687df267114c81956757cc14382 CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1
# go-rwkv version # go-rwkv version
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
# whisper.cpp version # whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
WHISPER_CPP_VERSION?=0fbaac9c891055796456df7b9122a70c220f9ca1 WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
# bert.cpp version # bert.cpp version
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -359,9 +359,6 @@ clean-tests:
rm -rf test-dir rm -rf test-dir
rm -rf core/http/backend-assets rm -rf core/http/backend-assets
clean-dc: clean
cp -r /build/backend-assets /workspace/backend-assets
## Build: ## Build:
build: prepare backend-assets grpcs ## Build the project build: prepare backend-assets grpcs ## Build the project
$(info ${GREEN}I local-ai build info:${RESET}) $(info ${GREEN}I local-ai build info:${RESET})
@@ -468,15 +465,15 @@ run-e2e-image:
ls -liah $(abspath ./tests/e2e-fixtures) ls -liah $(abspath ./tests/e2e-fixtures)
docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
run-e2e-aio: protogen-go run-e2e-aio:
@echo 'Running e2e AIO tests' @echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
test-e2e: test-e2e:
@echo 'Running e2e tests' @echo 'Running e2e tests'
BUILD_TYPE=$(BUILD_TYPE) \ BUILD_TYPE=$(BUILD_TYPE) \
LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \ LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
teardown-e2e: teardown-e2e:
rm -rf $(TEST_DIR) || true rm -rf $(TEST_DIR) || true
@@ -484,24 +481,24 @@ teardown-e2e:
test-llama: prepare-test test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS) $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS) $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
test-tts: prepare-test test-tts: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS) $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
test-stablediffusion: prepare-test test-stablediffusion: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS) $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
test-stores: backend-assets/grpc/local-store test-stores: backend-assets/grpc/local-store
mkdir -p tests/integration/backend-assets/grpc mkdir -p tests/integration/backend-assets/grpc
cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/ cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
test-container: test-container:
docker build --target requirements -t local-ai-test-container . docker build --target requirements -t local-ai-test-container .

View File

@@ -66,24 +66,11 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12 # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
``` ```
To load models:
```bash
# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
local-ai run llama-3.2-1b-instruct:q4_k_m
# Start LocalAI with the phi-2 model directly from huggingface
local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
# Install and run a model from the Ollama OCI registry
local-ai run ollama://gemma:2b
# Run a model from a configuration file
local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
# Install and run a model from a standard OCI registry (e.g., Docker Hub)
local-ai run oci://localai/phi-2:latest
```
[💻 Getting started](https://localai.io/basics/getting_started/index.html) [💻 Getting started](https://localai.io/basics/getting_started/index.html)
## 📰 Latest project news ## 🔥🔥 Hot topics / Roadmap
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io) - Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
@@ -96,12 +83,8 @@ local-ai run oci://localai/phi-2:latest
- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222 - May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121 - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) Hot topics (looking for contributors):
## 🔥🔥 Hot topics (looking for help):
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113 - 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126 - Backends v2: https://github.com/mudler/LocalAI/issues/1126

View File

@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true f16: true
mmap: true mmap: true
name: gpt-4o name: gpt-4-vision-preview
roles: roles:
user: "USER:" user: "USER:"

View File

@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true f16: true
mmap: true mmap: true
name: gpt-4o name: gpt-4-vision-preview
roles: roles:
user: "USER:" user: "USER:"

View File

@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096 context_size: 4096
mmap: false mmap: false
f16: false f16: false
name: gpt-4o name: gpt-4-vision-preview
roles: roles:
user: "USER:" user: "USER:"

View File

@@ -26,19 +26,6 @@ service Backend {
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {} rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
rpc Rerank(RerankRequest) returns (RerankResult) {} rpc Rerank(RerankRequest) returns (RerankResult) {}
rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
}
// Define the empty request
message MetricsRequest {}
message MetricsResponse {
int32 slot_id = 1;
string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
float tokens_per_second = 3;
int32 tokens_generated = 4;
int32 prompt_tokens_processed = 5;
} }
message RerankRequest { message RerankRequest {
@@ -147,9 +134,6 @@ message PredictOptions {
repeated string Images = 42; repeated string Images = 42;
bool UseTokenizerTemplate = 43; bool UseTokenizerTemplate = 43;
repeated Message Messages = 44; repeated Message Messages = 44;
repeated string Videos = 45;
repeated string Audios = 46;
string CorrelationId = 47;
} }
// The response message containing the result // The response message containing the result
@@ -219,7 +203,6 @@ message ModelOptions {
int32 SwapSpace = 53; int32 SwapSpace = 53;
int32 MaxModelLen = 54; int32 MaxModelLen = 54;
int32 TensorParallelSize = 55; int32 TensorParallelSize = 55;
string LoadFormat = 58;
string MMProj = 41; string MMProj = 41;

View File

@@ -13,7 +13,6 @@
#include <getopt.h> #include <getopt.h>
#include "clip.h" #include "clip.h"
#include "llava.h" #include "llava.h"
#include "log.h"
#include "stb_image.h" #include "stb_image.h"
#include "common.h" #include "common.h"
#include "json.hpp" #include "json.hpp"
@@ -113,7 +112,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
std::string ret; std::string ret;
for (; begin != end; ++begin) for (; begin != end; ++begin)
{ {
ret += common_token_to_piece(ctx, *begin); ret += llama_token_to_piece(ctx, *begin);
} }
return ret; return ret;
} }
@@ -121,7 +120,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
// format incomplete utf-8 multibyte character for output // format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{ {
std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
// if the size is 1 and first bit is 1, meaning it's a partial character // if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token) // (size > 1 meaning it's already a known token)
if (out.size() == 1 && (out[0] & 0x80) == 0x80) if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -203,8 +202,8 @@ struct llama_client_slot
std::string stopping_word; std::string stopping_word;
// sampling // sampling
struct common_sampler_params sparams; struct gpt_sampler_params sparams;
common_sampler *ctx_sampling = nullptr; gpt_sampler *ctx_sampling = nullptr;
int32_t ga_i = 0; // group-attention state int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor int32_t ga_n = 1; // group-attention factor
@@ -257,7 +256,7 @@ struct llama_client_slot
images.clear(); images.clear();
} }
bool has_budget(common_params &global_params) { bool has_budget(gpt_params &global_params) {
if (params.n_predict == -1 && global_params.n_predict == -1) if (params.n_predict == -1 && global_params.n_predict == -1)
{ {
return true; // limitless return true; // limitless
@@ -391,39 +390,6 @@ struct llama_metrics {
} }
}; };
// Bundles the per-token arrays required to decode a run of raw embeddings
// (no token ids) together with a llama_batch that points at those arrays.
// Every entry is assigned to the single sequence `seq_id`, positions count up
// from `pos_0`, and no entry requests logits.
// NOTE(review): `batch` holds raw pointers into this object's own vectors, so a
// copy of the struct would still reference the source's storage — confirm that
// callers only ever use the instance in place.
struct llava_embd_batch {
    std::vector<llama_pos>      pos;
    std::vector<int32_t>        n_seq_id;
    std::vector<llama_seq_id>   seq_id_0;
    std::vector<llama_seq_id *> seq_ids;
    std::vector<int8_t>         logits;
    llama_batch                 batch;

    // embd     : embedding data handed to the batch as-is (not copied here)
    // n_tokens : number of embedding entries described by the batch
    // pos_0    : position given to the first entry
    // seq_id   : sequence id shared by every entry
    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id)
        : pos(n_tokens),
          n_seq_id(n_tokens),
          seq_id_0(1),
          seq_ids(n_tokens + 1),
          logits(n_tokens) {
        seq_id_0[0] = seq_id;

        // Fill the backing arrays directly; the batch below only views them.
        for (int idx = 0; idx < n_tokens; ++idx) {
            pos[idx]      = pos_0 + idx;
            n_seq_id[idx] = 1;
            seq_ids[idx]  = seq_id_0.data();
            logits[idx]   = false;
        }
        // One extra trailing slot, kept as an explicit null terminator.
        seq_ids[n_tokens] = nullptr;

        batch = {
            /*n_tokens =*/ n_tokens,
            /*tokens   =*/ nullptr,
            /*embd     =*/ embd,
            /*pos      =*/ pos.data(),
            /*n_seq_id =*/ n_seq_id.data(),
            /*seq_id   =*/ seq_ids.data(),
            /*logits   =*/ logits.data(),
        };
    }
};
struct llama_server_context struct llama_server_context
{ {
llama_model *model = nullptr; llama_model *model = nullptr;
@@ -431,7 +397,7 @@ struct llama_server_context
clip_ctx *clp_ctx = nullptr; clip_ctx *clp_ctx = nullptr;
common_params params; gpt_params params;
llama_batch batch; llama_batch batch;
@@ -474,7 +440,7 @@ struct llama_server_context
} }
} }
bool load_model(const common_params &params_) bool load_model(const gpt_params &params_)
{ {
params = params_; params = params_;
if (!params.mmproj.empty()) { if (!params.mmproj.empty()) {
@@ -482,7 +448,7 @@ struct llama_server_context
LOG_INFO("Multi Modal Mode Enabled", {}); LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) { if(clp_ctx == nullptr) {
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str()); LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
return false; return false;
} }
@@ -491,12 +457,12 @@ struct llama_server_context
} }
} }
common_init_result common_init = common_init_from_params(params); llama_init_result llama_init = llama_init_from_gpt_params(params);
model = common_init.model; model = llama_init.model;
ctx = common_init.context; ctx = llama_init.context;
if (model == nullptr) if (model == nullptr)
{ {
LOG_ERR("unable to load model: %s", params.model.c_str()); LOG_ERROR("unable to load model", {{"model", params.model}});
return false; return false;
} }
@@ -504,7 +470,7 @@ struct llama_server_context
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
const int n_embd_llm = llama_n_embd(model); const int n_embd_llm = llama_n_embd(model);
if (n_embd_clip != n_embd_llm) { if (n_embd_clip != n_embd_llm) {
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm); LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
return false; return false;
@@ -523,21 +489,11 @@ struct llama_server_context
std::vector<char> buf(1); std::vector<char> buf(1);
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size()); int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) { if (res < 0) {
LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__); LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
} }
} }
// Returns a pointer to the first slot whose is_processing() is true,
// or nullptr when no slot reports that it is processing.
llama_client_slot* get_active_slot() {
    for (auto it = slots.begin(); it != slots.end(); ++it) {
        if (!it->is_processing()) {
            continue; // idle slot, keep scanning
        }
        return &*it;
    }
    return nullptr;
}
void initialize() { void initialize() {
// create slots // create slots
all_slots_are_idle = true; all_slots_are_idle = true;
@@ -611,12 +567,12 @@ struct llama_server_context
std::vector<llama_token> p; std::vector<llama_token> p;
if (first) if (first)
{ {
p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
first = false; first = false;
} }
else else
{ {
p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
} }
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
} }
@@ -633,7 +589,7 @@ struct llama_server_context
else else
{ {
auto s = json_prompt.template get<std::string>(); auto s = json_prompt.template get<std::string>();
prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
} }
return prompt_tokens; return prompt_tokens;
@@ -662,7 +618,7 @@ struct llama_server_context
bool launch_slot_with_data(llama_client_slot* &slot, json data) { bool launch_slot_with_data(llama_client_slot* &slot, json data) {
slot_params default_params; slot_params default_params;
common_sampler_params default_sparams; gpt_sampler_params default_sparams;
slot->params.stream = json_value(data, "stream", false); slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false);
@@ -802,7 +758,7 @@ struct llama_server_context
} }
else if (el[0].is_string()) else if (el[0].is_string())
{ {
auto toks = common_tokenize(model, el[0].get<std::string>(), false); auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks) for (auto tok : toks)
{ {
slot->sparams.logit_bias.push_back({tok, bias}); slot->sparams.logit_bias.push_back({tok, bias});
@@ -834,7 +790,7 @@ struct llama_server_context
sampler_names.emplace_back(name); sampler_names.emplace_back(name);
} }
} }
slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false); slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
} }
else else
{ {
@@ -856,11 +812,10 @@ struct llama_server_context
img_sl.img_data = clip_image_u8_init(); img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{ {
LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", LOG_ERROR("failed to load image", {
__func__, {"slot_id", slot->id},
slot->id, {"img_sl_id", img_sl.id}
img_sl.id });
);
return false; return false;
} }
LOG_VERBOSE("image loaded", { LOG_VERBOSE("image loaded", {
@@ -898,12 +853,12 @@ struct llama_server_context
} }
} }
if (!found) { if (!found) {
LOG("ERROR: Image with id: %i, not found.\n", img_id); LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
slot->images.clear(); slot->images.clear();
return false; return false;
} }
} catch (const std::invalid_argument& e) { } catch (const std::invalid_argument& e) {
LOG("Invalid image number id in prompt\n"); LOG_TEE("Invalid image number id in prompt\n");
slot->images.clear(); slot->images.clear();
return false; return false;
} }
@@ -918,9 +873,9 @@ struct llama_server_context
if (slot->ctx_sampling != nullptr) if (slot->ctx_sampling != nullptr)
{ {
common_sampler_free(slot->ctx_sampling); gpt_sampler_free(slot->ctx_sampling);
} }
slot->ctx_sampling = common_sampler_init(model, slot->sparams); slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
//llama_set_rng_seed(ctx, slot->params.seed); //llama_set_rng_seed(ctx, slot->params.seed);
slot->command = LOAD_PROMPT; slot->command = LOAD_PROMPT;
@@ -931,7 +886,7 @@ struct llama_server_context
{"task_id", slot->task_id}, {"task_id", slot->task_id},
}); });
// LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str()); // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
return true; return true;
} }
@@ -947,13 +902,13 @@ struct llama_server_context
system_tokens.clear(); system_tokens.clear();
if (!system_prompt.empty()) { if (!system_prompt.empty()) {
system_tokens = common_tokenize(ctx, system_prompt, add_bos_token); system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
common_batch_clear(batch); llama_batch_clear(batch);
for (int i = 0; i < (int)system_tokens.size(); ++i) for (int i = 0; i < (int)system_tokens.size(); ++i)
{ {
common_batch_add(batch, system_tokens[i], i, { 0 }, false); llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
} }
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
@@ -967,10 +922,11 @@ struct llama_server_context
batch.n_seq_id + i, batch.n_seq_id + i,
batch.seq_id + i, batch.seq_id + i,
batch.logits + i, batch.logits + i,
0, 0, 0, // unused
}; };
if (llama_decode(ctx, batch_view) != 0) if (llama_decode(ctx, batch_view) != 0)
{ {
LOG("%s: llama_decode() failed\n", __func__); LOG_TEE("%s: llama_decode() failed\n", __func__);
return; return;
} }
} }
@@ -982,7 +938,7 @@ struct llama_server_context
} }
} }
LOG("system prompt updated\n"); LOG_TEE("system prompt updated\n");
system_need_update = false; system_need_update = false;
} }
@@ -1041,7 +997,7 @@ struct llama_server_context
bool process_token(completion_token_output &result, llama_client_slot &slot) { bool process_token(completion_token_output &result, llama_client_slot &slot) {
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = common_token_to_piece(ctx, result.tok); const std::string token_str = llama_token_to_piece(ctx, result.tok);
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it
@@ -1164,7 +1120,7 @@ struct llama_server_context
} }
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG("Error processing the given image"); LOG_TEE("Error processing the given image");
return false; return false;
} }
@@ -1176,7 +1132,7 @@ struct llama_server_context
void send_error(task_server& task, const std::string &error) void send_error(task_server& task, const std::string &error)
{ {
LOG("task %i - error: %s\n", task.id, error.c_str()); LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
task_result res; task_result res;
res.id = task.id; res.id = task.id;
res.multitask_id = task.multitask_id; res.multitask_id = task.multitask_id;
@@ -1192,7 +1148,7 @@ struct llama_server_context
samplers.reserve(slot.sparams.samplers.size()); samplers.reserve(slot.sparams.samplers.size());
for (const auto & sampler : slot.sparams.samplers) for (const auto & sampler : slot.sparams.samplers)
{ {
samplers.emplace_back(common_sampler_type_to_str(sampler)); samplers.emplace_back(gpt_sampler_type_to_str(sampler));
} }
return json { return json {
@@ -1248,7 +1204,7 @@ struct llama_server_context
if (slot.sparams.n_probs > 0) if (slot.sparams.n_probs > 0)
{ {
std::vector<completion_token_output> probs_output = {}; std::vector<completion_token_output> probs_output = {};
const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size()); size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
if (probs_pos < probs_stop_pos) if (probs_pos < probs_stop_pos)
@@ -1300,7 +1256,7 @@ struct llama_server_context
std::vector<completion_token_output> probs = {}; std::vector<completion_token_output> probs = {};
if (!slot.params.stream && slot.stopped_word) if (!slot.params.stream && slot.stopped_word)
{ {
const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size()); probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
} }
else else
@@ -1411,10 +1367,11 @@ struct llama_server_context
batch.n_seq_id + i, batch.n_seq_id + i,
batch.seq_id + i, batch.seq_id + i,
batch.logits + i, batch.logits + i,
0, 0, 0, // unused
}; };
if (llama_decode(ctx, batch_view)) if (llama_decode(ctx, batch_view))
{ {
LOG("%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
return false; return false;
} }
} }
@@ -1429,18 +1386,17 @@ struct llama_server_context
} }
const int n_embd = llama_n_embd(model); const int n_embd = llama_n_embd(model);
float * embd = img.image_embedding + i * n_embd; llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0); if (llama_decode(ctx, batch_img))
if (llama_decode(ctx, llava_batch.batch))
{ {
LOG("%s : failed to eval image\n", __func__); LOG_TEE("%s : failed to eval image\n", __func__);
return false; return false;
} }
slot.n_past += n_eval; slot.n_past += n_eval;
} }
image_idx++; image_idx++;
common_batch_clear(batch); llama_batch_clear(batch);
// append prefix of next image // append prefix of next image
const auto json_prompt = (image_idx >= (int) slot.images.size()) ? const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
@@ -1450,7 +1406,7 @@ struct llama_server_context
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
for (int i = 0; i < (int) append_tokens.size(); ++i) for (int i = 0; i < (int) append_tokens.size(); ++i)
{ {
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true); llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
slot.n_past += 1; slot.n_past += 1;
} }
} }
@@ -1582,7 +1538,7 @@ struct llama_server_context
update_system_prompt(); update_system_prompt();
} }
common_batch_clear(batch); llama_batch_clear(batch);
if (all_slots_are_idle) if (all_slots_are_idle)
{ {
@@ -1616,7 +1572,7 @@ struct llama_server_context
slot.n_past = 0; slot.n_past = 0;
slot.truncated = false; slot.truncated = false;
slot.has_next_token = true; slot.has_next_token = true;
LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
continue; continue;
// END LOCALAI changes // END LOCALAI changes
@@ -1660,7 +1616,7 @@ struct llama_server_context
// TODO: we always have to take into account the "system_tokens" // TODO: we always have to take into account the "system_tokens"
// this is not great and needs to be improved somehow // this is not great and needs to be improved somehow
common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
slot.n_past += 1; slot.n_past += 1;
} }
@@ -1754,7 +1710,7 @@ struct llama_server_context
if (!slot.params.cache_prompt) if (!slot.params.cache_prompt)
{ {
common_sampler_reset(slot.ctx_sampling); gpt_sampler_reset(slot.ctx_sampling);
slot.n_past = 0; slot.n_past = 0;
slot.n_past_se = 0; slot.n_past_se = 0;
@@ -1766,7 +1722,7 @@ struct llama_server_context
// push the prompt into the sampling context (do not apply grammar) // push the prompt into the sampling context (do not apply grammar)
for (auto &token : prompt_tokens) for (auto &token : prompt_tokens)
{ {
common_sampler_accept(slot.ctx_sampling, token, false); gpt_sampler_accept(slot.ctx_sampling, token, false);
} }
slot.n_past = common_part(slot.cache_tokens, prompt_tokens); slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1858,17 +1814,16 @@ struct llama_server_context
ga_i += ga_w/ga_n; ga_i += ga_w/ga_n;
} }
} }
common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
slot_npast++; slot_npast++;
} }
if (has_images && !ingest_images(slot, n_batch)) if (has_images && !ingest_images(slot, n_batch))
{ {
LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", LOG_ERROR("failed processing images", {
__func__, "slot_id", slot.id,
slot.id, "task_id", slot.task_id,
slot.task_id });
);
// FIXME @phymbert: to be properly tested // FIXME @phymbert: to be properly tested
// early returning without changing the slot state will block the slot for ever // early returning without changing the slot state will block the slot for ever
// no one at the moment is checking the return value // no one at the moment is checking the return value
@@ -1908,10 +1863,10 @@ struct llama_server_context
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
LOG("\n"); LOG_TEE("\n");
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1921,7 +1876,7 @@ struct llama_server_context
slot.ga_i += slot.ga_w / slot.ga_n; slot.ga_i += slot.ga_w / slot.ga_n;
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
} }
slot.n_past_se += n_tokens; slot.n_past_se += n_tokens;
} }
@@ -1936,6 +1891,7 @@ struct llama_server_context
batch.n_seq_id + i, batch.n_seq_id + i,
batch.seq_id + i, batch.seq_id + i,
batch.logits + i, batch.logits + i,
0, 0, 0, // unused
}; };
const int ret = llama_decode(ctx, batch_view); const int ret = llama_decode(ctx, batch_view);
@@ -1945,11 +1901,11 @@ struct llama_server_context
if (n_batch == 1 || ret < 0) if (n_batch == 1 || ret < 0)
{ {
// if you get here, it means the KV cache is full - try increasing it via the context size // if you get here, it means the KV cache is full - try increasing it via the context size
LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return false; return false;
} }
LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
// retry with half the batch size to try to find a free slot in the KV cache // retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2; n_batch /= 2;
@@ -1974,9 +1930,9 @@ struct llama_server_context
} }
completion_token_output result; completion_token_output result;
const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i); const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
common_sampler_accept(slot.ctx_sampling, id, true); gpt_sampler_accept(slot.ctx_sampling, id, true);
slot.n_decoded += 1; slot.n_decoded += 1;
if (slot.n_decoded == 1) if (slot.n_decoded == 1)
@@ -1987,7 +1943,7 @@ struct llama_server_context
} }
result.tok = id; result.tok = id;
const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling); const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
result.probs.push_back({ result.probs.push_back({
@@ -2040,7 +1996,7 @@ static json format_partial_response(
struct token_translator struct token_translator
{ {
llama_context * ctx; llama_context * ctx;
std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); } std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
}; };
@@ -2147,9 +2103,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
data["ignore_eos"] = predict->ignoreeos(); data["ignore_eos"] = predict->ignoreeos();
data["embeddings"] = predict->embeddings(); data["embeddings"] = predict->embeddings();
// Add the correlationid to json data
data["correlation_id"] = predict->correlationid();
// for each image in the request, add the image data // for each image in the request, add the image data
// //
for (int i = 0; i < predict->images_size(); i++) { for (int i = 0; i < predict->images_size(); i++) {
@@ -2234,7 +2187,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// } // }
static void params_parse(const backend::ModelOptions* request, static void params_parse(const backend::ModelOptions* request,
common_params & params) { gpt_params & params) {
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809 // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
@@ -2342,7 +2295,7 @@ public:
grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) { grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
// Implement LoadModel RPC // Implement LoadModel RPC
common_params params; gpt_params params;
params_parse(request, params); params_parse(request, params);
llama_backend_init(); llama_backend_init();
@@ -2388,11 +2341,6 @@ public:
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0); int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
reply.set_prompt_tokens(tokens_evaluated); reply.set_prompt_tokens(tokens_evaluated);
// Log Request Correlation Id
LOG_VERBOSE("correlation:", {
{ "id", data["correlation_id"] }
});
// Send the reply // Send the reply
writer->Write(reply); writer->Write(reply);
@@ -2416,12 +2364,6 @@ public:
std::string completion_text; std::string completion_text;
task_result result = llama.queue_results.recv(task_id); task_result result = llama.queue_results.recv(task_id);
if (!result.error && result.stop) { if (!result.error && result.stop) {
// Log Request Correlation Id
LOG_VERBOSE("correlation:", {
{ "id", data["correlation_id"] }
});
completion_text = result.result_json.value("content", ""); completion_text = result.result_json.value("content", "");
int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0); int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0); int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
@@ -2461,31 +2403,6 @@ public:
return grpc::Status::OK; return grpc::Status::OK;
} }
// GetMetrics RPC: reports generation metrics for the slot that is currently
// processing, as returned by llama.get_active_slot().
//
// context, request : unused; MetricsRequest carries no fields that are read here.
// response         : filled with slot id, the slot prompt dumped as JSON, the
//                    throughput, the decoded-token count and the processed
//                    prompt-token count. Every field is set to 0 / "" when no
//                    slot is active.
// Always returns grpc::Status::OK.
grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
    llama_client_slot* active_slot = llama.get_active_slot();

    if (active_slot == nullptr) {
        // Handle case when no active slot exists
        response->set_slot_id(0);
        response->set_prompt_json_for_slot("");
        response->set_tokens_per_second(0);
        response->set_tokens_generated(0);
        response->set_prompt_tokens_processed(0);
        return grpc::Status::OK;
    }

    // Calculate the tokens per second using existing logic, but only once some
    // generation time has been recorded: dividing by a zero generation time
    // would put NaN or inf into the response instead of a usable number.
    // NOTE(review): the 1e3 factor suggests t_token_generation is in
    // milliseconds — confirm against the slot timing code.
    double tokens_per_second = 0.0;
    if (active_slot->t_token_generation > 0) {
        tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;
    }

    // Populate the response with metrics
    response->set_slot_id(active_slot->id);
    response->set_prompt_json_for_slot(active_slot->prompt.dump());
    response->set_tokens_per_second(tokens_per_second);
    response->set_tokens_generated(active_slot->n_decoded);
    response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);

    return grpc::Status::OK;
}
}; };
void RunServer(const std::string& server_address) { void RunServer(const std::string& server_address) {

View File

@@ -1,2 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch

View File

@@ -1 +1 @@
torch==2.4.1 torch

View File

@@ -1,2 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0 torch

View File

@@ -2,4 +2,4 @@
intel-extension-for-pytorch intel-extension-for-pytorch
torch torch
optimum[openvino] optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,6 +1,6 @@
accelerate accelerate
auto-gptq==0.7.1 auto-gptq==0.7.1
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi
transformers transformers

View File

@@ -1,4 +1,4 @@
transformers transformers
accelerate accelerate
torch==2.4.1 torch
torchaudio==2.4.1 torchaudio

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
torchaudio==2.4.1+cu118 torchaudio
transformers transformers
accelerate accelerate

View File

@@ -1,4 +1,4 @@
torch==2.4.1 torch
torchaudio==2.4.1 torchaudio
transformers transformers
accelerate accelerate

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0 torch
torchaudio==2.4.1+rocm6.0 torchaudio
transformers transformers
accelerate accelerate

View File

@@ -3,6 +3,6 @@ intel-extension-for-pytorch
torch torch
torchaudio torchaudio
optimum[openvino] optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
transformers transformers
accelerate accelerate

View File

@@ -1,4 +1,4 @@
bark==0.1.5 bark==0.1.5
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi

View File

@@ -1,2 +1,2 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf

View File

@@ -1,4 +1,3 @@
transformers transformers
accelerate accelerate
torch==2.4.1 torch
coqui-tts

View File

@@ -1,6 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
torchaudio==2.4.1+cu118 torchaudio
transformers transformers
accelerate accelerate
coqui-tts

View File

@@ -1,5 +1,4 @@
torch==2.4.1 torch
torchaudio==2.4.1 torchaudio
transformers transformers
accelerate accelerate
coqui-tts

View File

@@ -1,6 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0 torch
torchaudio==2.4.1+rocm6.0 torchaudio
transformers transformers
accelerate accelerate
coqui-tts

View File

@@ -3,7 +3,6 @@ intel-extension-for-pytorch
torch torch
torchaudio torchaudio
optimum[openvino] optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
transformers transformers
accelerate accelerate
coqui-tts

View File

@@ -1,4 +1,4 @@
grpcio==1.67.0 TTS==0.22.0
grpcio==1.66.1
protobuf protobuf
certifi certifi
packaging==24.1

View File

@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
This method sets up the gRPC service by starting the server This method sets up the gRPC service by starting the server
""" """
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"]) self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
time.sleep(30) time.sleep(10)
def tearDown(self) -> None: def tearDown(self) -> None:
""" """

View File

@@ -5,5 +5,5 @@ accelerate
compel compel
peft peft
sentencepiece sentencepiece
torch==2.4.1 torch
optimum-quanto optimum-quanto

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
diffusers diffusers
opencv-python opencv-python
transformers transformers

View File

@@ -1,4 +1,4 @@
torch==2.4.1 torch
diffusers diffusers
opencv-python opencv-python
transformers transformers

View File

@@ -3,7 +3,7 @@ intel-extension-for-pytorch
torch torch
torchvision torchvision
optimum[openvino] optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
diffusers diffusers
opencv-python opencv-python
transformers transformers

View File

@@ -1,5 +1,5 @@
setuptools setuptools
grpcio==1.67.0 grpcio==1.66.1
pillow pillow
protobuf protobuf
certifi certifi

View File

@@ -1,3 +1,3 @@
transformers transformers
accelerate accelerate
torch==2.4.1 torch

View File

@@ -1,4 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
transformers transformers
accelerate accelerate

View File

@@ -1,3 +1,3 @@
torch==2.4.1 torch
transformers transformers
accelerate accelerate

View File

@@ -1,4 +1,4 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi
wheel wheel

View File

@@ -1,2 +1,2 @@
torch==2.4.1 torch
transformers transformers

View File

@@ -1,3 +1,3 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
transformers transformers

View File

@@ -1,2 +1,2 @@
torch==2.4.1 torch
transformers transformers

View File

@@ -1,3 +1,3 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi

View File

@@ -1,3 +1 @@
torch==2.4.1 torch
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git

View File

@@ -1,4 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git

View File

@@ -1,3 +1 @@
torch==2.4.1 torch
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git

View File

@@ -1,4 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0 torch
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git

View File

@@ -2,22 +2,22 @@
intel-extension-for-pytorch intel-extension-for-pytorch
torch torch
optimum[openvino] optimum[openvino]
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
librosa==0.9.1 librosa==0.9.1
faster-whisper==0.9.0 faster-whisper==1.0.3
pydub==0.25.1 pydub==0.25.1
wavmark==0.0.3 wavmark==0.0.3
numpy==1.22.0 numpy==1.26.4
eng_to_ipa==0.0.2 eng_to_ipa==0.0.2
inflect==7.0.0 inflect==7.0.0
unidecode==1.3.7 unidecode==1.3.7
whisper-timestamped==1.14.2 whisper-timestamped==1.15.4
openai openai
python-dotenv python-dotenv
pypinyin==0.50.0 pypinyin==0.50.0
cn2an==0.5.22 cn2an==0.5.22
jieba==0.42.1 jieba==0.42.1
gradio==4.38.1
langid==1.1.6 langid==1.1.6
git+https://github.com/myshell-ai/MeloTTS.git git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git

View File

@@ -1,10 +1,10 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
librosa librosa
faster-whisper faster-whisper
pydub==0.25.1 pydub==0.25.1
wavmark==0.0.3 wavmark==0.0.3
numpy==1.22.0 numpy
eng_to_ipa==0.0.2 eng_to_ipa==0.0.2
inflect inflect
unidecode unidecode
@@ -13,8 +13,8 @@ openai
python-dotenv python-dotenv
pypinyin pypinyin
cn2an==0.5.22 cn2an==0.5.22
networkx==2.8.8
jieba==0.42.1 jieba==0.42.1
gradio==3.48.0 gradio
langid==1.1.6 langid==1.1.6
llvmlite==0.43.0 git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git

View File

@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
This method sets up the gRPC service by starting the server This method sets up the gRPC service by starting the server
""" """
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"]) self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
time.sleep(30) time.sleep(10)
def tearDown(self) -> None: def tearDown(self) -> None:
""" """

View File

@@ -15,12 +15,5 @@ installRequirements
# https://github.com/descriptinc/audiotools/issues/101 # https://github.com/descriptinc/audiotools/issues/101
# incompatible protobuf versions. # incompatible protobuf versions.
# PYDIR=python3.10 PYDIR=$(ls ${MY_DIR}/venv/lib)
# pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/" curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py
# if [ ! -d ${pyenv} ]; then
# echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
# exit 1
# fi
# curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py

View File

@@ -1,4 +1,3 @@
git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17 git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17
llvmlite==0.43.0 llvmlite==0.43.0
numba==0.60.0 numba==0.60.0
git+https://github.com/descriptinc/audiotools

View File

@@ -1,3 +1,3 @@
transformers transformers
accelerate accelerate
torch==2.4.1 torch

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
torchaudio==2.4.1+cu118 torchaudio
transformers transformers
accelerate accelerate

View File

@@ -1,4 +1,4 @@
torch==2.4.1 torch
torchaudio==2.4.1 torchaudio
transformers transformers
accelerate accelerate

View File

@@ -3,6 +3,6 @@ intel-extension-for-pytorch
torch torch
torchaudio torchaudio
optimum[openvino] optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
transformers transformers
accelerate accelerate

View File

@@ -1,4 +1,4 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi
llvmlite==0.43.0 llvmlite==0.43.0

View File

@@ -1,4 +1,4 @@
transformers transformers
accelerate accelerate
torch==2.4.1 torch
rerankers[transformers] rerankers[transformers]

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
transformers transformers
accelerate accelerate
torch==2.4.1+cu118 torch
rerankers[transformers] rerankers[transformers]

View File

@@ -1,4 +1,4 @@
transformers transformers
accelerate accelerate
torch==2.4.1 torch
rerankers[transformers] rerankers[transformers]

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
transformers transformers
accelerate accelerate
torch==2.4.1+rocm6.0 torch
rerankers[transformers] rerankers[transformers]

View File

@@ -5,4 +5,4 @@ accelerate
torch torch
rerankers[transformers] rerankers[transformers]
optimum[openvino] optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,3 +1,3 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi

View File

@@ -55,7 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
""" """
model_name = request.Model model_name = request.Model
try: try:
self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode) self.model = SentenceTransformer(model_name)
except Exception as err: except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

View File

@@ -1,6 +1,6 @@
torch==2.4.1 torch
accelerate accelerate
transformers transformers
bitsandbytes bitsandbytes
sentence-transformers==3.2.0 sentence-transformers==3.0.1
transformers transformers

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
accelerate accelerate
sentence-transformers==3.2.0 sentence-transformers==3.0.1
transformers transformers

View File

@@ -1,4 +1,4 @@
torch==2.4.1 torch
accelerate accelerate
sentence-transformers==3.2.0 sentence-transformers==3.0.1
transformers transformers

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0 torch
accelerate accelerate
sentence-transformers==3.2.0 sentence-transformers==3.0.1
transformers transformers

View File

@@ -4,5 +4,5 @@ torch
optimum[openvino] optimum[openvino]
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
accelerate accelerate
sentence-transformers==3.2.0 sentence-transformers==3.0.1
transformers transformers

View File

@@ -1,5 +1,3 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi
datasets
einops

View File

@@ -1,3 +1,3 @@
transformers transformers
accelerate accelerate
torch==2.4.1 torch

View File

@@ -1,4 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
transformers transformers
accelerate accelerate
torch==2.4.1+cu118 torch

View File

@@ -1,3 +1,3 @@
transformers transformers
accelerate accelerate
torch==2.4.1 torch

View File

@@ -1,4 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
transformers transformers
accelerate accelerate
torch==2.4.1+rocm6.0 torch

View File

@@ -4,4 +4,4 @@ transformers
accelerate accelerate
torch torch
optimum[openvino] optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,4 +1,4 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
scipy==1.14.0 scipy==1.14.0
certifi certifi

View File

@@ -72,12 +72,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns: Returns:
A Result object that contains the result of the LoadModel operation. A Result object that contains the result of the LoadModel operation.
""" """
model_name = request.Model model_name = request.Model
# Check to see if the Model exists in the filesystem already.
if os.path.exists(request.ModelFile):
model_name = request.ModelFile
compute = torch.float16 compute = torch.float16
if request.F16Memory == True: if request.F16Memory == True:

View File

@@ -1,4 +1,4 @@
torch==2.4.1 torch
accelerate accelerate
transformers transformers
bitsandbytes bitsandbytes

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch
accelerate accelerate
transformers transformers
bitsandbytes bitsandbytes

View File

@@ -1,4 +1,4 @@
torch==2.4.1 torch
accelerate accelerate
transformers transformers
bitsandbytes bitsandbytes

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0 torch
accelerate accelerate
transformers transformers
bitsandbytes bitsandbytes

View File

@@ -1,4 +1,4 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,3 +1,3 @@
accelerate accelerate
torch==2.4.1 torch
torchaudio==2.4.1 torchaudio

View File

@@ -1,4 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
accelerate accelerate
torch==2.4.1+cu118 torch
torchaudio==2.4.1+cu118 torchaudio

View File

@@ -1,3 +1,3 @@
accelerate accelerate
torch==2.4.1 torch
torchaudio==2.4.1 torchaudio

View File

@@ -4,4 +4,4 @@ accelerate
torch torch
torchaudio torchaudio
optimum[openvino] optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,3 +1,3 @@
grpcio==1.67.0 grpcio==1.66.1
protobuf protobuf
certifi certifi

View File

@@ -5,8 +5,6 @@ import argparse
import signal import signal
import sys import sys
import os import os
from typing import List
from PIL import Image
import backend_pb2 import backend_pb2
import backend_pb2_grpc import backend_pb2_grpc
@@ -17,10 +15,6 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid from vllm.utils import random_uuid
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.multimodal.utils import fetch_image
from vllm.assets.video import VideoAsset
import base64
import io
_ONE_DAY_IN_SECONDS = 60 * 60 * 24 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -95,8 +89,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.Quantization != "": if request.Quantization != "":
engine_args.quantization = request.Quantization engine_args.quantization = request.Quantization
if request.LoadFormat != "":
engine_args.load_format = request.LoadFormat
if request.GPUMemoryUtilization != 0: if request.GPUMemoryUtilization != 0:
engine_args.gpu_memory_utilization = request.GPUMemoryUtilization engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
if request.TrustRemoteCode: if request.TrustRemoteCode:
@@ -113,7 +105,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
try: try:
self.llm = AsyncLLMEngine.from_engine_args(engine_args) self.llm = AsyncLLMEngine.from_engine_args(engine_args)
except Exception as err: except Exception as err:
print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
try: try:
@@ -126,7 +117,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
) )
except Exception as err: except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
print("Model loaded successfully", file=sys.stderr)
return backend_pb2.Result(message="Model loaded successfully", success=True) return backend_pb2.Result(message="Model loaded successfully", success=True)
async def Predict(self, request, context): async def Predict(self, request, context):
@@ -205,35 +196,15 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.Seed != 0: if request.Seed != 0:
sampling_params.seed = request.Seed sampling_params.seed = request.Seed
# Extract image paths and process images
prompt = request.Prompt prompt = request.Prompt
image_paths = request.Images # If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template
image_data = [self.load_image(img_path) for img_path in image_paths]
videos_path = request.Videos
video_data = [self.load_video(video_path) for video_path in videos_path]
# If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
if not request.Prompt and request.UseTokenizerTemplate and request.Messages: if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True) prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
# Generate text using the LLM engine # Generate text
request_id = random_uuid() request_id = random_uuid()
print(f"Generating text with request_id: {request_id}", file=sys.stderr) outputs = self.llm.generate(prompt, sampling_params, request_id)
multi_modal_data = {}
if image_data:
multi_modal_data["image"] = image_data
if video_data:
multi_modal_data["video"] = video_data
outputs = self.llm.generate(
{
"prompt": prompt,
"multi_modal_data": multi_modal_data if multi_modal_data else None,
},
sampling_params=sampling_params,
request_id=request_id,
)
# Stream the results # Stream the results
generated_text = "" generated_text = ""
@@ -256,57 +227,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if streaming: if streaming:
return return
# Remove the image files from /tmp folder
for img_path in image_paths:
try:
os.remove(img_path)
except Exception as e:
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
# Sending the final generated text # Sending the final generated text
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8')) yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
def load_image(self, image_path: str):
"""
Load an image from the given file path or base64 encoded data.
Args:
image_path (str): The path to the image file or base64 encoded data.
Returns:
Image: The loaded image.
"""
try:
image_data = base64.b64decode(image_path)
image = Image.open(io.BytesIO(image_data))
return image
except Exception as e:
print(f"Error loading image {image_path}: {e}", file=sys.stderr)
return None
def load_video(self, video_path: str):
"""
Load a video from the given file path.
Args:
video_path (str): The path to the image file.
Returns:
Video: The loaded video.
"""
try:
timestamp = str(int(time.time() * 1000)) # Generate timestamp
p = f"/tmp/vl-{timestamp}.data" # Use timestamp in filename
with open(p, "wb") as f:
f.write(base64.b64decode(video_path))
video = VideoAsset(name=p).np_ndarrays
os.remove(p)
return video
except Exception as e:
print(f"Error loading video {video_path}: {e}", file=sys.stderr)
return None
async def serve(address): async def serve(address):
# Start asyncio gRPC server # Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)) server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))

View File

@@ -13,20 +13,4 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi fi
# We don't embed this into the images as it is a large dependency and not always needed. installRequirements
# Besides, the speed inference are not actually usable in the current state for production use-cases.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
ensureVenv
# https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
if [ ! -d vllm ]; then
git clone https://github.com/vllm-project/vllm
fi
pushd vllm
uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.67.0 protobuf bitsandbytes
uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
VLLM_TARGET_DEVICE=cpu python setup.py install
popd
rm -rf vllm
else
installRequirements
fi

View File

@@ -1,3 +1,3 @@
accelerate accelerate
torch==2.4.1 torch
transformers transformers

View File

@@ -1,5 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
accelerate accelerate
torch==2.4.1+cu118 torch
transformers transformers
bitsandbytes

Some files were not shown because too many files have changed in this diff Show More