Compare commits

...

24 Commits

Author SHA1 Message Date
Ettore Di Giacinto
b16a01d0bd WIP speculative 2025-01-24 10:17:54 +01:00
Gianluca Boiano
9a1182fa01 chore(model gallery): add flux.1, stablediffusion and whisper icons (#4680)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-24 08:29:02 +01:00
Gianluca Boiano
66e9ef3f33 chore(model gallery): add DeepSeek R1 14b, 32b and 70b (#4679)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-24 08:28:44 +01:00
Ettore Di Giacinto
8282414583 chore(downloader): support hf.co and hf:// URIs (#4677)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-24 08:27:22 +01:00
Gianluca Boiano
d1d7ce83d4 chore(model gallery): add MiniCPM-o-2.6-7.6b (#4676)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-24 08:27:02 +01:00
Ettore Di Giacinto
5177837ab0 chore: detect and enable avx512 builds (#4675)
chore(avx512): add support

Fixes https://github.com/mudler/LocalAI/issues/4662

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-24 08:26:44 +01:00
Ettore Di Giacinto
f9e368b7c4 chore(refactor): group cpu cap detection (#4674)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 16:35:44 +01:00
Ettore Di Giacinto
eef80b9880 chore(ci): cleanup tests
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 10:02:57 +01:00
Ettore Di Giacinto
073eaec729 chore(openvoice): drop backend (#4673)
The project (MeloTTS) has been quite since long, newer backends are much
performant and better quality overall.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 10:00:36 +01:00
Ettore Di Giacinto
318225f631 chore(parler-tts): drop backend (#4672)
We support at this point more extensive backends that are SOTA and
support also voice cloning, and many other features. This backend is
superseded and also poses significant maintenance burden as there is an
open issue https://github.com/mudler/LocalAI/issues/3941 which is still
open as it deps are pinning old versions of grpc.

Closes https://github.com/mudler/LocalAI/issues/3941

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 09:46:16 +01:00
Ettore Di Giacinto
89429a439b feat(transformers): add support to Mamba (#4669)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 09:30:47 +01:00
LocalAI [bot]
200fe358f0 chore: ⬆️ Update ggerganov/llama.cpp to 6152129d05870cb38162c422c6ba80434e021e9f (#4668)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-23 08:06:43 +01:00
Ettore Di Giacinto
e426ab7c23 feat(faster-whisper): add backend (#4666)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-23 08:06:18 +01:00
LocalAI [bot]
715071b68d feat(swagger): update swagger (#4667)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-22 21:51:38 +01:00
Peter Cover
a05737c7e4 chore: fix some function names in comment (#4665)
Signed-off-by: petercover <raowanxiang@outlook.com>
2025-01-22 19:35:53 +01:00
Richard Palethorpe
e8eb0b2c50 fix(stores): Stores fixes and testing (#4663)
* fix(stores): Actually check a vector is a unit vector/normalized

Instead of just summing the components to see if they equal 1.0, take
the actual magnitude/p-norm of the vector and check that is
approximately 1.0.

Note that this shouldn't change the order of results except in edge
cases if I am too lax with the precision of the equality
comparison. However it should improve performance for normalized
vectors which were being misclassified.

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(stores): Add tests for known results and triangle inequality

This adds some more tests to check the cosine similarity function has
some expected mathematical properties.

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-01-22 19:35:05 +01:00
Ettore Di Giacinto
e15d29aba2 chore(stablediffusion-ncn): drop in favor of ggml implementation (#4652)
* chore(stablediffusion-ncn): drop in favor of ggml implementation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(ci): drop stablediffusion build

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): add

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): try to fixup current tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Try to fix tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Tests improvements

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): use quality to specify step

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* chore(tests): switch to sd-1.5

also increase prep time for downloading models

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-22 19:34:16 +01:00
Ettore Di Giacinto
10675ac28e Update README.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-01-22 18:07:30 +01:00
Ettore Di Giacinto
0ec25b8b07 chore(model gallery): add sd-1.5-ggml and sd-3.5-medium-ggml (#4664)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-22 16:37:20 +01:00
LocalAI [bot]
e81ceff681 chore: ⬆️ Update ggerganov/llama.cpp to 6171c9d25820ccf676b243c172868819d882848f (#4661)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-21 22:04:29 +00:00
Ettore Di Giacinto
6831719e1e chore(model gallery): add deepseek-r1-distill-qwen-7b (#4660)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-21 15:09:36 +01:00
Gianluca Boiano
b264a91b3f chore(model gallery): add Deepseek-R1-Distill models (#4646)
* chore(model gallery): add Deepseek-R1-Distill-Llama-8b

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

* chore(model gallery): add Deepseek-R1-Distill-Qwen-1.5b

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>

---------

Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-01-21 10:37:05 +01:00
LocalAI [bot]
1a08948e63 chore: ⬆️ Update ggerganov/llama.cpp to aea8ddd5165d525a449e2fc3839db77a71f4a318 (#4657)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-01-21 08:37:13 +01:00
dependabot[bot]
14a1e02f44 chore(deps): Bump docs/themes/hugo-theme-relearn from 80e448e to 8dad5ee (#4656)
chore(deps): Bump docs/themes/hugo-theme-relearn

Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `80e448e` to `8dad5ee`.
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases)
- [Commits](80e448e5bd...8dad5ee419)

---
updated-dependencies:
- dependency-name: docs/themes/hugo-theme-relearn
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-20 23:33:40 +00:00
80 changed files with 887 additions and 1434 deletions

View File

@@ -7,7 +7,7 @@ services:
args:
- FFMPEG=true
- IMAGE_TYPE=extras
- GO_TAGS=stablediffusion p2p tts
- GO_TAGS=p2p tts
env_file:
- ../.env
ports:

6
.env
View File

@@ -38,12 +38,12 @@
## Uncomment and set to true to enable rebuilding from source
# REBUILD=true
## Enable go tags, available: stablediffusion, tts
## stablediffusion: image generation with stablediffusion
## Enable go tags, available: p2p, tts
## p2p: enable distributed inferencing
## tts: enables text-to-speech with go-piper
## (requires REBUILD=true)
#
# GO_TAGS=stablediffusion
# GO_TAGS=p2p
## Path where to store generated images
# LOCALAI_IMAGE_PATH=/tmp/generated/images

View File

@@ -237,40 +237,7 @@ jobs:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
build-stablediffusion:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Build stablediffusion
run: |
export PATH=$PATH:$GOPATH/bin
make backend-assets/grpc/stablediffusion
mkdir -p release && cp backend-assets/grpc/stablediffusion release
env:
GO_TAGS: stablediffusion
- uses: actions/upload-artifact@v4
with:
name: stablediffusion
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
build-macOS-x86_64:
runs-on: macos-13

View File

@@ -78,57 +78,6 @@ jobs:
make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test
tests-parler-tts:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test parler-tts
run: |
make --jobs=5 --output-sync=target -C backend/python/parler-tts
make --jobs=5 --output-sync=target -C backend/python/parler-tts test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
tests-openvoice:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test openvoice
run: |
make --jobs=5 --output-sync=target -C backend/python/openvoice
make --jobs=5 --output-sync=target -C backend/python/openvoice test
# tests-transformers-musicgen:
# runs-on: ubuntu-latest
# steps:

View File

@@ -105,9 +105,7 @@ jobs:
# Pre-build piper before we start tests in order to have shared libraries in place
make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
env:
CUDA_VERSION: 12-4
- name: Cache grpc
@@ -129,7 +127,7 @@ jobs:
cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test
run: |
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19

2
.vscode/launch.json vendored
View File

@@ -26,7 +26,7 @@
"LOCALAI_P2P": "true",
"LOCALAI_FEDERATED": "true"
},
"buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
"buildFlags": ["-tags", "p2p tts", "-v"],
"envFile": "${workspaceFolder}/.env",
"cwd": "${workspaceRoot}"
}

View File

@@ -15,8 +15,7 @@ ARG TARGETARCH
ARG TARGETVARIANT
ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
@@ -69,14 +68,10 @@ ENV PATH=/opt/rocm/bin:${PATH}
# OpenBLAS requirements and stable diffusion
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libopenblas-dev \
libopencv-dev && \
libopenblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set up OpenCV
RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
WORKDIR /build
###################################
@@ -251,7 +246,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
FROM requirements-drivers AS builder-base
ARG GO_TAGS="stablediffusion tts p2p"
ARG GO_TAGS="tts p2p"
ARG GRPC_BACKENDS
ARG MAKEFLAGS
ARG LD_FLAGS="-s -w"
@@ -285,35 +280,12 @@ RUN <<EOT bash
fi
EOT
###################################
###################################
# This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
# In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
FROM builder-base AS builder-sd
# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
COPY Makefile .
COPY go.mod .
COPY go.sum .
COPY backend/backend.proto ./backend/backend.proto
COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
COPY pkg/grpc ./pkg/grpc
COPY pkg/stablediffusion ./pkg/stablediffusion
RUN git init
RUN make sources/go-stable-diffusion
RUN touch prepare-sources
# Actually build the backend
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
###################################
###################################
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
# Adjustments to the build process should likely be made here.
FROM builder-sd AS builder
FROM builder-base AS builder
# Install the pre-built GRPC
COPY --from=grpc /opt/grpc /usr/local
@@ -331,7 +303,7 @@ RUN make prepare
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
## (both will use CUDA or hipblas for the actual computation)
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \
make build; \
fi
@@ -353,8 +325,6 @@ ARG FFMPEG
COPY --from=grpc /opt/grpc /usr/local
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
COPY .devcontainer-scripts /.devcontainer-scripts
# Add FFmpeg
@@ -427,9 +397,6 @@ COPY --from=builder /build/local-ai ./
# Copy shared libraries for piper
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
# do not let stablediffusion rebuild (requires an older version of absl)
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
# Change the shell to bash so we can use [[ tests below
SHELL ["/bin/bash", "-c"]
# We try to strike a balance between individual layer size (as that affects total push time) and total image size
@@ -443,8 +410,8 @@ RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/coqui \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/parler-tts \
if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/faster-whisper \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/diffusers \
@@ -453,9 +420,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/kokoro \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/openvoice \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama2 \
; fi && \
@@ -474,9 +438,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/rerankers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/mamba \
; fi
# Make sure the models directory exists

View File

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=92bc493917d43b83e592349e138b54c90b1c3ea7
CPPLLAMA_VERSION?=6152129d05870cb38162c422c6ba80434e021e9f
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -18,10 +18,6 @@ WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
PIPER_REPO?=https://github.com/mudler/go-piper
PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
# stablediffusion version
STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
# bark.cpp
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=v1.0.0
@@ -179,11 +175,6 @@ ifeq ($(STATIC),true)
LD_FLAGS+=-linkmode external -extldflags -static
endif
ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
# OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
endif
ifeq ($(findstring tts,$(GO_TAGS)),tts)
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
@@ -195,6 +186,7 @@ endif
ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
@@ -273,19 +265,6 @@ sources/go-piper:
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
## stable diffusion (onnx)
sources/go-stable-diffusion:
mkdir -p sources/go-stable-diffusion
cd sources/go-stable-diffusion && \
git init && \
git remote add origin $(STABLEDIFFUSION_REPO) && \
git fetch origin && \
git checkout $(STABLEDIFFUSION_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
## stablediffusion (ggml)
sources/stablediffusion-ggml.cpp:
git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
@@ -331,20 +310,18 @@ sources/whisper.cpp:
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion backend/cpp/llama/llama.cpp
get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace
@@ -355,7 +332,6 @@ rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-stable-diffusion clean
$(MAKE) -C sources/go-piper clean
$(MAKE) build
@@ -470,7 +446,7 @@ prepare-test: grpcs
test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests'
export GO_TAGS="tts stablediffusion debug"
export GO_TAGS="tts debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
@@ -558,10 +534,10 @@ protogen-go-clean:
$(RM) bin/*
.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen transformers-protogen parler-tts-protogen kokoro-protogen vllm-protogen openvoice-protogen
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean rerankers-protogen-clean transformers-protogen-clean parler-tts-protogen-clean kokoro-protogen-clean vllm-protogen-clean openvoice-protogen-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
.PHONY: autogptq-protogen
autogptq-protogen:
@@ -595,6 +571,14 @@ diffusers-protogen:
diffusers-protogen-clean:
$(MAKE) -C backend/python/diffusers protogen-clean
.PHONY: faster-whisper-protogen
faster-whisper-protogen:
$(MAKE) -C backend/python/faster-whisper protogen
.PHONY: faster-whisper-protogen-clean
faster-whisper-protogen-clean:
$(MAKE) -C backend/python/faster-whisper protogen-clean
.PHONY: exllama2-protogen
exllama2-protogen:
$(MAKE) -C backend/python/exllama2 protogen
@@ -603,14 +587,6 @@ exllama2-protogen:
exllama2-protogen-clean:
$(MAKE) -C backend/python/exllama2 protogen-clean
.PHONY: mamba-protogen
mamba-protogen:
$(MAKE) -C backend/python/mamba protogen
.PHONY: mamba-protogen-clean
mamba-protogen-clean:
$(MAKE) -C backend/python/mamba protogen-clean
.PHONY: rerankers-protogen
rerankers-protogen:
$(MAKE) -C backend/python/rerankers protogen
@@ -627,14 +603,6 @@ transformers-protogen:
transformers-protogen-clean:
$(MAKE) -C backend/python/transformers protogen-clean
.PHONY: parler-tts-protogen
parler-tts-protogen:
$(MAKE) -C backend/python/parler-tts protogen
.PHONY: parler-tts-protogen-clean
parler-tts-protogen-clean:
$(MAKE) -C backend/python/parler-tts protogen-clean
.PHONY: kokoro-protogen
kokoro-protogen:
$(MAKE) -C backend/python/kokoro protogen
@@ -643,14 +611,6 @@ kokoro-protogen:
kokoro-protogen-clean:
$(MAKE) -C backend/python/kokoro protogen-clean
.PHONY: openvoice-protogen
openvoice-protogen:
$(MAKE) -C backend/python/openvoice protogen
.PHONY: openvoice-protogen-clean
openvoice-protogen-clean:
$(MAKE) -C backend/python/openvoice protogen-clean
.PHONY: vllm-protogen
vllm-protogen:
$(MAKE) -C backend/python/vllm protogen
@@ -666,13 +626,11 @@ prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/coqui
$(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/faster-whisper
$(MAKE) -C backend/python/vllm
$(MAKE) -C backend/python/mamba
$(MAKE) -C backend/python/rerankers
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/parler-tts
$(MAKE) -C backend/python/kokoro
$(MAKE) -C backend/python/openvoice
$(MAKE) -C backend/python/exllama2
prepare-test-extra: protogen-python
@@ -742,6 +700,13 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx512
$(MAKE) -C backend/cpp/llama-avx512 purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx
$(MAKE) -C backend/cpp/llama-avx purge
@@ -816,13 +781,6 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/piper
endif
backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion
endif
backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero

View File

@@ -39,7 +39,7 @@
</p>
<p align="center">
<a href="https://trendshift.io/repositories/1484" target="_blank"><img src="https://trendshift.io/api/badge/repositories/1484" alt="go-skynet%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)

View File

@@ -1,56 +1,17 @@
name: stablediffusion
backend: stablediffusion
backend: stablediffusion-ggml
cfg_scale: 4.5
options:
- sampler:euler
parameters:
model: stablediffusion_assets
license: "BSD-3"
urls:
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
step: 25
download_files:
- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
- filename: "stablediffusion_assets/log_sigmas.bin"
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
- filename: "stablediffusion_assets/vocab.txt"
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
usage: |
curl http://localhost:8080/v1/images/generations \

View File

@@ -22,6 +22,7 @@
#include "backend.grpc.pb.h"
#include "utils.hpp"
#include "sampling.h"
#include "speculative.h"
// include std::regex
#include <cstddef>
#include <thread>
@@ -185,12 +186,45 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector<com
return out;
}
struct llama_slot_params {
uint32_t seed = -1; // RNG seed
bool stream = true;
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
bool return_tokens = false;
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_adapter_lora_info> lora;
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false;
bool post_sampling_probs = false;
bool ignore_eos = false;
json input_prefix;
json input_suffix;
struct common_params_sampling sampling;
struct common_params_speculative speculative;
};
struct llama_client_slot
{
int id;
int task_id = -1;
struct slot_params params;
struct llama_slot_params params;
common_speculative * spec = nullptr;
llama_batch batch_spec = {};
slot_state state = IDLE;
slot_command command = NONE;
@@ -283,6 +317,7 @@ struct llama_client_slot
images.clear();
}
bool has_budget(common_params &global_params) {
if (params.n_predict == -1 && global_params.n_predict == -1)
{
@@ -454,6 +489,10 @@ struct llama_server_context
{
llama_model *model = nullptr;
llama_context *ctx = nullptr;
common_init_result llama_init_dft;
llama_context * ctx_dft = nullptr;
llama_model * model_dft = nullptr;
llama_context_params cparams_dft;
const llama_vocab * vocab = nullptr;
clip_ctx *clp_ctx = nullptr;
@@ -502,6 +541,7 @@ struct llama_server_context
}
}
bool load_model(const common_params &params_)
{
params = params_;
@@ -545,6 +585,45 @@ struct llama_server_context
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
if (!params.speculative.model.empty()) {
LOG("loading draft model '%s'\n", params.speculative.model.c_str());
auto params_dft = params;
params_dft.devices = params.speculative.devices;
params_dft.model = params.speculative.model;
params_dft.n_ctx = params.speculative.n_ctx == 0 ? params.n_ctx / params.n_parallel : params.speculative.n_ctx;
params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
llama_init_dft = common_init_from_params(params_dft);
model_dft = llama_init_dft.model.get();
if (model_dft == nullptr) {
LOG("failed to load draft model, '%s'\n", params.speculative.model.c_str());
return false;
}
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
LOG("the draft model '%s' is not compatible with the target model '%s'\n", params.speculative.model.c_str(), params.model.c_str());
return false;
}
const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
cparams_dft = common_context_params_to_llama(params_dft);
cparams_dft.n_batch = n_ctx_dft;
// force F16 KV cache for the draft model for extra performance
cparams_dft.type_k = GGML_TYPE_F16;
cparams_dft.type_v = GGML_TYPE_F16;
// the context is not needed - we will create one for each slot
llama_init_dft.context.reset();
}
return true;
}
@@ -573,6 +652,22 @@ struct llama_server_context
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
if (model_dft) {
slot.batch_spec = llama_batch_init(params.speculative.n_max + 1, 0, 1);
ctx_dft = llama_init_from_model(model_dft, cparams_dft);
if (ctx_dft == nullptr) {
LOG("%s", "failed to create draft context\n");
return;
}
slot.spec = common_speculative_init(ctx_dft);
if (slot.spec == nullptr) {
LOG("%s", "failed to create speculator\n");
return;
}
}
LOG_INFO("new slot", {
{"slot_id", slot.id},
{"n_ctx_slot", slot.n_ctx}
@@ -681,9 +776,11 @@ struct llama_server_context
}
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
slot_params default_params;
llama_slot_params default_params;
common_params_sampling default_sparams;
default_sparams.speculative = params_base.speculative;
slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -707,6 +804,15 @@ struct llama_server_context
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
slot->sparams.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
slot->sparams.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
slot->sparams.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
slot->sparams.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
slot->sparams.speculative.n_min = std::max(params.speculative.n_min, 2);
slot->sparams.speculative.n_max = std::max(params.speculative.n_max, 0);
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
// Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", {
@@ -2024,6 +2130,97 @@ struct llama_server_context
}
}
// do speculative decoding
for (auto & slot : slots) {
if (!slot.is_processing() || !(ctx_dft && params.speculative.n_max > 0)) {
continue;
}
if (slot.state != PROCESSING) {
continue;
}
// determine the max draft that fits the current slot state
int n_draft_max = slot.params.speculative.n_max;
// note: n_past is not yet increased for the `id` token sampled above
// also, need to leave space for 1 extra token to allow context shifts
n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2);
if (slot.n_remaining > 0) {
n_draft_max = std::min(n_draft_max, slot.n_remaining - 1);
}
LOG("max possible draft: %d\n", n_draft_max);
if (n_draft_max < slot.params.speculative.n_min) {
LOG("the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min);
continue;
}
llama_token id = slot.sampled;
struct common_speculative_params params_spec;
params_spec.n_draft = n_draft_max;
params_spec.n_reuse = llama_n_ctx(ctx_dft) - slot.params.speculative.n_max;
params_spec.p_min = slot.params.speculative.p_min;
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
// ignore small drafts
if (slot.params.speculative.n_min > (int) draft.size()) {
LOG("ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
continue;
}
// construct the speculation batch
common_batch_clear(slot.batch_spec);
common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
for (size_t i = 0; i < draft.size(); ++i) {
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
}
LOG("decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);
llama_decode(ctx, slot.batch_spec);
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.ctx_sampling, ctx, draft);
slot.n_past += ids.size();
slot.n_decoded += ids.size();
slot.cache_tokens.push_back(id);
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;
result.tok = ids[i];
result.text_to_send = common_token_to_piece(ctx, result.tok, params.special);
//result.prob = 1.0f; // set later
// TODO: set result.probs
if (!process_token(result, slot)) {
// release slot because of stop condition
slot.release();
slot.print_timings();
send_final_response(slot);
metrics.on_prediction(slot);
break;
}
}
LOG("accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past);
}
LOG_VERBOSE("slots updated", {});
return true;
}
@@ -2296,6 +2493,30 @@ static void params_parse(const backend::ModelOptions* request,
params.cpuparams.n_threads = request->threads();
params.n_gpu_layers = request->ngpulayers();
params.n_batch = request->nbatch();
params.speculative.model = request->draftmodel();
// If options is not NULL, parse options
for (int i = 0; request->options()[i] != NULL; i++) {
char *optname = strtok(request->options()[i], ":");
char *optval = strtok(NULL, ":");
if (optval == NULL) {
optval = "true";
}
if (!strcmp(optname, "speculative.n_gpu_layers")) {
params.speculative.n_gpu_layers = std::stoi(optval);
}
if (!strcmp(optname, "speculative.n_ctx")) {
params.speculative.n_ctx = std::stoi(optval);
}
}
if params.speculative.n_gpu_layers == 0 {
params.speculative.n_gpu_layers = params.n_gpu_layers;
}
if params.speculative.n_ctx == 0 {
params.speculative.n_ctx = params.n_ctx;
}
// Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
//params.n_parallel = 1;
const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");

View File

@@ -1,21 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Image{}); err != nil {
panic(err)
}
}

View File

@@ -1,33 +0,0 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/stablediffusion"
)
type Image struct {
base.SingleThread
stablediffusion *stablediffusion.StableDiffusion
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.stablediffusion.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Mode),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}

View File

@@ -311,12 +311,16 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
}
func isNormalized(k []float32) bool {
var sum float32
var sum float64
for _, v := range k {
sum += v
v64 := float64(v)
sum += v64*v64
}
return sum == 1.0
s := math.Sqrt(sum)
return s >= 0.99 && s <= 1.01
}
// TODO: This we could replace with handwritten SIMD code
@@ -328,7 +332,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
dot += k1[i] * k2[i]
}
assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
// 2.0 * (1.0 - dot) would be the Euclidean distance
return dot
@@ -418,7 +422,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
sim := float32(dot / (mag1 * math.Sqrt(mag2)))
assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
return sim
}

View File

@@ -1,8 +1,9 @@
.DEFAULT_GOAL := install
.PHONY: install
install: protogen
install:
bash install.sh
$(MAKE) protogen
.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py
@@ -12,14 +13,8 @@ protogen-clean:
$(RM) backend_pb2_grpc.py backend_pb2.py
backend_pb2_grpc.py backend_pb2.py:
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
bash protogen.sh
.PHONY: clean
clean: protogen-clean
rm -rf venv __pycache__
.PHONY: test
test: protogen
@echo "Testing openvoice..."
bash test.sh
@echo "openvoice tested."
rm -rf venv __pycache__

View File

@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""
This is an extra gRPC server of LocalAI for Bark TTS
"""
from concurrent import futures
import time
import argparse
import signal
import sys
import os
import backend_pb2
import backend_pb2_grpc
from faster_whisper import WhisperModel
import grpc
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None)
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
BackendServicer is the class that implements the gRPC service
"""
def Health(self, request, context):
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
device = "cpu"
# Get device
# device = "cuda" if request.CUDA else "cpu"
if request.CUDA:
device = "cuda"
try:
print("Preparing models, please wait", file=sys.stderr)
self.model = WhisperModel(request.Model, device=device, compute_type="float16")
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
# Implement your logic here for the LoadModel service
# Replace this with your desired response
return backend_pb2.Result(message="Model loaded successfully", success=True)
def AudioTranscription(self, request, context):
resultSegments = []
text = ""
try:
segments, info = self.model.transcribe(request.dst, beam_size=5, condition_on_previous_text=False)
id = 0
for segment in segments:
print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
resultSegments.append(backend_pb2.TranscriptSegment(id=id, start=segment.start, end=segment.end, text=segment.text))
text += segment.text
id += 1
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)

View File

@@ -12,5 +12,3 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
fi
installRequirements
python -m unidic download

View File

View File

@@ -0,0 +1,8 @@
faster-whisper
opencv-python
accelerate
compel
peft
sentencepiece
torch==2.4.1
optimum-quanto

View File

@@ -0,0 +1,9 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
faster-whisper
opencv-python
accelerate
compel
peft
sentencepiece
optimum-quanto

View File

@@ -0,0 +1,8 @@
torch==2.4.1
faster-whisper
opencv-python
accelerate
compel
peft
sentencepiece
optimum-quanto

View File

@@ -0,0 +1,3 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
faster-whisper

View File

@@ -1,8 +1,6 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
transformers
accelerate
faster-whisper

View File

@@ -1,3 +1,3 @@
grpcio==1.69.0
protobuf
certifi
grpcio-tools

View File

@@ -1,29 +0,0 @@
.PHONY: mamba
mamba: protogen
bash install.sh
.PHONY: run
run: protogen
@echo "Running mamba..."
bash run.sh
@echo "mamba run."
.PHONY: test
test: protogen
@echo "Testing mamba..."
bash test.sh
@echo "mamba tested."
.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py
.PHONY: protogen-clean
protogen-clean:
$(RM) backend_pb2_grpc.py backend_pb2.py
backend_pb2_grpc.py backend_pb2.py:
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
.PHONY: clean
clean: protogen-clean
$(RM) -r venv __pycache__

View File

@@ -1,5 +0,0 @@
# Creating a separate environment for the mamba project
```
make mamba
```

View File

@@ -1,179 +0,0 @@
#!/usr/bin/env python3
from concurrent import futures
import time
import argparse
import signal
import sys
import os
import backend_pb2
import backend_pb2_grpc
import grpc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
MAMBA_CHAT= os.environ.get('MAMBA_CHAT', '1') == '1'
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
A gRPC servicer that implements the Backend service defined in backend.proto.
"""
def generate(self,prompt, max_new_tokens):
"""
Generates text based on the given prompt and maximum number of new tokens.
Args:
prompt (str): The prompt to generate text from.
max_new_tokens (int): The maximum number of new tokens to generate.
Returns:
str: The generated text.
"""
self.generator.end_beam_search()
# Tokenizing the input
ids = self.generator.tokenizer.encode(prompt)
self.generator.gen_begin_reuse(ids)
initial_len = self.generator.sequence[0].shape[0]
has_leading_space = False
decoded_text = ''
for i in range(max_new_tokens):
token = self.generator.gen_single_token()
if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith(''):
has_leading_space = True
decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
if has_leading_space:
decoded_text = ' ' + decoded_text
if token.item() == self.generator.tokenizer.eos_token_id:
break
return decoded_text
def Health(self, request, context):
"""
Returns a health check message.
Args:
request: The health check request.
context: The gRPC context.
Returns:
backend_pb2.Reply: The health check reply.
"""
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
"""
Loads a language model.
Args:
request: The load model request.
context: The gRPC context.
Returns:
backend_pb2.Result: The load model result.
"""
try:
tokenizerModel = request.Tokenizer
if tokenizerModel == "":
tokenizerModel = request.Model
tokenizer = AutoTokenizer.from_pretrained(tokenizerModel)
if MAMBA_CHAT:
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token
self.tokenizer = tokenizer
self.model = MambaLMHeadModel.from_pretrained(request.Model, device="cuda", dtype=torch.float16)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Predict(self, request, context):
"""
Generates text based on the given prompt and sampling parameters.
Args:
request: The predict request.
context: The gRPC context.
Returns:
backend_pb2.Result: The predict result.
"""
if request.TopP == 0:
request.TopP = 0.9
max_tokens = request.Tokens
if request.Tokens == 0:
max_tokens = 2000
# encoded_input = self.tokenizer(request.Prompt)
tokens = self.tokenizer(request.Prompt, return_tensors="pt")
input_ids = tokens.input_ids.to(device="cuda")
out = self.model.generate(input_ids=input_ids, max_length=max_tokens, temperature=request.Temperature,
top_p=request.TopP, eos_token_id=self.tokenizer.eos_token_id)
decoded = self.tokenizer.batch_decode(out)
generated_text = decoded[0]
# Remove prompt from response if present
if request.Prompt in generated_text:
generated_text = generated_text.replace(request.Prompt, "")
return backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
def PredictStream(self, request, context):
"""
Generates text based on the given prompt and sampling parameters, and streams the results.
Args:
request: The predict stream request.
context: The gRPC context.
Returns:
backend_pb2.Result: The predict stream result.
"""
yield self.Predict(request, context)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)

View File

@@ -1,9 +0,0 @@
#!/bin/bash
set -e
LIMIT_TARGETS="cublas"
EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"
source $(dirname $0)/../common/libbackend.sh
installRequirements

View File

@@ -1,2 +0,0 @@
causal-conv1d==1.4.0
mamba-ssm==2.2.2

View File

@@ -1,2 +0,0 @@
torch==2.4.1
transformers

View File

@@ -1,3 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
transformers

View File

@@ -1,2 +0,0 @@
torch==2.4.1
transformers

View File

@@ -1,6 +0,0 @@
# mabma does not specify it's build dependencies per PEP517, so we need to disable build isolation
# this also means that we need to install the basic build dependencies into the venv ourselves
# https://github.com/Dao-AILab/causal-conv1d/issues/24
packaging
setuptools
wheel

View File

@@ -1,6 +0,0 @@
#!/bin/bash
LIMIT_TARGETS="cublas"
source $(dirname $0)/../common/libbackend.sh
startBackend $@

View File

@@ -1,76 +0,0 @@
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
import unittest
import subprocess
import time
import grpc
import backend_pb2_grpc
import backend_pb2
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service.
This class contains methods to test the startup and shutdown of the gRPC service.
"""
def setUp(self):
self.service = subprocess.Popen(["python", "backend.py", "--addr", "localhost:50051"])
time.sleep(10)
def tearDown(self) -> None:
self.service.terminate()
self.service.wait()
def test_server_startup(self):
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_text(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
req = backend_pb2.PredictOptions(Prompt="The capital of France is")
resp = stub.Predict(req)
self.assertIsNotNone(resp.message)
except Exception as err:
print(err)
self.fail("text service failed")
finally:
self.tearDown()

View File

@@ -1,158 +0,0 @@
#!/usr/bin/env python3
"""
Extra gRPC server for OpenVoice models.
"""
from concurrent import futures
import argparse
import signal
import sys
import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from melo.api import TTS
import time
import backend_pb2
import backend_pb2_grpc
import grpc
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
A gRPC servicer for the backend service.
This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
"""
def Health(self, request, context):
"""
A gRPC method that returns the health status of the backend service.
Args:
request: A HealthRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Reply object that contains the health status of the backend service.
"""
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
"""
A gRPC method that loads a model into memory.
Args:
request: A LoadModelRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Result object that contains the result of the LoadModel operation.
"""
model_name = request.Model
try:
self.clonedVoice = False
# Assume directory from request.ModelFile.
# Only if request.LoraAdapter it's not an absolute path
if request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
# get base path of modelFile
modelFileBase = os.path.dirname(request.ModelFile)
request.AudioPath = os.path.join(modelFileBase, request.AudioPath)
if request.AudioPath != "":
self.clonedVoice = True
self.modelpath = request.ModelFile
self.speaker = request.Type
self.ClonedVoicePath = request.AudioPath
ckpt_converter = request.Model+'/converter'
device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.device = device
self.tone_color_converter = None
if self.clonedVoice:
self.tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
self.tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def TTS(self, request, context):
model_name = request.model
if model_name == "":
return backend_pb2.Result(success=False, message="request.model is required")
try:
# Speed is adjustable
speed = 1.0
voice = "EN"
if request.voice:
voice = request.voice
model = TTS(language=voice, device=self.device)
speaker_ids = model.hps.data.spk2id
speaker_key = self.speaker
modelpath = self.modelpath
for s in speaker_ids.keys():
print(f"Speaker: {s} - ID: {speaker_ids[s]}")
speaker_id = speaker_ids[speaker_key]
speaker_key = speaker_key.lower().replace('_', '-')
source_se = torch.load(f'{modelpath}/base_speakers/ses/{speaker_key}.pth', map_location=self.device)
model.tts_to_file(request.text, speaker_id, request.dst, speed=speed)
if self.clonedVoice:
reference_speaker = self.ClonedVoicePath
target_se, audio_name = se_extractor.get_se(reference_speaker, self.tone_color_converter, vad=False)
# Run the tone color converter
encode_message = "@MyShell"
self.tone_color_converter.convert(
audio_src_path=request.dst,
src_se=source_se,
tgt_se=target_se,
output_path=request.dst,
message=encode_message)
print("[OpenVoice] TTS generated!", file=sys.stderr)
print("[OpenVoice] TTS saved to", request.dst, file=sys.stderr)
print(request, file=sys.stderr)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(success=True)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("[OpenVoice] Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("[OpenVoice] Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
print(f"[OpenVoice] startup: {args}", file=sys.stderr)
serve(args.addr)

View File

@@ -1,7 +0,0 @@
torch==2.4.1
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
whisper-timestamped
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2

View File

@@ -1,8 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
whisper-timestamped
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2

View File

@@ -1,7 +0,0 @@
torch==2.4.1
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
whisper-timestamped
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2

View File

@@ -1,8 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
whisper-timestamped
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2

View File

@@ -1,24 +0,0 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
grpcio==1.69.0
protobuf
librosa==0.9.1
faster-whisper==0.9.0
pydub==0.25.1
wavmark==0.0.3
eng_to_ipa==0.0.2
inflect==7.0.0
unidecode==1.3.7
whisper-timestamped==1.14.2
openai
python-dotenv
pypinyin==0.50.0
cn2an==0.5.22
jieba==0.42.1
langid==1.1.6
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git

View File

@@ -1,17 +0,0 @@
grpcio==1.69.0
protobuf
librosa
faster-whisper
inflect
unidecode
openai
python-dotenv
pypinyin
cn2an==0.5.22
numpy==1.22.0
networkx==2.8.8
jieba==0.42.1
gradio==5.9.1
langid==1.1.6
llvmlite==0.43.0
setuptools

View File

@@ -1,82 +0,0 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
time.sleep(30)
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.terminate()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="checkpoints_v2",
Type="en-us"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_tts(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="dingzhen"))
self.assertTrue(response.success)
tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story", voice="EN")
tts_response = stub.TTS(tts_request)
self.assertIsNotNone(tts_response)
except Exception as err:
print(err)
self.fail("TTS service failed")
finally:
self.tearDown()

View File

@@ -1,12 +0,0 @@
#!/bin/bash
set -e
source $(dirname $0)/../common/libbackend.sh
# Download checkpoints if not present
if [ ! -d "checkpoints_v2" ]; then
wget https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip -O checkpoints_v2.zip
unzip checkpoints_v2.zip
fi
runUnittests

View File

@@ -1,44 +0,0 @@
export CONDA_ENV_PATH = "parler.yml"
SKIP_CONDA?=0
ifeq ($(BUILD_TYPE), cublas)
export CONDA_ENV_PATH = "parler-nvidia.yml"
endif
# Intel GPU are supposed to have dependencies installed in the main python
# environment, so we skip conda installation for SYCL builds.
# https://github.com/intel/intel-extension-for-pytorch/issues/538
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export SKIP_CONDA=1
endif
.PHONY: parler-tts
parler-tts:
@echo "Installing $(CONDA_ENV_PATH)..."
bash install.sh $(CONDA_ENV_PATH)
$(MAKE) protogen
.PHONY: run
run: protogen
@echo "Running transformers..."
bash run.sh
@echo "transformers run."
.PHONY: test
test: protogen
@echo "Testing transformers..."
bash test.sh
@echo "transformers tested."
.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py
.PHONY: protogen-clean
protogen-clean:
$(RM) backend_pb2_grpc.py backend_pb2.py
backend_pb2_grpc.py backend_pb2.py:
bash protogen.sh
.PHONY: clean
clean: protogen-clean
$(RM) -r venv __pycache__

View File

@@ -1,125 +0,0 @@
#!/usr/bin/env python3
"""
Extra gRPC server for MusicgenForConditionalGeneration models.
"""
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import backend_pb2
import backend_pb2_grpc
import grpc
from scipy.io.wavfile import write as write_wav
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import torch
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
"""
A gRPC servicer for the backend service.
This class implements the gRPC methods for the backend service, including Health, LoadModel, and Embedding.
"""
def Health(self, request, context):
"""
A gRPC method that returns the health status of the backend service.
Args:
request: A HealthRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Reply object that contains the health status of the backend service.
"""
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
"""
A gRPC method that loads a model into memory.
Args:
request: A LoadModelRequest object that contains the request parameters.
context: A grpc.ServicerContext object that provides information about the RPC.
Returns:
A Result object that contains the result of the LoadModel operation.
"""
model_name = request.Model
device = "cuda:0" if torch.cuda.is_available() else "cpu"
try:
self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def TTS(self, request, context):
model_name = request.model
voice = request.voice
if voice == "":
voice = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
if model_name == "":
return backend_pb2.Result(success=False, message="request.model is required")
try:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
input_ids = self.tokenizer(voice, return_tensors="pt").input_ids.to(device)
prompt_input_ids = self.tokenizer(request.text, return_tensors="pt").input_ids.to(device)
generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio_arr = generation.cpu().numpy().squeeze()
print("[parler-tts] TTS generated!", file=sys.stderr)
sf.write(request.dst, audio_arr, self.model.config.sampling_rate)
print("[parler-tts] TTS saved to", request.dst, file=sys.stderr)
print("[parler-tts] TTS for", file=sys.stderr)
print(request, file=sys.stderr)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(success=True)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("[parler-tts] Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("[parler-tts] Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
print(f"[parler-tts] startup: {args}", file=sys.stderr)
serve(args.addr)

View File

@@ -1,28 +0,0 @@
#!/bin/bash
set -e
source $(dirname $0)/../common/libbackend.sh
# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
installRequirements
# https://github.com/descriptinc/audiotools/issues/101
# incompatible protobuf versions.
PYDIR=python3.10
pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
if [ ! -d ${pyenv} ]; then
echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
exit 1
fi
curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py

View File

@@ -1,4 +0,0 @@
git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17
llvmlite==0.43.0
numba==0.60.0
grpcio-tools==1.42.0

View File

@@ -1,3 +0,0 @@
transformers
accelerate
torch==2.4.1

View File

@@ -1,5 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
torchaudio==2.4.1+cu118
transformers
accelerate

View File

@@ -1,4 +0,0 @@
torch==2.4.1
torchaudio==2.4.1
transformers
accelerate

View File

@@ -1,5 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.3.0+rocm6.0
torchaudio==2.3.0+rocm6.0
transformers
accelerate

View File

@@ -1,4 +0,0 @@
grpcio==1.69.0
certifi
llvmlite==0.43.0
setuptools

View File

@@ -1,4 +0,0 @@
#!/bin/bash
source $(dirname $0)/../common/libbackend.sh
startBackend $@

View File

@@ -1,81 +0,0 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
"""
TestBackendServicer is the class that tests the gRPC service
"""
def setUp(self):
"""
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
time.sleep(10)
def tearDown(self) -> None:
"""
This method tears down the gRPC service by terminating the server
"""
self.service.terminate()
self.service.wait()
def test_server_startup(self):
"""
This method tests if the server starts up successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.Health(backend_pb2.HealthMessage())
self.assertEqual(response.message, b'OK')
except Exception as err:
print(err)
self.fail("Server failed to start")
finally:
self.tearDown()
def test_load_model(self):
"""
This method tests if the model is loaded successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
self.assertTrue(response.success)
self.assertEqual(response.message, "Model loaded successfully")
except Exception as err:
print(err)
self.fail("LoadModel service failed")
finally:
self.tearDown()
def test_tts(self):
"""
This method tests if the embeddings are generated successfully
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
self.assertTrue(response.success)
tts_request = backend_pb2.TTSRequest(text="Hey, how are you doing today?")
tts_response = stub.TTS(tts_request)
self.assertIsNotNone(tts_response)
except Exception as err:
print(err)
self.fail("TTS service failed")
finally:
self.tearDown()

View File

@@ -1,6 +0,0 @@
#!/bin/bash
set -e
source $(dirname $0)/../common/libbackend.sh
runUnittests

View File

@@ -21,7 +21,7 @@ import torch.cuda
XPU=os.environ.get("XPU", "0") == "1"
from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria, MambaConfig, MambaForCausalLM
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from scipy.io import wavfile
import outetts
@@ -245,6 +245,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
autoTokenizer = False
self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
self.SentenceTransformer = True
elif request.Type == "Mamba":
autoTokenizer = False
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = MambaForCausalLM.from_pretrained(model_name)
else:
print("Automodel", file=sys.stderr)
self.model = AutoModel.from_pretrained(model_name,

View File

@@ -515,7 +515,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
}
}
if (u & FLAG_IMAGE) == FLAG_IMAGE {
imageBackends := []string{"diffusers", "stablediffusion"}
imageBackends := []string{"diffusers", "stablediffusion", "stablediffusion-ggml"}
if !slices.Contains(imageBackends, c.Backend) {
return false
}

View File

@@ -48,5 +48,66 @@ var _ = Describe("Test cases for config related functions", func() {
// config should includes whisper-1 models's api.config
Expect(loadedModelNames).To(ContainElements("whisper-1"))
})
It("Test new loadconfig", func() {
bcl := NewBackendConfigLoader(os.Getenv("MODELS_PATH"))
err := bcl.LoadBackendConfigsFromPath(os.Getenv("MODELS_PATH"))
Expect(err).To(BeNil())
configs := bcl.GetAllBackendConfigs()
loadedModelNames := []string{}
for _, v := range configs {
loadedModelNames = append(loadedModelNames, v.Name)
}
Expect(configs).ToNot(BeNil())
totalModels := len(loadedModelNames)
Expect(loadedModelNames).To(ContainElements("code-search-ada-code-001"))
// config should includes text-embedding-ada-002 models's api.config
Expect(loadedModelNames).To(ContainElements("text-embedding-ada-002"))
// config should includes rwkv_test models's api.config
Expect(loadedModelNames).To(ContainElements("rwkv_test"))
// config should includes whisper-1 models's api.config
Expect(loadedModelNames).To(ContainElements("whisper-1"))
// create a temp directory and store a temporary model
tmpdir, err := os.MkdirTemp("", "test")
Expect(err).ToNot(HaveOccurred())
defer os.RemoveAll(tmpdir)
// create a temporary model
model := `name: "test-model"
description: "test model"
options:
- foo
- bar
- baz
`
modelFile := tmpdir + "/test-model.yaml"
err = os.WriteFile(modelFile, []byte(model), 0644)
Expect(err).ToNot(HaveOccurred())
err = bcl.LoadBackendConfigsFromPath(tmpdir)
Expect(err).ToNot(HaveOccurred())
configs = bcl.GetAllBackendConfigs()
Expect(len(configs)).ToNot(Equal(totalModels))
loadedModelNames = []string{}
var testModel BackendConfig
for _, v := range configs {
loadedModelNames = append(loadedModelNames, v.Name)
if v.Name == "test-model" {
testModel = v
}
}
Expect(loadedModelNames).To(ContainElements("test-model"))
Expect(testModel.Description).To(Equal("test model"))
Expect(testModel.Options).To(ContainElements("foo", "bar", "baz"))
})
})
})

View File

@@ -687,6 +687,10 @@ var _ = Describe("API test", func() {
Name: "model-gallery",
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/index.yaml",
},
{
Name: "localai",
URL: "https://raw.githubusercontent.com/mudler/LocalAI/refs/heads/master/gallery/index.yaml",
},
}
application, err := application.New(
@@ -764,10 +768,8 @@ var _ = Describe("API test", func() {
}
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
ID: "model-gallery@stablediffusion",
Overrides: map[string]interface{}{
"parameters": map[string]interface{}{"model": "stablediffusion_assets"},
},
ID: "localai@sd-1.5-ggml",
Name: "stablediffusion",
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -778,14 +780,14 @@ var _ = Describe("API test", func() {
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
fmt.Println(response)
return response["processed"].(bool)
}, "360s", "10s").Should(Equal(true))
}, "1200s", "10s").Should(Equal(true))
resp, err := http.Post(
"http://127.0.0.1:9090/v1/images/generations",
"application/json",
bytes.NewBuffer([]byte(`{
"prompt": "floating hair, portrait, ((loli)), ((one girl)), cute face, hidden hands, asymmetrical bangs, beautiful detailed eyes, eye shadow, hair ornament, ribbons, bowties, buttons, pleated skirt, (((masterpiece))), ((best quality)), colorful|((part of the head)), ((((mutated hands and fingers)))), deformed, blurry, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, poorly drawn hands, missing limb, blurry, floating limbs, disconnected limbs, malformed hands, blur, out of focus, long neck, long body, Octane renderer, lowres, bad anatomy, bad hands, text",
"mode": 2, "seed":9000,
"prompt": "a lovely cat",
"step": 1, "seed":9000,
"size": "256x256", "n":2}`)))
// The response should contain an URL
Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
@@ -794,6 +796,7 @@ var _ = Describe("API test", func() {
imgUrlResp := &schema.OpenAIResponse{}
err = json.Unmarshal(dat, imgUrlResp)
Expect(err).ToNot(HaveOccurred(), fmt.Sprint(dat))
Expect(imgUrlResp.Data).ToNot(Or(BeNil(), BeZero()))
imgUrl := imgUrlResp.Data[0].URL
Expect(imgUrl).To(ContainSubstring("http://127.0.0.1:9090/"), imgUrl)

View File

@@ -28,7 +28,7 @@ func BackendMonitorEndpoint(bm *services.BackendMonitorService) func(c *fiber.Ct
}
}
// BackendMonitorEndpoint shuts down the specified backend
// BackendShutdownEndpoint shuts down the specified backend
// @Summary Backend monitor endpoint
// @Param request body schema.BackendMonitorRequest true "Backend statistics request"
// @Router /backend/shutdown [post]

View File

@@ -72,7 +72,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
}
if m == "" {
m = model.StableDiffusionBackend
m = "stablediffusion"
}
log.Debug().Msgf("Loading model: %+v", m)
@@ -129,9 +129,9 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
switch config.Backend {
case "stablediffusion":
config.Backend = model.StableDiffusionBackend
config.Backend = model.StableDiffusionGGMLBackend
case "":
config.Backend = model.StableDiffusionBackend
config.Backend = model.StableDiffusionGGMLBackend
}
if !strings.Contains(input.Size, "x") {

View File

@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"strconv"
"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
@@ -296,6 +297,14 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
}
}
}
// If a quality was defined as number, convert it to step
if input.Quality != "" {
q, err := strconv.Atoi(input.Quality)
if err == nil {
config.Step = q
}
}
}
func mergeRequestWithConfig(modelFile string, input *schema.OpenAIRequest, cm *config.BackendConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.BackendConfig, *schema.OpenAIRequest, error) {

View File

@@ -191,8 +191,9 @@ type OpenAIRequest struct {
Stream bool `json:"stream"`
// Image (not supported by OpenAI)
Mode int `json:"mode"`
Step int `json:"step"`
Mode int `json:"mode"`
Quality string `json:"quality"`
Step int `json:"step"`
// A grammar to constrain the LLM output
Grammar string `json:"grammar" yaml:"grammar"`

View File

@@ -5219,6 +5219,23 @@
- filename: Dolphin3.0-Llama3.1-8B-Q4_K_M.gguf
sha256: 268390e07edd407ad93ea21a868b7ae995b5950e01cad0db9e1802ae5049d405
uri: huggingface://bartowski/Dolphin3.0-Llama3.1-8B-GGUF/Dolphin3.0-Llama3.1-8B-Q4_K_M.gguf
- !!merge <<: *llama31
name: "deepseek-r1-distill-llama-8b"
icon: "https://avatars.githubusercontent.com/u/148330874"
urls:
- https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B
- https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF
description: |
DeepSeek-R1 is our advanced first-generation reasoning model designed to enhance performance in reasoning tasks.
Building on the foundation laid by its predecessor, DeepSeek-R1-Zero, which was trained using large-scale reinforcement learning (RL) without supervised fine-tuning, DeepSeek-R1 addresses the challenges faced by R1-Zero, such as endless repetition, poor readability, and language mixing.
By incorporating cold-start data prior to the RL phase,DeepSeek-R1 significantly improves reasoning capabilities and achieves performance levels comparable to OpenAI-o1 across a variety of domains, including mathematics, coding, and complex reasoning tasks.
overrides:
parameters:
model: deepseek-r1-distill-llama-8b-Q4_K_M.gguf
files:
- filename: deepseek-r1-distill-llama-8b-Q4_K_M.gguf
sha256: f8eba201522ab44b79bc54166126bfaf836111ff4cbf2d13c59c3b57da10573b
uri: huggingface://unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf
- &deepseek ## Deepseek
url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
name: "deepseek-coder-v2-lite-instruct"
@@ -5284,6 +5301,86 @@
- filename: archangel_sft_pythia2-8b.Q4_K_M.gguf
sha256: a47782c55ef2b39b19644213720a599d9849511a73c9ebb0c1de749383c0a0f8
uri: huggingface://RichardErkhov/ContextualAI_-_archangel_sft_pythia2-8b-gguf/archangel_sft_pythia2-8b.Q4_K_M.gguf
- &deepseek-r1 ## Start DeepSeek-R1
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
name: "deepseek-r1-distill-qwen-1.5b"
icon: "https://avatars.githubusercontent.com/u/148330874"
urls:
- https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5b
- https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF
description: |
DeepSeek-R1 is our advanced first-generation reasoning model designed to enhance performance in reasoning tasks.
Building on the foundation laid by its predecessor, DeepSeek-R1-Zero, which was trained using large-scale reinforcement learning (RL) without supervised fine-tuning, DeepSeek-R1 addresses the challenges faced by R1-Zero, such as endless repetition, poor readability, and language mixing.
By incorporating cold-start data prior to the RL phase,DeepSeek-R1 significantly improves reasoning capabilities and achieves performance levels comparable to OpenAI-o1 across a variety of domains, including mathematics, coding, and complex reasoning tasks.
overrides:
parameters:
model: DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf
files:
- filename: DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf
sha256: 1741e5b2d062b07acf048bf0d2c514dadf2a48f94e2b4aa0cfe069af3838ee2f
uri: huggingface://bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "deepseek-r1-distill-qwen-7b"
urls:
- https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
- https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF
overrides:
parameters:
model: DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf
files:
- filename: DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf
sha256: 731ece8d06dc7eda6f6572997feb9ee1258db0784827e642909d9b565641937b
uri: huggingface://bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "deepseek-r1-distill-qwen-14b"
urls:
- https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
- https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-14B-GGUF
overrides:
parameters:
model: DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf
files:
- filename: DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf
sha256: 0b319bd0572f2730bfe11cc751defe82045fad5085b4e60591ac2cd2d9633181
uri: huggingface://bartowski/DeepSeek-R1-Distill-Qwen-14B-GGUF/DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "deepseek-r1-distill-qwen-32b"
urls:
- https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
- https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF
overrides:
parameters:
model: DeepSeek-R1-Distill-Qwen-32B-Q4_K_M.gguf
files:
- filename: DeepSeek-R1-Distill-Qwen-32B-Q4_K_M.gguf
sha256: bed9b0f551f5b95bf9da5888a48f0f87c37ad6b72519c4cbd775f54ac0b9fc62
uri: huggingface://bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF/DeepSeek-R1-Distill-Qwen-32B-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "deepseek-r1-distill-llama-8b"
icon: "https://avatars.githubusercontent.com/u/148330874"
urls:
- https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B
- https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF
overrides:
parameters:
model: DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf
files:
- filename: DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf
sha256: 87bcba20b4846d8dadf753d3ff48f9285d131fc95e3e0e7e934d4f20bc896f5d
uri: huggingface://bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "deepseek-r1-distill-llama-70b"
icon: "https://avatars.githubusercontent.com/u/148330874"
urls:
- https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B
- https://huggingface.co/bartowski/DeepSeek-R 1-Distill-Llama-70B-GGUF
overrides:
parameters:
model: DeepSeek-R1-Distill-Llama-70B-Q4_K_M.gguf
files:
- filename: DeepSeek-R1-Distill-Llama-70B-Q4_K_M.gguf
sha256: 181a82a1d6d2fa24fe4db83a68eee030384986bdbdd4773ba76424e3a6eb9fd8
uri: huggingface://bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF/DeepSeek-R1-Distill-Llama-70B-Q4_K_M.gguf
- &qwen2 ## Start QWEN2
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
name: "qwen2-7b-instruct"
@@ -5617,6 +5714,32 @@
- filename: marco-o1-uncensored.Q4_K_M.gguf
sha256: ad0440270a7254098f90779744d3e5b34fe49b7baf97c819909ba9c5648cc0d9
uri: huggingface://QuantFactory/marco-o1-uncensored-GGUF/marco-o1-uncensored.Q4_K_M.gguf
- !!merge <<: *qwen2
name: "minicpm-o-2_6"
icon: https://avatars.githubusercontent.com/u/89920203
urls:
- https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf
- https://huggingface.co/openbmb/MiniCPM-o-2_6
description: |
MiniCPM-o 2.6 is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip-400M, Whisper-medium-300M, ChatTTS-200M, and Qwen2.5-7B with a total of 8B parameters
tags:
- llm
- multimodal
- gguf
- gpu
- qwen2
- cpu
overrides:
mmproj: minicpm-o-2_6-mmproj-f16.gguf
parameters:
model: minicpm-o-2_6-Q4_K_M.gguf
files:
- filename: minicpm-o-2_6-Q4_K_M.gguf
sha256: 4f635fc0c0bb88d50ccd9cf1f1e5892b5cb085ff88fe0d8e1148fd9a8a836bc2
uri: huggingface://openbmb/MiniCPM-o-2_6-gguf/Model-7.6B-Q4_K_M.gguf
- filename: minicpm-o-2_6-mmproj-f16.gguf
sha256: efa4f7d96aa0f838f2023fc8d28e519179b16f1106777fa9280b32628191aa3e
uri: huggingface://openbmb/MiniCPM-o-2_6-gguf/mmproj-model-f16.gguf
- !!merge <<: *qwen2
name: "minicpm-v-2_6"
license: apache-2.0
@@ -11014,7 +11137,7 @@
uri: huggingface://Lykon/DreamShaper/DreamShaper_8_pruned.safetensors
sha256: 879db523c30d3b9017143d56705015e15a2cb5628762c11d086fed9538abd7fd
- name: stable-diffusion-3-medium
icon: https://huggingface.co/leo009/stable-diffusion-3-medium/resolve/main/sd3demo.jpg
icon: https://avatars.githubusercontent.com/u/100950301
license: other
description: |
Stable Diffusion 3 Medium is a Multimodal Diffusion Transformer (MMDiT) text-to-image model that features greatly improved performance in image quality, typography, complex prompt understanding, and resource-efficiency.
@@ -11028,6 +11151,63 @@
- sd-3
- gpu
url: "github:mudler/LocalAI/gallery/stablediffusion3.yaml@master"
- name: sd-1.5-ggml
icon: https://avatars.githubusercontent.com/u/37351293
license: creativeml-openrail-m
url: "github:mudler/LocalAI/gallery/sd-ggml.yaml@master"
description: |
Stable Diffusion 1.5
urls:
- https://huggingface.co/second-state/stable-diffusion-v1-5-GGUF
tags:
- text-to-image
- stablediffusion
- gpu
- cpu
overrides:
options:
- "sampler:euler"
parameters:
model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
files:
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
- name: sd-3.5-medium-ggml
license: stabilityai-ai-community
url: "github:mudler/LocalAI/gallery/sd-ggml.yaml@master"
description: |
Stable Diffusion 3.5 Medium is a Multimodal Diffusion Transformer (MMDiT) text-to-image model that features improved performance in image quality, typography, complex prompt understanding, and resource-efficiency.
urls:
- https://huggingface.co/stabilityai/stable-diffusion-3.5-medium
- https://huggingface.co/second-state/stable-diffusion-3.5-medium-GGUF
tags:
- text-to-image
- stablediffusion
- gpu
- cpu
icon: https://avatars.githubusercontent.com/u/100950301
overrides:
options:
- "clip_l_path:clip_l-Q4_0.gguf"
- "clip_g_path:clip_g-Q4_0.gguf"
- "t5xxl_path:t5xxl-Q4_0.gguf"
- "sampler:euler"
parameters:
model: sd3.5_medium-Q4_0.gguf
files:
- filename: "sd3.5_medium-Q4_0.gguf"
sha256: "3bb8c5e9ab0a841117089ed4ed81d885bb85161df2a766b812f829bc55b31adf"
uri: "huggingface://second-state/stable-diffusion-3.5-medium-GGUF/sd3.5_medium-Q4_0.gguf"
- filename: clip_g-Q4_0.gguf
sha256: c142411147e16b7c4b9cc1f5d977cbe596104435d76fde47172d3d35c5e58bb8
uri: huggingface://second-state/stable-diffusion-3.5-medium-GGUF/clip_g-Q4_0.gguf
- filename: clip_l-Q4_0.gguf
sha256: f5ad88ae2ac924eb4ac0298b77afa304b5e6014fc0c4128f0e3df40fdfcc0f8a
uri: huggingface://second-state/stable-diffusion-3.5-medium-GGUF/clip_l-Q4_0.gguf
- filename: t5xxl-Q4_0.gguf
sha256: 987ba47c158b890c274f78fd35324419f50941e846a49789f0977e9fe9d97ab7
uri: huggingface://second-state/stable-diffusion-3.5-medium-GGUF/t5xxl-Q4_0.gguf
- name: sd-3.5-large-ggml
license: stabilityai-ai-community
url: "github:mudler/LocalAI/gallery/sd-ggml.yaml@master"
@@ -11038,10 +11218,10 @@
- https://huggingface.co/second-state/stable-diffusion-3.5-large-GGUF
tags:
- text-to-image
- flux
- stablediffusion
- gpu
- cpu
icon: https://huggingface.co/stabilityai/stable-diffusion-3.5-large/media/main/sd3.5_large_demo.png
icon: https://avatars.githubusercontent.com/u/100950301
overrides:
parameters:
model: sd3.5_large-Q4_0.gguf
@@ -11060,6 +11240,7 @@
uri: huggingface://second-state/stable-diffusion-3.5-large-GGUF/t5xxl-Q5_0.gguf
- &flux
name: flux.1-dev
icon: https://avatars.githubusercontent.com/u/164064024
license: flux-1-dev-non-commercial-license
description: |
FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. For more information, please read our blog post.
@@ -11083,7 +11264,6 @@
- !!merge <<: *flux
name: flux.1-schnell
license: apache-2
icon: https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/schnell_grid.jpeg
description: |
FLUX.1 [schnell] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. For more information, please read our blog post.
Key Features
@@ -11116,7 +11296,6 @@
- flux
- gpu
- cpu
icon: https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/schnell_grid.jpeg
overrides:
parameters:
model: flux1-dev-Q2_K.gguf
@@ -11136,6 +11315,7 @@
- &whisper ## Whisper
url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master"
name: "whisper-1"
icon: https://avatars.githubusercontent.com/u/14957082
license: "MIT"
urls:
- https://github.com/ggerganov/whisper.cpp
@@ -11313,6 +11493,7 @@
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
name: stablediffusion-cpp
icon: https://avatars.githubusercontent.com/u/100950301
- &piper ## Piper TTS
url: github:mudler/LocalAI/gallery/piper.yaml@master
name: voice-en-us-kathleen-low
@@ -11893,6 +12074,7 @@
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-zh_CN-huayan-medium.tar.gz
sha256: 0299a5e7f481ba853404e9f0e1515a94d5409585d76963fa4d30c64bd630aa99
- name: "silero-vad"
icon: https://github.com/snakers4/silero-models/raw/master/files/silero_logo.jpg
url: github:mudler/LocalAI/gallery/virtual.yaml@master
urls:
- https://github.com/snakers4/silero-vad
@@ -11912,6 +12094,7 @@
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
- name: "bark-cpp-small"
icon: https://avatars.githubusercontent.com/u/99442120
url: github:mudler/LocalAI/gallery/virtual.yaml@master
license: mit
urls:

View File

@@ -21,14 +21,16 @@ import (
)
const (
HuggingFacePrefix = "huggingface://"
OCIPrefix = "oci://"
OllamaPrefix = "ollama://"
HTTPPrefix = "http://"
HTTPSPrefix = "https://"
GithubURI = "github:"
GithubURI2 = "github://"
LocalPrefix = "file://"
HuggingFacePrefix = "huggingface://"
HuggingFacePrefix1 = "hf://"
HuggingFacePrefix2 = "hf.co/"
OCIPrefix = "oci://"
OllamaPrefix = "ollama://"
HTTPPrefix = "http://"
HTTPSPrefix = "https://"
GithubURI = "github:"
GithubURI2 = "github://"
LocalPrefix = "file://"
)
type URI string
@@ -127,6 +129,8 @@ func (u URI) LooksLikeURL() bool {
return strings.HasPrefix(string(u), HTTPPrefix) ||
strings.HasPrefix(string(u), HTTPSPrefix) ||
strings.HasPrefix(string(u), HuggingFacePrefix) ||
strings.HasPrefix(string(u), HuggingFacePrefix1) ||
strings.HasPrefix(string(u), HuggingFacePrefix2) ||
strings.HasPrefix(string(u), GithubURI) ||
strings.HasPrefix(string(u), OllamaPrefix) ||
strings.HasPrefix(string(u), OCIPrefix) ||
@@ -170,8 +174,10 @@ func (s URI) ResolveURL() string {
projectPath := strings.Join(repoPath[2:], "/")
return fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath)
case strings.HasPrefix(string(s), HuggingFacePrefix):
case strings.HasPrefix(string(s), HuggingFacePrefix) || strings.HasPrefix(string(s), HuggingFacePrefix1) || strings.HasPrefix(string(s), HuggingFacePrefix2):
repository := strings.Replace(string(s), HuggingFacePrefix, "", 1)
repository = strings.Replace(repository, HuggingFacePrefix1, "", 1)
repository = strings.Replace(repository, HuggingFacePrefix2, "", 1)
// convert repository to a full URL.
// e.g. TheBloke/Mixtral-8x7B-v0.1-GGUF/mixtral-8x7b-v0.1.Q2_K.gguf@main -> https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q2_K.gguf
owner := strings.Split(repository, "/")[0]

View File

@@ -34,7 +34,7 @@ type Tool struct {
}
type Tools []Tool
// ToJSONNameStructure converts a list of functions to a JSON structure that can be parsed to a grammar
// ToJSONStructure converts a list of functions to a JSON structure that can be parsed to a grammar
// This allows the LLM to return a response of the type: { "name": "function_name", "arguments": { "arg1": "value1", "arg2": "value2" } }
func (f Functions) ToJSONStructure(name, args string) JSONFunctionStructure {
nameKey := defaultFunctionNameKey

View File

@@ -29,11 +29,14 @@ var Aliases map[string]string = map[string]string{
"langchain-huggingface": LCHuggingFaceBackend,
"transformers-musicgen": TransformersBackend,
"sentencetransformers": TransformersBackend,
"mamba": TransformersBackend,
"stablediffusion": StableDiffusionGGMLBackend,
}
var TypeAlias map[string]string = map[string]string{
"sentencetransformers": "SentenceTransformer",
"huggingface-embeddings": "SentenceTransformer",
"mamba": "Mamba",
"transformers-musicgen": "MusicgenForConditionalGeneration",
}
@@ -45,6 +48,7 @@ const (
LLamaCPP = "llama-cpp"
LLamaCPPAVX2 = "llama-cpp-avx2"
LLamaCPPAVX512 = "llama-cpp-avx512"
LLamaCPPAVX = "llama-cpp-avx"
LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda"
@@ -54,15 +58,27 @@ const (
LLamaCPPGRPC = "llama-cpp-grpc"
WhisperBackend = "whisper"
StableDiffusionBackend = "stablediffusion"
PiperBackend = "piper"
LCHuggingFaceBackend = "huggingface"
WhisperBackend = "whisper"
StableDiffusionGGMLBackend = "stablediffusion-ggml"
PiperBackend = "piper"
LCHuggingFaceBackend = "huggingface"
TransformersBackend = "transformers"
LocalStoreBackend = "local-store"
)
var llamaCPPVariants = []string{
LLamaCPPAVX2,
LLamaCPPAVX512,
LLamaCPPAVX,
LLamaCPPFallback,
LLamaCPPCUDA,
LLamaCPPHipblas,
LLamaCPPSycl16,
LLamaCPPSycl32,
LLamaCPPGRPC,
}
func backendPath(assetDir, backend string) string {
return filepath.Join(assetDir, "backend-assets", "grpc", backend)
}
@@ -104,40 +120,14 @@ ENTRY:
if AutoDetect {
// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
// when starting the service
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
foundVariants := map[string]bool{}
if _, ok := backends[LLamaCPP]; !ok {
for _, e := range entry {
if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2)
foundLCPPAVX2 = true
}
if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX)
foundLCPPAVX = true
}
if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
foundLCPPFallback = true
}
if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC)
foundLCPPGRPC = true
}
if strings.Contains(e.Name(), LLamaCPPCUDA) && !foundLCPPCuda {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA)
foundLCPPCuda = true
}
if strings.Contains(e.Name(), LLamaCPPHipblas) && !foundLCPPHipblas {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
foundLCPPHipblas = true
}
if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
foundSycl16 = true
}
if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
foundSycl32 = true
for _, v := range llamaCPPVariants {
if strings.Contains(e.Name(), v) && !foundVariants[v] {
backends[LLamaCPP] = append(backends[LLamaCPP], v)
foundVariants[v] = true
}
}
}
}
@@ -280,6 +270,12 @@ func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) str
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
selectedProcess = p
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX512F) {
p := backendPath(assetDir, LLamaCPPAVX512)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with AVX512 variant", backend)
selectedProcess = p
}
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
p := backendPath(assetDir, LLamaCPPAVX)
if _, err := os.Stat(p); err == nil {

View File

@@ -1,35 +0,0 @@
//go:build stablediffusion
// +build stablediffusion
package stablediffusion
import (
stableDiffusion "github.com/mudler/go-stable-diffusion"
)
func GenerateImage(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst, asset_dir string) error {
if height > 512 || width > 512 {
return stableDiffusion.GenerateImageUpscaled(
height,
width,
step,
seed,
positive_prompt,
negative_prompt,
dst,
asset_dir,
)
}
return stableDiffusion.GenerateImage(
height,
width,
mode,
step,
seed,
positive_prompt,
negative_prompt,
dst,
"",
asset_dir,
)
}

View File

@@ -1,10 +0,0 @@
//go:build !stablediffusion
// +build !stablediffusion
package stablediffusion
import "fmt"
func GenerateImage(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst, asset_dir string) error {
return fmt.Errorf("This version of LocalAI was built without the stablediffusion tag")
}

View File

@@ -1,20 +0,0 @@
package stablediffusion
import "os"
type StableDiffusion struct {
assetDir string
}
func New(assetDir string) (*StableDiffusion, error) {
if _, err := os.Stat(assetDir); err != nil {
return nil, err
}
return &StableDiffusion{
assetDir: assetDir,
}, nil
}
func (s *StableDiffusion) GenerateImage(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string) error {
return GenerateImage(height, width, mode, step, seed, positive_prompt, negative_prompt, dst, s.assetDir)
}

View File

@@ -1645,6 +1645,9 @@ const docTemplate = `{
"prompt": {
"description": "Prompt is read only by completion/image API calls"
},
"quality": {
"type": "string"
},
"repeat_last_n": {
"type": "integer"
},

View File

@@ -1638,6 +1638,9 @@
"prompt": {
"description": "Prompt is read only by completion/image API calls"
},
"quality": {
"type": "string"
},
"repeat_last_n": {
"type": "integer"
},

View File

@@ -570,6 +570,8 @@ definitions:
type: number
prompt:
description: Prompt is read only by completion/image API calls
quality:
type: string
repeat_last_n:
type: integer
repeat_penalty:

View File

@@ -54,7 +54,7 @@ var _ = BeforeSuite(func() {
Eventually(func() error {
_, err := client.ListModels(context.TODO())
return err
}, "20m").ShouldNot(HaveOccurred())
}, "50m").ShouldNot(HaveOccurred())
})
var _ = AfterSuite(func() {

View File

@@ -123,8 +123,9 @@ var _ = Describe("E2E test", func() {
It("correctly", func() {
resp, err := client.CreateImage(context.TODO(),
openai.ImageRequest{
Prompt: "test",
Size: openai.CreateImageSize512x512,
Prompt: "test",
Quality: "1",
Size: openai.CreateImageSize256x256,
},
)
Expect(err).ToNot(HaveOccurred())
@@ -135,7 +136,8 @@ var _ = Describe("E2E test", func() {
resp, err := client.CreateImage(context.TODO(),
openai.ImageRequest{
Prompt: "test",
Size: openai.CreateImageSize512x512,
Size: openai.CreateImageSize256x256,
Quality: "1",
ResponseFormat: openai.CreateImageResponseFormatURL,
},
)
@@ -147,7 +149,8 @@ var _ = Describe("E2E test", func() {
resp, err := client.CreateImage(context.TODO(),
openai.ImageRequest{
Prompt: "test",
Size: openai.CreateImageSize512x512,
Size: openai.CreateImageSize256x256,
Quality: "1",
ResponseFormat: openai.CreateImageResponseFormatB64JSON,
},
)

View File

@@ -4,6 +4,7 @@ import (
"context"
"embed"
"math"
"math/rand"
"os"
"path/filepath"
@@ -22,6 +23,19 @@ import (
//go:embed backend-assets/*
var backendAssets embed.FS
func normalize(vecs [][]float32) {
for i, k := range vecs {
norm := float64(0)
for _, x := range k {
norm += float64(x * x)
}
norm = math.Sqrt(norm)
for j, x := range k {
vecs[i][j] = x / float32(norm)
}
}
}
var _ = Describe("Integration tests for the stores backend(s) and internal APIs", Label("stores"), func() {
Context("Embedded Store get,set and delete", func() {
var sl *model.ModelLoader
@@ -192,17 +206,8 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
// set 3 vectors that are at varying angles to {0.5, 0.5, 0.5}
keys := [][]float32{{0.1, 0.3, 0.5}, {0.5, 0.5, 0.5}, {0.6, 0.6, -0.6}, {0.7, -0.7, -0.7}}
vals := [][]byte{[]byte("test0"), []byte("test1"), []byte("test2"), []byte("test3")}
// normalize the keys
for i, k := range keys {
norm := float64(0)
for _, x := range k {
norm += float64(x * x)
}
norm = math.Sqrt(norm)
for j, x := range k {
keys[i][j] = x / float32(norm)
}
}
normalize(keys)
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
@@ -225,5 +230,121 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
Expect(ks[1]).To(Equal(keys[1]))
Expect(vals[1]).To(Equal(vals[1]))
})
It("It produces the correct cosine similarities for orthogonal and opposite unit vectors", func() {
keys := [][]float32{{1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}, {-1.0, 0.0, 0.0}}
vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")}
err := store.SetCols(context.Background(), sc, keys, vals);
Expect(err).ToNot(HaveOccurred())
_, _, sims, err := store.Find(context.Background(), sc, keys[0], 4)
Expect(err).ToNot(HaveOccurred())
Expect(sims).To(Equal([]float32{1.0, 0.0, 0.0, -1.0}))
})
It("It produces the correct cosine similarities for orthogonal and opposite vectors", func() {
keys := [][]float32{{1.0, 0.0, 1.0}, {0.0, 2.0, 0.0}, {0.0, 0.0, -1.0}, {-1.0, 0.0, -1.0}}
vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")}
err := store.SetCols(context.Background(), sc, keys, vals);
Expect(err).ToNot(HaveOccurred())
_, _, sims, err := store.Find(context.Background(), sc, keys[0], 4)
Expect(err).ToNot(HaveOccurred())
Expect(sims[0]).To(BeNumerically("~", 1, 0.1))
Expect(sims[1]).To(BeNumerically("~", 0, 0.1))
Expect(sims[2]).To(BeNumerically("~", -0.7, 0.1))
Expect(sims[3]).To(BeNumerically("~", -1, 0.1))
})
expectTriangleEq := func(keys [][]float32, vals [][]byte) {
sims := map[string]map[string]float32{}
// compare every key vector pair and store the similarities in a lookup table
// that uses the values as keys
for i, k := range keys {
_, valsk, simsk, err := store.Find(context.Background(), sc, k, 9)
Expect(err).ToNot(HaveOccurred())
for j, v := range valsk {
p := string(vals[i])
q := string(v)
if sims[p] == nil {
sims[p] = map[string]float32{}
}
//log.Debug().Strs("vals", []string{p, q}).Float32("similarity", simsk[j]).Send()
sims[p][q] = simsk[j]
}
}
// Check that the triangle inequality holds for every combination of the triplet
// u, v and w
for _, simsu := range sims {
for w, simw := range simsu {
// acos(u,w) <= ...
uws := math.Acos(float64(simw))
// ... acos(u,v) + acos(v,w)
for v, _ := range simsu {
uvws := math.Acos(float64(simsu[v])) + math.Acos(float64(sims[v][w]))
//log.Debug().Str("u", u).Str("v", v).Str("w", w).Send()
//log.Debug().Float32("uw", simw).Float32("uv", simsu[v]).Float32("vw", sims[v][w]).Send()
Expect(uws).To(BeNumerically("<=", uvws))
}
}
}
}
It("It obeys the triangle inequality for normalized values", func() {
keys := [][]float32{
{1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0},
{-1.0, 0.0, 0.0}, {0.0, -1.0, 0.0}, {0.0, 0.0, -1.0},
{2.0, 3.0, 4.0}, {9.0, 7.0, 1.0}, {0.0, -1.2, 2.3},
}
vals := [][]byte{
[]byte("x"), []byte("y"), []byte("z"),
[]byte("-x"), []byte("-y"), []byte("-z"),
[]byte("u"), []byte("v"), []byte("w"),
}
normalize(keys[6:])
err := store.SetCols(context.Background(), sc, keys, vals);
Expect(err).ToNot(HaveOccurred())
expectTriangleEq(keys, vals)
})
It("It obeys the triangle inequality", func() {
rnd := rand.New(rand.NewSource(151))
keys := make([][]float32, 20)
vals := make([][]byte, 20)
for i := range keys {
k := make([]float32, 768)
for j := range k {
k[j] = rnd.Float32()
}
keys[i] = k
}
c := byte('a')
for i := range vals {
vals[i] = []byte{c}
c += 1
}
err := store.SetCols(context.Background(), sc, keys, vals);
Expect(err).ToNot(HaveOccurred())
expectTriangleEq(keys, vals)
})
})
})