Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
184fbc26bf Revert "feat: include tokens usage for streamed output (#4282)"
This reverts commit 0d6c3a7d57.
2024-12-08 16:31:48 +01:00
433 changed files with 9134 additions and 256611 deletions

View File

@@ -0,0 +1,23 @@
meta {
name: musicgen
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/sound-generation
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model_id": "facebook/musicgen-small",
"text": "Exciting 80s Newscast Interstitial",
"duration_seconds": 8
}
}

View File

@@ -0,0 +1,17 @@
meta {
name: backend monitor
type: http
seq: 4
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
body: json
auth: none
}
body:json {
{
"model": "{{DEFAULT_MODEL}}"
}
}

View File

@@ -0,0 +1,21 @@
meta {
name: backend-shutdown
type: http
seq: 3
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}"
}
}

View File

@@ -0,0 +1,5 @@
{
"version": "1",
"name": "LocalAI Test Requests",
"type": "collection"
}

View File

@@ -0,0 +1,6 @@
vars {
HOST: localhost
PORT: 8080
DEFAULT_MODEL: gpt-3.5-turbo
PROTOCOL: http://
}

View File

@@ -0,0 +1,11 @@
meta {
name: get models list
type: http
seq: 2
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
body: none
auth: none
}

View File

@@ -0,0 +1,25 @@
meta {
name: Generate image
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"prompt": "<positive prompt>|<negative prompt>",
"model": "model-name",
"step": 51,
"size": "1024x1024",
"image": ""
}
}

View File

@@ -0,0 +1,24 @@
meta {
name: -completions
type: http
seq: 4
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"prompt": "function downloadFile(string url, string outputPath) {",
"max_tokens": 256,
"temperature": 0.5
}
}

View File

@@ -0,0 +1,23 @@
meta {
name: -edits
type: http
seq: 5
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "What day of the wek is it?",
"instruction": "Fix the spelling mistakes"
}
}

View File

@@ -0,0 +1,22 @@
meta {
name: -embeddings
type: http
seq: 6
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
}
}

View File

@@ -0,0 +1,30 @@
meta {
name: chat completion -simple- 1 message-
type: http
seq: 4
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [
{
"role": "user",
"content": "How could one use friction to cook an egg?"
}
],
"max_tokens": 256,
"temperature": 0.2,
"grammar": ""
}
}

View File

@@ -0,0 +1,29 @@
meta {
name: chat-completions -long-
type: http
seq: 5
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
{"role": "user", "content": "How could one use electricity to cook an egg?"},
{"role": "assistant",
"content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
},
{"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
"max_tokens": 1024,
"temperature": 0.5
}
}

View File

@@ -0,0 +1,25 @@
meta {
name: chat-completions -stream-
type: http
seq: 6
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
"max_tokens": 256,
"temperature": 0.9,
"stream": true
}
}

View File

@@ -0,0 +1,22 @@
meta {
name: add model gallery
type: http
seq: 10
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
"name": "test"
}
}

View File

@@ -0,0 +1,21 @@
meta {
name: delete model gallery
type: http
seq: 11
}
delete {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"name": "test"
}
}

View File

@@ -0,0 +1,11 @@
meta {
name: list MODELS in galleries
type: http
seq: 7
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
body: none
auth: none
}

View File

@@ -0,0 +1,11 @@
meta {
name: list model GALLERIES
type: http
seq: 8
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: none
auth: none
}

View File

@@ -0,0 +1,11 @@
meta {
name: model delete
type: http
seq: 7
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: none
auth: none
}

View File

@@ -0,0 +1,21 @@
meta {
name: model gallery apply -gist-
type: http
seq: 12
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
}
}

View File

@@ -0,0 +1,22 @@
meta {
name: model gallery apply
type: http
seq: 9
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
"name": "codellama7b"
}
}

View File

Binary file not shown.

View File

@@ -0,0 +1,16 @@
meta {
name: transcribe
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
body: multipartForm
auth: none
}
body:multipart-form {
file: @file(transcription/gb1.ogg)
model: whisper-1
}

View File

@@ -0,0 +1,22 @@
meta {
name: -tts
type: http
seq: 2
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
}
}

View File

@@ -0,0 +1,23 @@
meta {
name: musicgen
type: http
seq: 2
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"backend": "transformers-musicgen",
"model": "facebook/musicgen-small",
"input": "80s Synths playing Jazz"
}
}

View File

@@ -7,7 +7,7 @@ services:
args:
- FFMPEG=true
- IMAGE_TYPE=extras
- GO_TAGS=p2p tts
- GO_TAGS=stablediffusion p2p tts
env_file:
- ../.env
ports:

18
.env
View File

@@ -29,9 +29,6 @@
## Enable/Disable single backend (useful if only one GPU is available)
# LOCALAI_SINGLE_ACTIVE_BACKEND=true
# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -41,12 +38,12 @@
## Uncomment and set to true to enable rebuilding from source
# REBUILD=true
## Enable go tags, available: p2p, tts
## p2p: enable distributed inferencing
## Enable go tags, available: stablediffusion, tts
## stablediffusion: image generation with stablediffusion
## tts: enables text-to-speech with go-piper
## (requires REBUILD=true)
#
# GO_TAGS=p2p
# GO_TAGS=stablediffusion
## Path where to store generated images
# LOCALAI_IMAGE_PATH=/tmp/generated/images
@@ -85,15 +82,6 @@
# Enable to allow p2p mode
# LOCALAI_P2P=true
# Enable to use federated mode
# LOCALAI_FEDERATED=true
# Enable to start federation server
# FEDERATED_SERVER=true
# Define to use federation token
# TOKEN=""
### Watchdog settings
###
# Enables watchdog to kill backends that are inactive for too much time

View File

@@ -81,6 +81,14 @@ updates:
directory: "/backend/python/transformers"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/transformers-musicgen"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/vall-e-x"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/vllm"
schedule:

6
.github/labeler.yml vendored
View File

@@ -1,14 +1,10 @@
enhancement:
enhancements:
- head-branch: ['^feature', 'feature']
dependencies:
- any:
- changed-files:
- any-glob-to-any-file: 'Makefile'
- changed-files:
- any-glob-to-any-file: '*.mod'
- changed-files:
- any-glob-to-any-file: '*.sum'
kind/documentation:
- any:

View File

@@ -9,7 +9,7 @@ jobs:
fail-fast: false
matrix:
include:
- repository: "ggml-org/llama.cpp"
- repository: "ggerganov/llama.cpp"
variable: "CPPLLAMA_VERSION"
branch: "master"
- repository: "ggerganov/whisper.cpp"

View File

@@ -14,7 +14,7 @@ jobs:
steps:
- name: Dependabot metadata
id: metadata
uses: dependabot/fetch-metadata@v2.3.0
uses: dependabot/fetch-metadata@v2.2.0
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"
skip-commit-verification: true

View File

@@ -33,7 +33,7 @@ jobs:
run: |
CGO_ENABLED=0 make build-api
- name: rm
uses: appleboy/ssh-action@v1.2.2
uses: appleboy/ssh-action@v1.2.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
rm: true
target: ./local-ai
- name: restarting
uses: appleboy/ssh-action@v1.2.2
uses: appleboy/ssh-action@v1.2.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}

View File

@@ -2,10 +2,9 @@ name: 'generate and publish GRPC docker caches'
on:
workflow_dispatch:
schedule:
# daily at midnight
- cron: '0 0 * * *'
push:
branches:
- master
concurrency:
group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -17,7 +16,7 @@ jobs:
matrix:
include:
- grpc-base-image: ubuntu:22.04
runs-on: 'arc-runner-set'
runs-on: 'ubuntu-latest'
platforms: 'linux/amd64,linux/arm64'
runs-on: ${{matrix.runs-on}}
steps:

View File

@@ -15,7 +15,7 @@ jobs:
strategy:
matrix:
include:
- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
runs-on: 'ubuntu-latest'
platforms: 'linux/amd64'
runs-on: ${{matrix.runs-on}}

View File

@@ -280,7 +280,6 @@ jobs:
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
skip-drivers: ${{ matrix.skip-drivers }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -302,7 +301,6 @@ jobs:
latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -314,7 +312,6 @@ jobs:
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -326,7 +323,6 @@ jobs:
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
@@ -338,7 +334,6 @@ jobs:
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -349,7 +344,6 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'vulkan'
platforms: 'linux/amd64'
@@ -360,45 +354,4 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
gh-runner:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
skip-drivers: ${{ matrix.skip-drivers }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'false'
tag-suffix: '-nvidia-l4t-arm64-core'
latest-image: 'latest-nvidia-l4t-arm64-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'true'

View File

@@ -49,10 +49,6 @@ on:
description: 'FFMPEG'
default: ''
type: string
skip-drivers:
description: 'Skip drivers by default'
default: 'false'
type: string
image-type:
description: 'Image type'
default: ''
@@ -238,7 +234,6 @@ jobs:
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.65.0
MAKEFLAGS=${{ inputs.makeflags }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
context: .
file: ./Dockerfile
cache-from: type=gha
@@ -267,7 +262,6 @@ jobs:
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.65.0
MAKEFLAGS=${{ inputs.makeflags }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
context: .
file: ./Dockerfile
cache-from: type=gha
@@ -310,11 +304,6 @@ jobs:
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
- name: Cleanup
run: |
docker builder prune -f
docker system prune --force --volumes --all
- name: Latest tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'

View File

@@ -18,7 +18,7 @@ jobs:
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.0
- uses: GrantBirki/git-diff-action@v2.7.0
id: git-diff-action
with:
json_diff_file_output: diff.json
@@ -99,7 +99,7 @@ jobs:
docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.0
- uses: GrantBirki/git-diff-action@v2.7.0
id: git-diff-action
with:
json_diff_file_output: diff.json

View File

@@ -237,7 +237,40 @@ jobs:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
build-stablediffusion:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Build stablediffusion
run: |
export PATH=$PATH:$GOPATH/bin
make backend-assets/grpc/stablediffusion
mkdir -p release && cp backend-assets/grpc/stablediffusion release
env:
GO_TAGS: stablediffusion
- uses: actions/upload-artifact@v4
with:
name: stablediffusion
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
build-macOS-x86_64:
runs-on: macos-13

View File

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.22.3
uses: securego/gosec@v2.21.4
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'

View File

@@ -35,6 +35,30 @@ jobs:
run: |
make --jobs=5 --output-sync=target -C backend/python/transformers
make --jobs=5 --output-sync=target -C backend/python/transformers test
tests-sentencetransformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test sentencetransformers
run: |
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
tests-rerankers:
runs-on: ubuntu-latest
steps:
@@ -78,27 +102,78 @@ jobs:
make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test
# tests-transformers-musicgen:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# # Install UV
# curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1
tests-parler-tts:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
# - name: Test transformers-musicgen
# run: |
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
- name: Test parler-tts
run: |
make --jobs=5 --output-sync=target -C backend/python/parler-tts
make --jobs=5 --output-sync=target -C backend/python/parler-tts test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
tests-openvoice:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test openvoice
run: |
make --jobs=5 --output-sync=target -C backend/python/openvoice
make --jobs=5 --output-sync=target -C backend/python/openvoice test
tests-transformers-musicgen:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test transformers-musicgen
run: |
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
# tests-bark:
# runs-on: ubuntu-latest
@@ -185,6 +260,26 @@ jobs:
# run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
tests-vallex:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test vall-e-x
run: |
make --jobs=5 --output-sync=target -C backend/python/vall-e-x
make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
tests-coqui:
runs-on: ubuntu-latest

View File

@@ -100,12 +100,15 @@ jobs:
# The python3-grpc-tools package in 22.04 is too old
pip install --user grpcio-tools
make -C backend/python/transformers
sudo rm -rfv /usr/bin/conda || true
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
# Pre-build piper before we start tests in order to have shared libraries in place
make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
env:
CUDA_VERSION: 12-4
- name: Cache grpc
@@ -127,7 +130,7 @@ jobs:
cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test
run: |
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19

2
.vscode/launch.json vendored
View File

@@ -26,7 +26,7 @@
"LOCALAI_P2P": "true",
"LOCALAI_FEDERATED": "true"
},
"buildFlags": ["-tags", "p2p tts", "-v"],
"buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
"envFile": "${workspaceFolder}/.env",
"cwd": "${workspaceRoot}"
}

View File

@@ -15,7 +15,8 @@ ARG TARGETARCH
ARG TARGETVARIANT
ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
@@ -24,7 +25,6 @@ RUN apt-get update && \
ca-certificates \
curl libssl-dev \
git \
git-lfs \
unzip upx-ucl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
@@ -69,10 +69,14 @@ ENV PATH=/opt/rocm/bin:${PATH}
# OpenBLAS requirements and stable diffusion
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libopenblas-dev && \
libopenblas-dev \
libopencv-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set up OpenCV
RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
WORKDIR /build
###################################
@@ -111,13 +115,12 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=0
ARG SKIP_DRIVERS=false
ENV BUILD_TYPE=${BUILD_TYPE}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
if [ "${BUILD_TYPE}" = "vulkan" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
@@ -133,7 +136,7 @@ EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
if [ "${BUILD_TYPE}" = "cublas" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
@@ -159,7 +162,7 @@ RUN <<EOT bash
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
@@ -167,7 +170,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
@@ -247,7 +250,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
FROM requirements-drivers AS builder-base
ARG GO_TAGS="tts p2p"
ARG GO_TAGS="stablediffusion tts p2p"
ARG GRPC_BACKENDS
ARG MAKEFLAGS
ARG LD_FLAGS="-s -w"
@@ -281,12 +284,35 @@ RUN <<EOT bash
fi
EOT
###################################
###################################
# This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
# In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
FROM builder-base AS builder-sd
# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
COPY Makefile .
COPY go.mod .
COPY go.sum .
COPY backend/backend.proto ./backend/backend.proto
COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
COPY pkg/grpc ./pkg/grpc
COPY pkg/stablediffusion ./pkg/stablediffusion
RUN git init
RUN make sources/go-stable-diffusion
RUN touch prepare-sources
# Actually build the backend
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
###################################
###################################
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
# Adjustments to the build process should likely be made here.
FROM builder-base AS builder
FROM builder-sd AS builder
# Install the pre-built GRPC
COPY --from=grpc /opt/grpc /usr/local
@@ -304,7 +330,7 @@ RUN make prepare
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
## (both will use CUDA or hipblas for the actual computation)
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \
make build; \
fi
@@ -326,6 +352,8 @@ ARG FFMPEG
COPY --from=grpc /opt/grpc /usr/local
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
COPY .devcontainer-scripts /.devcontainer-scripts
# Add FFmpeg
@@ -398,28 +426,36 @@ COPY --from=builder /build/local-ai ./
# Copy shared libraries for piper
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
# do not let stablediffusion rebuild (requires an older version of absl)
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
# Change the shell to bash so we can use [[ tests below
SHELL ["/bin/bash", "-c"]
# We try to strike a balance between individual layer size (as that affects total push time) and total image size
# Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
# Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
apt-get -qq -y install espeak-ng \
; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/coqui \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/faster-whisper \
if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/parler-tts \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/diffusers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/transformers-musicgen \
; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/kokoro \
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vall-e-x \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/openvoice \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/sentencetransformers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama2 \
@@ -439,6 +475,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/rerankers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/mamba \
; fi
# Make sure the models directory exists

View File

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

215
Makefile
View File

@@ -6,7 +6,9 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
CPPLLAMA_VERSION?=d6d2c2ab8c8865784ba9fef37f2b2de3f2134d33
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=cc98896db858df7aa40d0e16a505883ef196a482
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -16,13 +18,21 @@ WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
PIPER_REPO?=https://github.com/mudler/go-piper
PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
# stablediffusion version
STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
# tinydream version
TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
# bark.cpp
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=v1.0.0
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=4570715727f35e5a07a76796d823824c8f42206c
ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
@@ -149,6 +159,7 @@ ifeq ($(BUILD_TYPE),hipblas)
LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# llama-ggml has no hipblas support, so override it here.
export STABLE_BUILD_TYPE=
export GGML_HIP=1
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -172,6 +183,16 @@ ifeq ($(STATIC),true)
LD_FLAGS+=-linkmode external -extldflags -static
endif
ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
# OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
endif
ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
# OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
OPTIONAL_GRPC+=backend-assets/grpc/tinydream
endif
ifeq ($(findstring tts,$(GO_TAGS)),tts)
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
@@ -183,8 +204,8 @@ endif
ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -218,6 +239,19 @@ endif
all: help
## go-llama.cpp
sources/go-llama.cpp:
mkdir -p sources/go-llama.cpp
cd sources/go-llama.cpp && \
git init && \
git remote add origin $(GOLLAMA_REPO) && \
git fetch origin && \
git checkout $(GOLLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -248,6 +282,19 @@ sources/go-piper:
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
## stable diffusion (onnx)
sources/go-stable-diffusion:
mkdir -p sources/go-stable-diffusion
cd sources/go-stable-diffusion && \
git init && \
git remote add origin $(STABLEDIFFUSION_REPO) && \
git fetch origin && \
git checkout $(STABLEDIFFUSION_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
## stablediffusion (ggml)
sources/stablediffusion-ggml.cpp:
git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
@@ -255,12 +302,22 @@ sources/stablediffusion-ggml.cpp:
git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a: sources/stablediffusion-ggml.cpp
cd sources/stablediffusion-ggml.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion-ggml
endif
sources/onnxruntime:
mkdir -p sources/onnxruntime
@@ -276,6 +333,19 @@ else
mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
endif
## tiny-dream
sources/go-tiny-dream:
mkdir -p sources/go-tiny-dream
cd sources/go-tiny-dream && \
git init && \
git remote add origin $(TINYDREAM_REPO) && \
git fetch origin && \
git checkout $(TINYDREAM_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
$(MAKE) -C sources/go-tiny-dream libtinydream.a
## whisper
sources/whisper.cpp:
mkdir -p sources/whisper.cpp
@@ -289,17 +359,23 @@ sources/whisper.cpp:
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace
$(GOCMD) mod download
@@ -307,8 +383,11 @@ prepare-sources: get-sources replace
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-stable-diffusion clean
$(MAKE) -C sources/go-piper clean
$(MAKE) -C sources/go-tiny-dream clean
$(MAKE) build
prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -410,7 +489,7 @@ run: prepare ## run local-ai
test-models/testmodel.ggml:
mkdir test-models
mkdir test-dir
wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -422,10 +501,11 @@ prepare-test: grpcs
test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests'
export GO_TAGS="tts debug"
export GO_TAGS="tts stablediffusion debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama
$(MAKE) test-llama-gguf
$(MAKE) test-tts
$(MAKE) test-stablediffusion
@@ -454,6 +534,10 @@ teardown-e2e:
rm -rf $(TEST_DIR) || true
docker stop $$(docker ps -q --filter ancestor=localai-tests)
test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -505,10 +589,10 @@ protogen-go-clean:
$(RM) bin/*
.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
.PHONY: autogptq-protogen
autogptq-protogen:
@@ -542,14 +626,6 @@ diffusers-protogen:
diffusers-protogen-clean:
$(MAKE) -C backend/python/diffusers protogen-clean
.PHONY: faster-whisper-protogen
faster-whisper-protogen:
$(MAKE) -C backend/python/faster-whisper protogen
.PHONY: faster-whisper-protogen-clean
faster-whisper-protogen-clean:
$(MAKE) -C backend/python/faster-whisper protogen-clean
.PHONY: exllama2-protogen
exllama2-protogen:
$(MAKE) -C backend/python/exllama2 protogen
@@ -558,6 +634,14 @@ exllama2-protogen:
exllama2-protogen-clean:
$(MAKE) -C backend/python/exllama2 protogen-clean
.PHONY: mamba-protogen
mamba-protogen:
$(MAKE) -C backend/python/mamba protogen
.PHONY: mamba-protogen-clean
mamba-protogen-clean:
$(MAKE) -C backend/python/mamba protogen-clean
.PHONY: rerankers-protogen
rerankers-protogen:
$(MAKE) -C backend/python/rerankers protogen
@@ -566,6 +650,14 @@ rerankers-protogen:
rerankers-protogen-clean:
$(MAKE) -C backend/python/rerankers protogen-clean
.PHONY: sentencetransformers-protogen
sentencetransformers-protogen:
$(MAKE) -C backend/python/sentencetransformers protogen
.PHONY: sentencetransformers-protogen-clean
sentencetransformers-protogen-clean:
$(MAKE) -C backend/python/sentencetransformers protogen-clean
.PHONY: transformers-protogen
transformers-protogen:
$(MAKE) -C backend/python/transformers protogen
@@ -574,13 +666,37 @@ transformers-protogen:
transformers-protogen-clean:
$(MAKE) -C backend/python/transformers protogen-clean
.PHONY: kokoro-protogen
kokoro-protogen:
$(MAKE) -C backend/python/kokoro protogen
.PHONY: parler-tts-protogen
parler-tts-protogen:
$(MAKE) -C backend/python/parler-tts protogen
.PHONY: kokoro-protogen-clean
kokoro-protogen-clean:
$(MAKE) -C backend/python/kokoro protogen-clean
.PHONY: parler-tts-protogen-clean
parler-tts-protogen-clean:
$(MAKE) -C backend/python/parler-tts protogen-clean
.PHONY: transformers-musicgen-protogen
transformers-musicgen-protogen:
$(MAKE) -C backend/python/transformers-musicgen protogen
.PHONY: transformers-musicgen-protogen-clean
transformers-musicgen-protogen-clean:
$(MAKE) -C backend/python/transformers-musicgen protogen-clean
.PHONY: vall-e-x-protogen
vall-e-x-protogen:
$(MAKE) -C backend/python/vall-e-x protogen
.PHONY: vall-e-x-protogen-clean
vall-e-x-protogen-clean:
$(MAKE) -C backend/python/vall-e-x protogen-clean
.PHONY: openvoice-protogen
openvoice-protogen:
$(MAKE) -C backend/python/openvoice protogen
.PHONY: openvoice-protogen-clean
openvoice-protogen-clean:
$(MAKE) -C backend/python/openvoice protogen-clean
.PHONY: vllm-protogen
vllm-protogen:
@@ -597,11 +713,15 @@ prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/coqui
$(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/faster-whisper
$(MAKE) -C backend/python/vllm
$(MAKE) -C backend/python/mamba
$(MAKE) -C backend/python/sentencetransformers
$(MAKE) -C backend/python/rerankers
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/kokoro
$(MAKE) -C backend/python/transformers-musicgen
$(MAKE) -C backend/python/parler-tts
$(MAKE) -C backend/python/vall-e-x
$(MAKE) -C backend/python/openvoice
$(MAKE) -C backend/python/exllama2
prepare-test-extra: protogen-python
@@ -671,13 +791,6 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx512
$(MAKE) -C backend/cpp/llama-avx512 purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx
$(MAKE) -C backend/cpp/llama-avx purge
@@ -731,6 +844,13 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/llama-ggml
endif
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
@@ -745,6 +865,13 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/piper
endif
backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion
endif
backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
@@ -752,6 +879,13 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/silero-vad
endif
backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/tinydream
endif
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
@@ -805,8 +939,7 @@ docker-aio-all:
docker-image-intel:
docker build \
--progress plain \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -814,7 +947,7 @@ docker-image-intel:
docker-image-intel-xpu:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -826,7 +959,7 @@ swagger:
.PHONY: gen-assets
gen-assets:
$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
## Documentation
docs/layouts/_default:

116
README.md
View File

@@ -1,6 +1,7 @@
<h1 align="center">
<br>
<img height="300" src="./core/http/static/logo.png"> <br>
<img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
LocalAI
<br>
</h1>
@@ -38,7 +39,7 @@
</p>
<p align="center">
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
<a href="https://trendshift.io/repositories/1484" target="_blank"><img src="https://trendshift.io/api/badge/repositories/1484" alt="go-skynet%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
@@ -47,58 +48,9 @@
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API thats compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
## 📚🆕 Local Stack Family
🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
<table>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalAGI">
<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
</td>
</tr>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalRecall">
<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
</td>
</tr>
</table>
## Screenshots
| Talk Interface | Generate Audio |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
| Models Overview | Generate Images |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
| Chat Interface | Home |
| --- | --- |
| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
| Login | Swarm |
| --- | --- |
|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
## 💻 Quickstart
![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
Run the installer script:
@@ -107,21 +59,17 @@ curl https://localai.io/install.sh | sh
```
Or run with docker:
### CPU only image:
```bash
# CPU only image:
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
```
### Nvidia GPU:
```bash
# Nvidia GPU:
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
```
### CPU and GPU image (bigger size):
```bash
# CPU and GPU image (bigger size):
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
```
### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
```bash
# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
```
@@ -140,32 +88,48 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
local-ai run oci://localai/phi-2:latest
```
For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
[💻 Getting started](https://localai.io/basics/getting_started/index.html)
## 📰 Latest project news
- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
- June 2024: 🆕 You can browse now the model gallery without LocalAI! Check out https://models.localai.io
- June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
- May 2024: 🔥🔥 Decentralized P2P llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
- May 2024: 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
- May 2024: 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
- May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
## 🔥🔥 Hot topics (looking for help):
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation](https://localai.io/features/image-generation)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -173,10 +137,11 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
- 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI!
## 💻 Usage
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
### 🔗 Community and integrations
@@ -194,7 +159,6 @@ Model galleries
Other:
- Helm chart https://github.com/go-skynet/helm-charts
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
- Terminal utility https://github.com/djcopley/ShellOracle
- Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
@@ -252,7 +216,7 @@ A huge thank you to our generous sponsors who support this project covering CI e
<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
</a>
<a href="https://www.premai.io/" target="blank">
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>

View File

@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002
embeddings: true
parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
usage: |
You can test this model with curl like this:

View File

@@ -1,17 +1,56 @@
name: stablediffusion
backend: stablediffusion-ggml
cfg_scale: 4.5
options:
- sampler:euler
backend: stablediffusion
parameters:
model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
step: 25
model: stablediffusion_assets
license: "BSD-3"
urls:
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
download_files:
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
- filename: "stablediffusion_assets/log_sigmas.bin"
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
- filename: "stablediffusion_assets/vocab.txt"
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
usage: |
curl http://localhost:8080/v1/images/generations \

View File

@@ -1,57 +1,101 @@
context_size: 8192
f16: true
function:
grammar:
no_mixed_free_string: true
schema_type: llama3.1 # or JSON is supported too (json)
response_regex:
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
mmap: true
name: gpt-4
mmap: true
parameters:
model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords:
- <|im_end|>
- <dummy32000>
- <|eot_id|>
- <|end_of_text|>
- "<|im_end|>"
- "<dummy32000>"
- "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template:
chat: |
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
The Function was executed and the response was:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ range .FunctionCall }}
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
{{ end }}
{{ end -}}
<|eot_id|>
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: |
{{.Input}}
function: |
<|start_header_id|>system<|end_header_id|>
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format as follows:
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
{{toJson .Functions}}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input}}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
download_files:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
function: |-
<|im_start|>system
You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View File

@@ -1,49 +1,31 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
model: bakllava.gguf
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
ASSISTANT:
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -129,7 +129,7 @@ detect_gpu
detect_gpu_size
PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
check_vars

View File

@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:

View File

@@ -1,53 +1,101 @@
context_size: 4096
f16: true
function:
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
mmap: true
name: gpt-4
mmap: true
parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- "<|im_end|>"
- "<dummy32000>"
- "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: |
{{.Input}}
function: |
function: |-
<|im_start|>system
You are an AI assistant that executes function calls, and these are the tools at your disposal:
You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<|im_end|>
</tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
<|im_start|>assistant

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View File

@@ -1,49 +1,35 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
ASSISTANT:
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002
backend: sentencetransformers
parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:

View File

@@ -1,53 +1,103 @@
context_size: 4096
f16: true
function:
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
mmap: true
name: gpt-4
mmap: false
context_size: 8192
f16: false
parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- "<|im_end|>"
- "<dummy32000>"
- "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: |
{{.Input}}
function: |
function: |-
<|im_start|>system
You are an AI assistant that executes function calls, and these are the tools at your disposal:
You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<|im_end|>
</tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View File

@@ -1,50 +1,35 @@
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
mmap: false
f16: false
name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
model: llava-v1.6-mistral-7b.Q5_K_M.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
ASSISTANT:
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -159,12 +159,6 @@ message Reply {
bytes message = 1;
int32 tokens = 2;
int32 prompt_tokens = 3;
double timing_prompt_processing = 4;
double timing_token_generation = 5;
}
message GrammarTrigger {
string word = 1;
}
message ModelOptions {
@@ -228,11 +222,6 @@ message ModelOptions {
int32 MaxModelLen = 54;
int32 TensorParallelSize = 55;
string LoadFormat = 58;
bool DisableLogStatus = 66;
string DType = 67;
int32 LimitImagePerPrompt = 68;
int32 LimitVideoPerPrompt = 69;
int32 LimitAudioPerPrompt = 70;
string MMProj = 41;
@@ -253,11 +242,6 @@ message ModelOptions {
repeated float LoraScales = 61;
repeated string Options = 62;
string CacheTypeKey = 63;
string CacheTypeValue = 64;
repeated GrammarTrigger GrammarTriggers = 65;
}
message Result {
@@ -361,4 +345,4 @@ message StatusResponse {
message Message {
string role = 1;
string content = 2;
}
}

View File

@@ -2,7 +2,7 @@
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)

View File

@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
@@ -36,18 +36,11 @@ else ifeq ($(OS),Darwin)
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
endif
llama.cpp:
@@ -84,4 +77,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif
cp llama.cpp/build/bin/grpc-server .
cp llama.cpp/build/bin/grpc-server .

View File

@@ -134,32 +134,6 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
return out;
}
// Adds an RPC server
// https://github.com/ggerganov/llama.cpp/compare/4dbc8b9cb71876e005724f4e8f73a3544646bcf5..3edfa7d3753c29e44b964c0ff424d2ea8d5fdee6
static void add_rpc_devices(std::string servers) {
auto rpc_servers = string_split<std::string>(servers, ',');
if (rpc_servers.empty()) {
throw std::invalid_argument("no RPC servers specified");
}
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
throw std::invalid_argument("failed to find RPC backend");
}
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
throw std::invalid_argument("failed to find RPC device add function");
}
for (const auto & server : rpc_servers) {
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
if (dev) {
ggml_backend_device_register(dev);
} else {
throw std::invalid_argument("failed to register RPC device");
}
}
}
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
{
@@ -454,7 +428,6 @@ struct llama_server_context
{
llama_model *model = nullptr;
llama_context *ctx = nullptr;
const llama_vocab * vocab = nullptr;
clip_ctx *clp_ctx = nullptr;
@@ -466,11 +439,6 @@ struct llama_server_context
bool clean_kv_cache = true;
bool all_slots_are_idle = false;
bool add_bos_token = true;
bool has_eos_token = true;
bool has_gpu = false;
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_triggers;
int32_t n_ctx; // total context for all clients / slots
@@ -509,15 +477,12 @@ struct llama_server_context
bool load_model(const common_params &params_)
{
params = params_;
if (!params.mmproj.path.empty()) {
if (!params.mmproj.empty()) {
multimodal = true;
LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
/* use_gpu */ has_gpu,
/*verbosity=*/ GGML_LOG_LEVEL_INFO,
});
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) {
LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
return false;
}
@@ -527,17 +492,17 @@ struct llama_server_context
}
common_init_result common_init = common_init_from_params(params);
model = common_init.model.release();
ctx = common_init.context.release();
model = common_init.model;
ctx = common_init.context;
if (model == nullptr)
{
LOG_ERR("unable to load model: %s", params.model.path.c_str());
LOG_ERR("unable to load model: %s", params.model.c_str());
return false;
}
if (multimodal) {
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
const int n_embd_llm = llama_model_n_embd(model);
const int n_embd_llm = llama_n_embd(model);
if (n_embd_clip != n_embd_llm) {
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
llama_free(ctx);
@@ -546,15 +511,23 @@ struct llama_server_context
}
}
vocab = llama_model_get_vocab(model);
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
add_bos_token = llama_add_bos_token(model);
return true;
}
void validate_model_chat_template(server_params & sparams) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
if (res < 0) {
LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
}
}
llama_client_slot* get_active_slot() {
for (llama_client_slot& slot : slots) {
// Check if the slot is currently processing
@@ -708,13 +681,12 @@ struct llama_server_context
slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
slot->sparams.grammar_triggers = grammar_triggers;
slot->sparams.grammar_lazy = grammar_lazy;
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
// Might be better to reject the request with a 400 ?
@@ -754,8 +726,8 @@ struct llama_server_context
slot->prompt = "";
}
if (json_value(data, "ignore_eos", false) && has_eos_token) {
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
if (json_value(data, "ignore_eos", false)) {
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
}
/*
slot->sparams.penalty_prompt_tokens.clear();
@@ -794,13 +766,13 @@ struct llama_server_context
}
}
*/
slot->sparams.logit_bias.clear();
const auto &logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array())
{
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_vocab = llama_n_vocab(model);
for (const auto &el : *logit_bias)
{
if (el.is_array() && el.size() == 2)
@@ -829,7 +801,7 @@ struct llama_server_context
}
else if (el[0].is_string())
{
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks)
{
slot->sparams.logit_bias.push_back({tok, bias});
@@ -1159,15 +1131,7 @@ struct llama_server_context
slot.has_next_token = false;
}
if (slot.n_past >= slot.n_ctx) {
slot.truncated = true;
slot.stopped_limit = true;
slot.has_next_token = false;
LOG_VERBOSE("stopped due to running out of context capacity", {});
}
if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
if (result.tok == llama_token_eos(model))
{
slot.stopped_eos = true;
slot.has_next_token = false;
@@ -1249,12 +1213,13 @@ struct llama_server_context
{"mirostat", slot.sparams.mirostat},
{"mirostat_tau", slot.sparams.mirostat_tau},
{"mirostat_eta", slot.sparams.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl},
{"stop", slot.params.antiprompt},
{"n_predict", slot.params.n_predict},
{"n_keep", params.n_keep},
{"ignore_eos", slot.sparams.ignore_eos},
{"stream", slot.params.stream},
// {"logit_bias", slot.sparams.logit_bias},
// {"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"min_keep", slot.sparams.min_keep},
{"grammar", slot.sparams.grammar},
@@ -1354,7 +1319,7 @@ struct llama_server_context
queue_results.send(res);
}
void send_embedding(llama_client_slot &slot, const llama_batch & batch)
void send_embedding(llama_client_slot &slot)
{
task_result res;
res.id = slot.task_id;
@@ -1362,7 +1327,7 @@ struct llama_server_context
res.error = false;
res.stop = true;
const int n_embd = llama_model_n_embd(model);
const int n_embd = llama_n_embd(model);
if (!params.embedding)
{
LOG_WARNING("embedding disabled", {
@@ -1376,38 +1341,10 @@ struct llama_server_context
else
{
const float *data = llama_get_embeddings(ctx);
std::vector<float> embd_res(n_embd, 0.0f);
std::vector<std::vector<float>> embedding;
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
}
if (embd == NULL) {
LOG("failed to get embeddings");
continue;
}
// normalize only when there is pooling
// TODO: configurable
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
common_embd_normalize(embd, embd_res.data(), n_embd, 2);
embedding.push_back(embd_res);
} else {
embedding.push_back({ embd, embd + n_embd });
}
}
// OAI compat
std::vector<float> embedding(data, data + n_embd);
res.result_json = json
{
{"embedding", embedding[0] },
{"embedding", embedding },
};
}
queue_results.send(res);
@@ -1489,7 +1426,7 @@ struct llama_server_context
n_eval = n_batch;
}
const int n_embd = llama_model_n_embd(model);
const int n_embd = llama_n_embd(model);
float * embd = img.image_embedding + i * n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
if (llama_decode(ctx, llava_batch.batch))
@@ -1667,17 +1604,17 @@ struct llama_server_context
{
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
{
// this check is redundant (for good)
// we should never get here, because generation should already stopped in process_token()
// START LOCALAI changes
// Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
// See: https://github.com/mudler/LocalAI/issues/1333
// Context is exhausted, release the slot
slot.release();
send_final_response(slot);
slot.has_next_token = false;
LOG_ERROR("context is exhausted, release the slot", {});
slot.cache_tokens.clear();
slot.n_past = 0;
slot.truncated = false;
slot.has_next_token = true;
LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
continue;
// END LOCALAI changes
@@ -1770,11 +1707,11 @@ struct llama_server_context
suffix_tokens.erase(suffix_tokens.begin());
}
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_vocab_fim_suf(vocab));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
prefix_tokens.push_back(llama_token_middle(model));
prompt_tokens = prefix_tokens;
}
else
@@ -2028,7 +1965,7 @@ struct llama_server_context
// prompt evaluated for embedding
if (slot.embedding)
{
send_embedding(slot, batch_view);
send_embedding(slot);
slot.release();
slot.i_batch = -1;
continue;
@@ -2122,11 +2059,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) {
exit(1);
}
inline void signal_handler(int signal) { shutdown_handler(signal); }
/////////////////////////////////
////////////////////////////////
@@ -2179,6 +2112,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
// slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
// slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
// slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
// slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
// slot->params.seed = json_value(data, "seed", default_params.seed);
// slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
@@ -2201,6 +2135,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
data["mirostat"] = predict->mirostat();
data["mirostat_tau"] = predict->mirostattau();
data["mirostat_eta"] = predict->mirostateta();
data["penalize_nl"] = predict->penalizenl();
data["n_keep"] = predict->nkeep();
data["seed"] = predict->seed();
data["grammar"] = predict->grammar();
@@ -2246,6 +2181,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// llama.params.sparams.mirostat = predict->mirostat();
// llama.params.sparams.mirostat_tau = predict->mirostattau();
// llama.params.sparams.mirostat_eta = predict->mirostateta();
// llama.params.sparams.penalize_nl = predict->penalizenl();
// llama.params.n_keep = predict->nkeep();
// llama.params.seed = predict->seed();
// llama.params.sparams.grammar = predict->grammar();
@@ -2292,54 +2228,19 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// }
// }
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16,
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
for (const auto & type : kv_cache_types) {
if (ggml_type_name(type) == s) {
return type;
}
}
throw std::runtime_error("Unsupported cache type: " + s);
}
static std::string get_all_kv_cache_types() {
std::ostringstream msg;
for (const auto & type : kv_cache_types) {
msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
}
return msg.str();
}
static void params_parse(const backend::ModelOptions* request,
common_params & params, llama_server_context &llama) {
common_params & params) {
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
params.model.path = request->modelfile();
params.model = request->modelfile();
if (!request->mmproj().empty()) {
// get the directory of modelfile
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
params.mmproj.path = model_dir + "/"+ request->mmproj();
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
params.mmproj = model_dir + "/"+ request->mmproj();
}
// params.model_alias ??
params.model_alias = request->modelfile();
if (!request->cachetypekey().empty()) {
params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
}
if (!request->cachetypevalue().empty()) {
params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue());
}
params.n_ctx = request->contextsize();
//params.memory_f16 = request->f16memory();
params.cpuparams.n_threads = request->threads();
@@ -2357,23 +2258,9 @@ static void params_parse(const backend::ModelOptions* request,
const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
if (llama_grpc_servers != NULL) {
add_rpc_devices(std::string(llama_grpc_servers));
params.rpc_servers = std::string(llama_grpc_servers);
}
// decode options. Options are in form optname:optvale, or if booleans only optname.
for (int i = 0; i < request->options_size(); i++) {
std::string opt = request->options(i);
char *optname = strtok(&opt[0], ":");
char *optval = strtok(NULL, ":");
if (optval == NULL) {
optval = "true";
}
if (!strcmp(optname, "gpu")) {
llama.has_gpu = true;
}
}
// TODO: Add yarn
if (!request->tensorsplit().empty()) {
@@ -2405,7 +2292,7 @@ static void params_parse(const backend::ModelOptions* request,
scale_factor = request->lorascale();
}
// get the directory of modelfile
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
}
params.use_mlock = request->mlock();
@@ -2437,21 +2324,6 @@ static void params_parse(const backend::ModelOptions* request,
if ( request->ropefreqscale() != 0.0f ) {
params.rope_freq_scale = request->ropefreqscale();
}
if (request->grammartriggers_size() > 0) {
LOG_INFO("configuring grammar triggers", {});
llama.grammar_lazy = true;
for (int i = 0; i < request->grammartriggers_size(); i++) {
common_grammar_trigger trigger;
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
trigger.value = request->grammartriggers(i).word();
// trigger.at_start = request->grammartriggers(i).at_start();
llama.grammar_triggers.push_back(trigger);
LOG_INFO("grammar trigger", {
{ "word", trigger.value },
});
}
}
}
@@ -2467,7 +2339,7 @@ public:
grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
// Implement LoadModel RPC
common_params params;
params_parse(request, params, llama);
params_parse(request, params);
llama_backend_init();
llama_numa_init(params.numa);
@@ -2512,13 +2384,6 @@ public:
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
reply.set_prompt_tokens(tokens_evaluated);
if (result.result_json.contains("timings")) {
double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
reply.set_timing_prompt_processing(timing_prompt_processing);
double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
reply.set_timing_token_generation(timing_token_generation);
}
// Log Request Correlation Id
LOG_VERBOSE("correlation:", {
{ "id", data["correlation_id"] }
@@ -2559,13 +2424,6 @@ public:
reply->set_prompt_tokens(tokens_evaluated);
reply->set_tokens(tokens_predicted);
reply->set_message(completion_text);
if (result.result_json.contains("timings")) {
double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
reply->set_timing_prompt_processing(timing_prompt_processing);
double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
reply->set_timing_token_generation(timing_token_generation);
}
}
else
{
@@ -2600,18 +2458,6 @@ public:
return grpc::Status::OK;
}
grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
json data = parse_options(false, request, llama);
std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
for (int i=0 ; i< tokens.size(); i++){
response->add_tokens(tokens[i]);
}
return grpc::Status::OK;
}
grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
llama_client_slot* active_slot = llama.get_active_slot();
@@ -2653,20 +2499,6 @@ void RunServer(const std::string& server_address) {
int main(int argc, char** argv) {
std::string server_address("localhost:50051");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
// Define long and short options
struct option long_options[] = {
{"addr", required_argument, nullptr, 'a'},

View File

@@ -1,13 +1,13 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3cd0d2fa..6c5e811a 100644
index 342042ff..224db9b5 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);

View File

@@ -21,7 +21,6 @@ fi
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future.
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h

View File

@@ -2,134 +2,20 @@ INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
CMAKE_ARGS?=
BUILD_TYPE?=
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
# keep standard at C11 and C++11
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
GOCMD?=go
CGO_LDFLAGS?=
# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
CGO_LDFLAGS_SYCL=
GO_TAGS?=
LD_FLAGS?=
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DGGML_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIP=ON
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
TARGET+=--target ggml-metal
endif
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DSD_SYCL=ON \
-DGGML_SYCL_F16=ON
CC=icx
CXX=icpx
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DSD_SYCL=ON
CC=icx
CXX=icpx
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
endif
# warnings
# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
# Find all .a archives in ARCHIVE_DIR
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
GGML_ARCHIVE_DIR := build/ggml/src/
ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
# Name of the single merged library
COMBINED_LIB := libggmlall.a
# Rule to merge all the .a files into one
$(COMBINED_LIB): $(ALL_ARCHIVES)
@echo "Merging all .a into $(COMBINED_LIB)"
rm -f $@
mkdir -p merge-tmp
for a in $(ALL_ARCHIVES); do \
( cd merge-tmp && ar x ../$$a ); \
done
( cd merge-tmp && ar rcs ../$@ *.o )
# Ensure we have a proper index
ranlib $@
# Clean up
rm -rf merge-tmp
build/libstable-diffusion.a:
@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
cmake --build . --config Release"
else
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
cmake --build . --config Release
endif
$(MAKE) $(COMBINED_LIB)
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
gosd.o:
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
else
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
endif
libsd.a: gosd.o
cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
cp $(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a ./libsd.a
$(AR) rcs libsd.a gosd.o
stablediffusion-ggml:
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
ifneq ($(UPX),)
$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
endif
clean:
rm -rf gosd.o libsd.a build $(COMBINED_LIB)
rm -f gosd.o libsd.a

View File

@@ -35,8 +35,6 @@ const char* sample_method_str[] = {
"ipndm",
"ipndm_v",
"lcm",
"ddim_trailing",
"tcd",
};
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@@ -175,7 +173,6 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
-1, //clip_skip
cfg_scale, // sfg_scale
3.5f,
0, // eta
width,
height,
sample_method,

View File

@@ -1,7 +1,7 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src/ggml-cpu -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src -lsd -lstdc++ -lm -lggml -lggml-base -lggml-cpu -lgomp
// #include <gosd.h>
// #include <stdlib.h>
import "C"

View File

@@ -0,0 +1,21 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Image{}); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,33 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/stablediffusion"
)
type Image struct {
base.SingleThread
stablediffusion *stablediffusion.StableDiffusion
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.stablediffusion.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Mode),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}

View File

@@ -0,0 +1,21 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Image{}); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,32 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/tinydream"
)
type Image struct {
base.SingleThread
tinydream *tinydream.TinyDream
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.tinydream, err = tinydream.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.tinydream.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}

View File

@@ -0,0 +1,204 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {
base.SingleThread
llama *llama.LLama
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
llamaOpts := []llama.ModelOption{
llama.WithRopeFreqBase(ropeFreqBase),
llama.WithRopeFreqScale(ropeFreqScale),
}
if opts.NGQA != 0 {
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
}
if opts.RMSNormEps != 0 {
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
}
if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}
if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}
if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}
model, err := llama.New(opts.ModelFile, llamaOpts...)
llm.llama = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
predictOptions := []llama.PredictOption{
llama.SetTemperature(opts.Temperature),
llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
llama.SetRopeFreqBase(ropeFreqBase),
llama.SetRopeFreqScale(ropeFreqScale),
llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}
if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}
if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))
go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}

View File

@@ -0,0 +1,19 @@
package main
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File

@@ -311,16 +311,12 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
}
func isNormalized(k []float32) bool {
var sum float64
var sum float32
for _, v := range k {
v64 := float64(v)
sum += v64*v64
sum += v
}
s := math.Sqrt(sum)
return s >= 0.99 && s <= 1.01
return sum == 1.0
}
// TODO: This we could replace with handwritten SIMD code
@@ -332,7 +328,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
dot += k1[i] * k2[i]
}
assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
// 2.0 * (1.0 - dot) would be the Euclidean distance
return dot
@@ -422,7 +418,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
sim := float32(dot / (mag1 * math.Sqrt(mag2)))
assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
return sim
}

View File

@@ -1,6 +1,5 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
intel-extension-for-pytorch
torch
optimum[openvino]
setuptools
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.71.0
grpcio==1.68.1
protobuf
certifi
transformers

View File

@@ -1,9 +1,8 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
intel-extension-for-pytorch
torch
torchaudio
optimum[openvino]
setuptools
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
transformers
accelerate

View File

@@ -1,4 +1,4 @@
bark==0.1.5
grpcio==1.71.0
grpcio==1.68.1
protobuf
certifi

View File

@@ -17,9 +17,6 @@
# LIMIT_TARGETS="cublas12"
# source $(dirname $0)/../common/libbackend.sh
#
PYTHON_VERSION="3.10"
function init() {
# Name of the backend (directory name)
BACKEND_NAME=${PWD##*/}
@@ -91,7 +88,7 @@ function getBuildProfile() {
# always result in an activated virtual environment
function ensureVenv() {
if [ ! -d "${EDIR}/venv" ]; then
uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
uv venv ${EDIR}/venv
echo "virtualenv created"
fi

View File

@@ -1,5 +1,4 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
intel-extension-for-pytorch
torch
optimum[openvino]

View File

@@ -1,3 +1,3 @@
grpcio==1.71.0
grpcio==1.68.1
protobuf
grpcio-tools

View File

@@ -1,4 +1,4 @@
transformers==4.48.3
transformers
accelerate
torch==2.4.1
coqui-tts

View File

@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
torchaudio==2.4.1+cu118
transformers==4.48.3
transformers
accelerate
coqui-tts

View File

@@ -1,5 +1,5 @@
torch==2.4.1
torchaudio==2.4.1
transformers==4.48.3
transformers
accelerate
coqui-tts

View File

@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0
torchaudio==2.4.1+rocm6.0
transformers==4.48.3
transformers
accelerate
coqui-tts

View File

@@ -1,10 +1,9 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
intel-extension-for-pytorch
torch
torchaudio
optimum[openvino]
setuptools
transformers==4.48.3
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
transformers
accelerate
coqui-tts

View File

@@ -1,4 +1,4 @@
grpcio==1.71.0
grpcio==1.68.1
protobuf
certifi
packaging==24.1

View File

@@ -17,9 +17,9 @@ import backend_pb2_grpc
import grpc
from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
from diffusers.pipelines.stable_diffusion import safety_checker
from diffusers.utils import load_image, export_to_video
from compel import Compel, ReturnedEmbeddingsType
@@ -159,18 +159,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
torchType = torch.float16
variant = "fp16"
options = request.Options
# empty dict
self.options = {}
# The options are a list of strings in this form optname:optvalue
# We are storing all the options in a dict so we can use it later when
# generating the images
for opt in options:
key, value = opt.split(":")
self.options[key] = value
local = False
modelFile = request.Model
@@ -287,19 +275,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.LowVRAM:
self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "Lumina2Text2ImgPipeline":
self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
request.Model,
torch_dtype=torch.bfloat16)
if request.LowVRAM:
self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "SanaPipeline":
self.pipe = SanaPipeline.from_pretrained(
request.Model,
variant="bf16",
torch_dtype=torch.bfloat16)
self.pipe.vae.to(torch.bfloat16)
self.pipe.text_encoder.to(torch.bfloat16)
if CLIPSKIP and request.CLIPSkip != 0:
self.clip_skip = request.CLIPSkip
@@ -459,9 +434,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
kwargs = {key: options.get(key) for key in keys if key in options}
# populate kwargs from self.options.
kwargs.update(self.options)
# Set seed
if request.seed > 0:
kwargs["generator"] = torch.Generator(device=self.device).manual_seed(

View File

@@ -1,10 +1,9 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
torchvision==0.18.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
intel-extension-for-pytorch
torch
torchvision
optimum[openvino]
setuptools
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
diffusers
opencv-python
transformers

View File

@@ -1,5 +1,5 @@
setuptools
grpcio==1.71.0
grpcio==1.68.1
pillow
protobuf
certifi

View File

@@ -1,4 +1,4 @@
grpcio==1.71.0
grpcio==1.68.1
protobuf
certifi
wheel

View File

@@ -1,20 +0,0 @@
.DEFAULT_GOAL := install
.PHONY: install
install:
bash install.sh
$(MAKE) protogen
.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py
.PHONY: protogen-clean
protogen-clean:
$(RM) backend_pb2_grpc.py backend_pb2.py
backend_pb2_grpc.py backend_pb2.py:
bash protogen.sh
.PHONY: clean
clean: protogen-clean
rm -rf venv __pycache__

View File

@@ -1,8 +0,0 @@
faster-whisper
opencv-python
accelerate
compel
peft
sentencepiece
torch==2.4.1
optimum-quanto

View File

@@ -1,8 +0,0 @@
torch==2.4.1
faster-whisper
opencv-python
accelerate
compel
peft
sentencepiece
optimum-quanto

View File

@@ -1,6 +0,0 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
faster-whisper

View File

@@ -1,3 +0,0 @@
grpcio==1.71.0
protobuf
grpcio-tools

View File

@@ -1,524 +0,0 @@
# https://huggingface.co/hexgrad/Kokoro-82M/blob/main/istftnet.py
# https://github.com/yl4579/StyleTTS2/blob/main/Modules/istftnet.py
from scipy.signal import get_window
from torch.nn import Conv1d, ConvTranspose1d
from torch.nn.utils import weight_norm, remove_weight_norm
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# https://github.com/yl4579/StyleTTS2/blob/main/Modules/utils.py
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
return int((kernel_size*dilation - dilation)/2)
LRELU_SLOPE = 0.1
class AdaIN1d(nn.Module):
def __init__(self, style_dim, num_features):
super().__init__()
self.norm = nn.InstanceNorm1d(num_features, affine=False)
self.fc = nn.Linear(style_dim, num_features*2)
def forward(self, x, s):
h = self.fc(s)
h = h.view(h.size(0), h.size(1), 1)
gamma, beta = torch.chunk(h, chunks=2, dim=1)
return (1 + gamma) * self.norm(x) + beta
class AdaINResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), style_dim=64):
super(AdaINResBlock1, self).__init__()
self.convs1 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
padding=get_padding(kernel_size, dilation[2])))
])
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1)))
])
self.convs2.apply(init_weights)
self.adain1 = nn.ModuleList([
AdaIN1d(style_dim, channels),
AdaIN1d(style_dim, channels),
AdaIN1d(style_dim, channels),
])
self.adain2 = nn.ModuleList([
AdaIN1d(style_dim, channels),
AdaIN1d(style_dim, channels),
AdaIN1d(style_dim, channels),
])
self.alpha1 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs1))])
self.alpha2 = nn.ParameterList([nn.Parameter(torch.ones(1, channels, 1)) for i in range(len(self.convs2))])
def forward(self, x, s):
for c1, c2, n1, n2, a1, a2 in zip(self.convs1, self.convs2, self.adain1, self.adain2, self.alpha1, self.alpha2):
xt = n1(x, s)
xt = xt + (1 / a1) * (torch.sin(a1 * xt) ** 2) # Snake1D
xt = c1(xt)
xt = n2(xt, s)
xt = xt + (1 / a2) * (torch.sin(a2 * xt) ** 2) # Snake1D
xt = c2(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
class TorchSTFT(torch.nn.Module):
def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
super().__init__()
self.filter_length = filter_length
self.hop_length = hop_length
self.win_length = win_length
self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))
def transform(self, input_data):
forward_transform = torch.stft(
input_data,
self.filter_length, self.hop_length, self.win_length, window=self.window.to(input_data.device),
return_complex=True)
return torch.abs(forward_transform), torch.angle(forward_transform)
def inverse(self, magnitude, phase):
inverse_transform = torch.istft(
magnitude * torch.exp(phase * 1j),
self.filter_length, self.hop_length, self.win_length, window=self.window.to(magnitude.device))
return inverse_transform.unsqueeze(-2) # unsqueeze to stay consistent with conv_transpose1d implementation
def forward(self, input_data):
self.magnitude, self.phase = self.transform(input_data)
reconstruction = self.inverse(self.magnitude, self.phase)
return reconstruction
class SineGen(torch.nn.Module):
""" Definition of sine generator
SineGen(samp_rate, harmonic_num = 0,
sine_amp = 0.1, noise_std = 0.003,
voiced_threshold = 0,
flag_for_pulse=False)
samp_rate: sampling rate in Hz
harmonic_num: number of harmonic overtones (default 0)
sine_amp: amplitude of sine-wavefrom (default 0.1)
noise_std: std of Gaussian noise (default 0.003)
voiced_thoreshold: F0 threshold for U/V classification (default 0)
flag_for_pulse: this SinGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced
segment is always sin(np.pi) or cos(0)
"""
def __init__(self, samp_rate, upsample_scale, harmonic_num=0,
sine_amp=0.1, noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False):
super(SineGen, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
self.harmonic_num = harmonic_num
self.dim = self.harmonic_num + 1
self.sampling_rate = samp_rate
self.voiced_threshold = voiced_threshold
self.flag_for_pulse = flag_for_pulse
self.upsample_scale = upsample_scale
def _f02uv(self, f0):
# generate uv signal
uv = (f0 > self.voiced_threshold).type(torch.float32)
return uv
def _f02sine(self, f0_values):
""" f0_values: (batchsize, length, dim)
where dim indicates fundamental tone and overtones
"""
# convert to F0 in rad. The interger part n can be ignored
# because 2 * np.pi * n doesn't affect phase
rad_values = (f0_values / self.sampling_rate) % 1
# initial phase noise (no noise for fundamental component)
rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
device=f0_values.device)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
# instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
if not self.flag_for_pulse:
# # for normal case
# # To prevent torch.cumsum numerical overflow,
# # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
# # Buffer tmp_over_one_idx indicates the time step to add -1.
# # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
# tmp_over_one = torch.cumsum(rad_values, 1) % 1
# tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
# cumsum_shift = torch.zeros_like(rad_values)
# cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
# phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
rad_values = torch.nn.functional.interpolate(rad_values.transpose(1, 2),
scale_factor=1/self.upsample_scale,
mode="linear").transpose(1, 2)
# tmp_over_one = torch.cumsum(rad_values, 1) % 1
# tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
# cumsum_shift = torch.zeros_like(rad_values)
# cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
phase = torch.nn.functional.interpolate(phase.transpose(1, 2) * self.upsample_scale,
scale_factor=self.upsample_scale, mode="linear").transpose(1, 2)
sines = torch.sin(phase)
else:
# If necessary, make sure that the first time step of every
# voiced segments is sin(pi) or cos(0)
# This is used for pulse-train generation
# identify the last time step in unvoiced segments
uv = self._f02uv(f0_values)
uv_1 = torch.roll(uv, shifts=-1, dims=1)
uv_1[:, -1, :] = 1
u_loc = (uv < 1) * (uv_1 > 0)
# get the instantanouse phase
tmp_cumsum = torch.cumsum(rad_values, dim=1)
# different batch needs to be processed differently
for idx in range(f0_values.shape[0]):
temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
# stores the accumulation of i.phase within
# each voiced segments
tmp_cumsum[idx, :, :] = 0
tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
# rad_values - tmp_cumsum: remove the accumulation of i.phase
# within the previous voiced segment.
i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
# get the sines
sines = torch.cos(i_phase * 2 * np.pi)
return sines
def forward(self, f0):
""" sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
output uv: tensor(batchsize=1, length, 1)
"""
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
device=f0.device)
# fundamental component
fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
# generate sine waveforms
sine_waves = self._f02sine(fn) * self.sine_amp
# generate uv signal
# uv = torch.ones(f0.shape)
# uv = uv * (f0 > self.voiced_threshold)
uv = self._f02uv(f0)
# noise: for unvoiced should be similar to sine_amp
# std = self.sine_amp/3 -> max value ~ self.sine_amp
# . for voiced regions is self.noise_std
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
# first: set the unvoiced part to 0 by uv
# then: additive noise
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
""" SourceModule for hn-nsf
SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0)
sampling_rate: sampling_rate in Hz
harmonic_num: number of harmonic above F0 (default: 0)
sine_amp: amplitude of sine source signal (default: 0.1)
add_noise_std: std of additive Gaussian noise (default: 0.003)
note that amplitude of noise in unvoiced is decided
by sine_amp
voiced_threshold: threhold to set U/V given F0 (default: 0)
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
F0_sampled (batchsize, length, 1)
Sine_source (batchsize, length, 1)
noise_source (batchsize, length 1)
uv (batchsize, length, 1)
"""
def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0):
super(SourceModuleHnNSF, self).__init__()
self.sine_amp = sine_amp
self.noise_std = add_noise_std
# to produce sine waveforms
self.l_sin_gen = SineGen(sampling_rate, upsample_scale, harmonic_num,
sine_amp, add_noise_std, voiced_threshod)
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
def forward(self, x):
"""
Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
F0_sampled (batchsize, length, 1)
Sine_source (batchsize, length, 1)
noise_source (batchsize, length 1)
"""
# source for harmonic branch
with torch.no_grad():
sine_wavs, uv, _ = self.l_sin_gen(x)
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
# source for noise branch, in the same shape as uv
noise = torch.randn_like(uv) * self.sine_amp / 3
return sine_merge, noise, uv
def padDiff(x):
return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
class Generator(torch.nn.Module):
def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
resblock = AdaINResBlock1
self.m_source = SourceModuleHnNSF(
sampling_rate=24000,
upsample_scale=np.prod(upsample_rates) * gen_istft_hop_size,
harmonic_num=8, voiced_threshod=10)
self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * gen_istft_hop_size)
self.noise_convs = nn.ModuleList()
self.noise_res = nn.ModuleList()
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
self.ups.append(weight_norm(
ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
k, u, padding=(k-u)//2)))
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel//(2**(i+1))
for j, (k, d) in enumerate(zip(resblock_kernel_sizes,resblock_dilation_sizes)):
self.resblocks.append(resblock(ch, k, d, style_dim))
c_cur = upsample_initial_channel // (2 ** (i + 1))
if i + 1 < len(upsample_rates): #
stride_f0 = np.prod(upsample_rates[i + 1:])
self.noise_convs.append(Conv1d(
gen_istft_n_fft + 2, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+1) // 2))
self.noise_res.append(resblock(c_cur, 7, [1,3,5], style_dim))
else:
self.noise_convs.append(Conv1d(gen_istft_n_fft + 2, c_cur, kernel_size=1))
self.noise_res.append(resblock(c_cur, 11, [1,3,5], style_dim))
self.post_n_fft = gen_istft_n_fft
self.conv_post = weight_norm(Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3))
self.ups.apply(init_weights)
self.conv_post.apply(init_weights)
self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
self.stft = TorchSTFT(filter_length=gen_istft_n_fft, hop_length=gen_istft_hop_size, win_length=gen_istft_n_fft)
def forward(self, x, s, f0):
with torch.no_grad():
f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
har_source, noi_source, uv = self.m_source(f0)
har_source = har_source.transpose(1, 2).squeeze(1)
har_spec, har_phase = self.stft.transform(har_source)
har = torch.cat([har_spec, har_phase], dim=1)
for i in range(self.num_upsamples):
x = F.leaky_relu(x, LRELU_SLOPE)
x_source = self.noise_convs[i](har)
x_source = self.noise_res[i](x_source, s)
x = self.ups[i](x)
if i == self.num_upsamples - 1:
x = self.reflection_pad(x)
x = x + x_source
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i*self.num_kernels+j](x, s)
else:
xs += self.resblocks[i*self.num_kernels+j](x, s)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
spec = torch.exp(x[:,:self.post_n_fft // 2 + 1, :])
phase = torch.sin(x[:, self.post_n_fft // 2 + 1:, :])
return self.stft.inverse(spec, phase)
def fw_phase(self, x, s):
for i in range(self.num_upsamples):
x = F.leaky_relu(x, LRELU_SLOPE)
x = self.ups[i](x)
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i*self.num_kernels+j](x, s)
else:
xs += self.resblocks[i*self.num_kernels+j](x, s)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.reflection_pad(x)
x = self.conv_post(x)
spec = torch.exp(x[:,:self.post_n_fft // 2 + 1, :])
phase = torch.sin(x[:, self.post_n_fft // 2 + 1:, :])
return spec, phase
def remove_weight_norm(self):
print('Removing weight norm...')
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
remove_weight_norm(self.conv_pre)
remove_weight_norm(self.conv_post)
class AdainResBlk1d(nn.Module):
def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
upsample='none', dropout_p=0.0):
super().__init__()
self.actv = actv
self.upsample_type = upsample
self.upsample = UpSample1d(upsample)
self.learned_sc = dim_in != dim_out
self._build_weights(dim_in, dim_out, style_dim)
self.dropout = nn.Dropout(dropout_p)
if upsample == 'none':
self.pool = nn.Identity()
else:
self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
def _build_weights(self, dim_in, dim_out, style_dim):
self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
self.norm1 = AdaIN1d(style_dim, dim_in)
self.norm2 = AdaIN1d(style_dim, dim_out)
if self.learned_sc:
self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
def _shortcut(self, x):
x = self.upsample(x)
if self.learned_sc:
x = self.conv1x1(x)
return x
def _residual(self, x, s):
x = self.norm1(x, s)
x = self.actv(x)
x = self.pool(x)
x = self.conv1(self.dropout(x))
x = self.norm2(x, s)
x = self.actv(x)
x = self.conv2(self.dropout(x))
return x
def forward(self, x, s):
out = self._residual(x, s)
out = (out + self._shortcut(x)) / np.sqrt(2)
return out
class UpSample1d(nn.Module):
def __init__(self, layer_type):
super().__init__()
self.layer_type = layer_type
def forward(self, x):
if self.layer_type == 'none':
return x
else:
return F.interpolate(x, scale_factor=2, mode='nearest')
class Decoder(nn.Module):
def __init__(self, dim_in=512, F0_channel=512, style_dim=64, dim_out=80,
resblock_kernel_sizes = [3,7,11],
upsample_rates = [10, 6],
upsample_initial_channel=512,
resblock_dilation_sizes=[[1,3,5], [1,3,5], [1,3,5]],
upsample_kernel_sizes=[20, 12],
gen_istft_n_fft=20, gen_istft_hop_size=5):
super().__init__()
self.decode = nn.ModuleList()
self.encode = AdainResBlk1d(dim_in + 2, 1024, style_dim)
self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
self.decode.append(AdainResBlk1d(1024 + 2 + 64, 1024, style_dim))
self.decode.append(AdainResBlk1d(1024 + 2 + 64, 512, style_dim, upsample=True))
self.F0_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
self.N_conv = weight_norm(nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1))
self.asr_res = nn.Sequential(
weight_norm(nn.Conv1d(512, 64, kernel_size=1)),
)
self.generator = Generator(style_dim, resblock_kernel_sizes, upsample_rates,
upsample_initial_channel, resblock_dilation_sizes,
upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size)
def forward(self, asr, F0_curve, N, s):
F0 = self.F0_conv(F0_curve.unsqueeze(1))
N = self.N_conv(N.unsqueeze(1))
x = torch.cat([asr, F0, N], axis=1)
x = self.encode(x, s)
asr_res = self.asr_res(asr)
res = True
for block in self.decode:
if res:
x = torch.cat([x, asr_res, F0, N], axis=1)
x = block(x, s)
if block.upsample_type != "none":
res = False
x = self.generator(x, s, F0_curve)
return x

View File

@@ -1,166 +0,0 @@
# https://huggingface.co/hexgrad/Kokoro-82M/blob/main/kokoro.py
import phonemizer
import re
import torch
import numpy as np
def split_num(num):
num = num.group()
if '.' in num:
return num
elif ':' in num:
h, m = [int(n) for n in num.split(':')]
if m == 0:
return f"{h} o'clock"
elif m < 10:
return f'{h} oh {m}'
return f'{h} {m}'
year = int(num[:4])
if year < 1100 or year % 1000 < 10:
return num
left, right = num[:2], int(num[2:4])
s = 's' if num.endswith('s') else ''
if 100 <= year % 1000 <= 999:
if right == 0:
return f'{left} hundred{s}'
elif right < 10:
return f'{left} oh {right}{s}'
return f'{left} {right}{s}'
def flip_money(m):
m = m.group()
bill = 'dollar' if m[0] == '$' else 'pound'
if m[-1].isalpha():
return f'{m[1:]} {bill}s'
elif '.' not in m:
s = '' if m[1:] == '1' else 's'
return f'{m[1:]} {bill}{s}'
b, c = m[1:].split('.')
s = '' if b == '1' else 's'
c = int(c.ljust(2, '0'))
coins = f"cent{'' if c == 1 else 's'}" if m[0] == '$' else ('penny' if c == 1 else 'pence')
return f'{b} {bill}{s} and {c} {coins}'
def point_num(num):
a, b = num.group().split('.')
return ' point '.join([a, ' '.join(b)])
def normalize_text(text):
text = text.replace(chr(8216), "'").replace(chr(8217), "'")
text = text.replace('«', chr(8220)).replace('»', chr(8221))
text = text.replace(chr(8220), '"').replace(chr(8221), '"')
text = text.replace('(', '«').replace(')', '»')
for a, b in zip('、。!,:;?', ',.!,:;?'):
text = text.replace(a, b+' ')
text = re.sub(r'[^\S \n]', ' ', text)
text = re.sub(r' +', ' ', text)
text = re.sub(r'(?<=\n) +(?=\n)', '', text)
text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
text = re.sub(r'(?<=\d),(?=\d)', '', text)
text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
text = re.sub(r'\d*\.\d+', point_num, text)
text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)
text = re.sub(r'(?<=\d)S', ' S', text)
text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
text = re.sub(r"(?<=X')S\b", 's', text)
text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
return text.strip()
def get_vocab():
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'"
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
dicts = {}
for i in range(len((symbols))):
dicts[symbols[i]] = i
return dicts
VOCAB = get_vocab()
def tokenize(ps):
return [i for i in map(VOCAB.get, ps) if i is not None]
phonemizers = dict(
a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
)
def phonemize(text, lang, norm=True):
if norm:
text = normalize_text(text)
ps = phonemizers[lang].phonemize([text])
ps = ps[0] if ps else ''
# https://en.wiktionary.org/wiki/kokoro#English
ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
ps = ps.replace('ʲ', 'j').replace('r', 'ɹ').replace('x', 'k').replace('ɬ', 'l')
ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
if lang == 'a':
ps = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', ps)
ps = ''.join(filter(lambda p: p in VOCAB, ps))
return ps.strip()
def length_to_mask(lengths):
mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
mask = torch.gt(mask+1, lengths.unsqueeze(1))
return mask
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
device = ref_s.device
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
text_mask = length_to_mask(input_lengths).to(device)
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
s = ref_s[:, 128:]
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
x, _ = model.predictor.lstm(d)
duration = model.predictor.duration_proj(x)
duration = torch.sigmoid(duration).sum(axis=-1) / speed
pred_dur = torch.round(duration).clamp(min=1).long()
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
c_frame = 0
for i in range(pred_aln_trg.size(0)):
pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
c_frame += pred_dur[0,i].item()
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
t_en = model.text_encoder(tokens, input_lengths, text_mask)
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
def generate(model, text, voicepack, lang='a', speed=1, ps=None):
ps = ps or phonemize(text, lang)
tokens = tokenize(ps)
if not tokens:
return None
elif len(tokens) > 510:
tokens = tokens[:510]
print('Truncated to 510 tokens')
ref_s = voicepack[len(tokens)]
out = forward(model, tokens, ref_s, speed)
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
return out, ps
def generate_full(model, text, voicepack, lang='a', speed=1, ps=None):
ps = ps or phonemize(text, lang)
tokens = tokenize(ps)
if not tokens:
return None
outs = []
loop_count = len(tokens)//510 + (1 if len(tokens) % 510 != 0 else 0)
for i in range(loop_count):
ref_s = voicepack[len(tokens[i*510:(i+1)*510])]
out = forward(model, tokens[i*510:(i+1)*510], ref_s, speed)
outs.append(out)
outs = np.concatenate(outs)
ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
return outs, ps

View File

@@ -1,373 +0,0 @@
# https://github.com/yl4579/StyleTTS2/blob/main/models.py
# https://huggingface.co/hexgrad/Kokoro-82M/blob/main/models.py
from istftnet import AdaIN1d, Decoder
from munch import Munch
from pathlib import Path
from plbert import load_plbert
from torch.nn.utils import weight_norm, spectral_norm
import json
import numpy as np
import os
import os.path as osp
import torch
import torch.nn as nn
import torch.nn.functional as F
class LinearNorm(torch.nn.Module):
def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
super(LinearNorm, self).__init__()
self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(w_init_gain))
def forward(self, x):
return self.linear_layer(x)
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
super().__init__()
self.channels = channels
self.eps = eps
self.gamma = nn.Parameter(torch.ones(channels))
self.beta = nn.Parameter(torch.zeros(channels))
def forward(self, x):
x = x.transpose(1, -1)
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
return x.transpose(1, -1)
class TextEncoder(nn.Module):
def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
super().__init__()
self.embedding = nn.Embedding(n_symbols, channels)
padding = (kernel_size - 1) // 2
self.cnn = nn.ModuleList()
for _ in range(depth):
self.cnn.append(nn.Sequential(
weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
LayerNorm(channels),
actv,
nn.Dropout(0.2),
))
# self.cnn = nn.Sequential(*self.cnn)
self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
def forward(self, x, input_lengths, m):
x = self.embedding(x) # [B, T, emb]
x = x.transpose(1, 2) # [B, emb, T]
m = m.to(input_lengths.device).unsqueeze(1)
x.masked_fill_(m, 0.0)
for c in self.cnn:
x = c(x)
x.masked_fill_(m, 0.0)
x = x.transpose(1, 2) # [B, T, chn]
input_lengths = input_lengths.cpu().numpy()
x = nn.utils.rnn.pack_padded_sequence(
x, input_lengths, batch_first=True, enforce_sorted=False)
self.lstm.flatten_parameters()
x, _ = self.lstm(x)
x, _ = nn.utils.rnn.pad_packed_sequence(
x, batch_first=True)
x = x.transpose(-1, -2)
x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
x_pad[:, :, :x.shape[-1]] = x
x = x_pad.to(x.device)
x.masked_fill_(m, 0.0)
return x
def inference(self, x):
x = self.embedding(x)
x = x.transpose(1, 2)
x = self.cnn(x)
x = x.transpose(1, 2)
self.lstm.flatten_parameters()
x, _ = self.lstm(x)
return x
def length_to_mask(self, lengths):
mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
mask = torch.gt(mask+1, lengths.unsqueeze(1))
return mask
class UpSample1d(nn.Module):
def __init__(self, layer_type):
super().__init__()
self.layer_type = layer_type
def forward(self, x):
if self.layer_type == 'none':
return x
else:
return F.interpolate(x, scale_factor=2, mode='nearest')
class AdainResBlk1d(nn.Module):
def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
upsample='none', dropout_p=0.0):
super().__init__()
self.actv = actv
self.upsample_type = upsample
self.upsample = UpSample1d(upsample)
self.learned_sc = dim_in != dim_out
self._build_weights(dim_in, dim_out, style_dim)
self.dropout = nn.Dropout(dropout_p)
if upsample == 'none':
self.pool = nn.Identity()
else:
self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
def _build_weights(self, dim_in, dim_out, style_dim):
self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
self.norm1 = AdaIN1d(style_dim, dim_in)
self.norm2 = AdaIN1d(style_dim, dim_out)
if self.learned_sc:
self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
def _shortcut(self, x):
x = self.upsample(x)
if self.learned_sc:
x = self.conv1x1(x)
return x
def _residual(self, x, s):
x = self.norm1(x, s)
x = self.actv(x)
x = self.pool(x)
x = self.conv1(self.dropout(x))
x = self.norm2(x, s)
x = self.actv(x)
x = self.conv2(self.dropout(x))
return x
def forward(self, x, s):
out = self._residual(x, s)
out = (out + self._shortcut(x)) / np.sqrt(2)
return out
class AdaLayerNorm(nn.Module):
def __init__(self, style_dim, channels, eps=1e-5):
super().__init__()
self.channels = channels
self.eps = eps
self.fc = nn.Linear(style_dim, channels*2)
def forward(self, x, s):
x = x.transpose(-1, -2)
x = x.transpose(1, -1)
h = self.fc(s)
h = h.view(h.size(0), h.size(1), 1)
gamma, beta = torch.chunk(h, chunks=2, dim=1)
gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
x = F.layer_norm(x, (self.channels,), eps=self.eps)
x = (1 + gamma) * x + beta
return x.transpose(1, -1).transpose(-1, -2)
class ProsodyPredictor(nn.Module):
def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
super().__init__()
self.text_encoder = DurationEncoder(sty_dim=style_dim,
d_model=d_hid,
nlayers=nlayers,
dropout=dropout)
self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
self.duration_proj = LinearNorm(d_hid, max_dur)
self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
self.F0 = nn.ModuleList()
self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
self.N = nn.ModuleList()
self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
def forward(self, texts, style, text_lengths, alignment, m):
d = self.text_encoder(texts, style, text_lengths, m)
batch_size = d.shape[0]
text_size = d.shape[1]
# predict duration
input_lengths = text_lengths.cpu().numpy()
x = nn.utils.rnn.pack_padded_sequence(
d, input_lengths, batch_first=True, enforce_sorted=False)
m = m.to(text_lengths.device).unsqueeze(1)
self.lstm.flatten_parameters()
x, _ = self.lstm(x)
x, _ = nn.utils.rnn.pad_packed_sequence(
x, batch_first=True)
x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
x_pad[:, :x.shape[1], :] = x
x = x_pad.to(x.device)
duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
en = (d.transpose(-1, -2) @ alignment)
return duration.squeeze(-1), en
def F0Ntrain(self, x, s):
x, _ = self.shared(x.transpose(-1, -2))
F0 = x.transpose(-1, -2)
for block in self.F0:
F0 = block(F0, s)
F0 = self.F0_proj(F0)
N = x.transpose(-1, -2)
for block in self.N:
N = block(N, s)
N = self.N_proj(N)
return F0.squeeze(1), N.squeeze(1)
def length_to_mask(self, lengths):
mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
mask = torch.gt(mask+1, lengths.unsqueeze(1))
return mask
class DurationEncoder(nn.Module):
def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
super().__init__()
self.lstms = nn.ModuleList()
for _ in range(nlayers):
self.lstms.append(nn.LSTM(d_model + sty_dim,
d_model // 2,
num_layers=1,
batch_first=True,
bidirectional=True,
dropout=dropout))
self.lstms.append(AdaLayerNorm(sty_dim, d_model))
self.dropout = dropout
self.d_model = d_model
self.sty_dim = sty_dim
def forward(self, x, style, text_lengths, m):
masks = m.to(text_lengths.device)
x = x.permute(2, 0, 1)
s = style.expand(x.shape[0], x.shape[1], -1)
x = torch.cat([x, s], axis=-1)
x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
x = x.transpose(0, 1)
input_lengths = text_lengths.cpu().numpy()
x = x.transpose(-1, -2)
for block in self.lstms:
if isinstance(block, AdaLayerNorm):
x = block(x.transpose(-1, -2), style).transpose(-1, -2)
x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
else:
x = x.transpose(-1, -2)
x = nn.utils.rnn.pack_padded_sequence(
x, input_lengths, batch_first=True, enforce_sorted=False)
block.flatten_parameters()
x, _ = block(x)
x, _ = nn.utils.rnn.pad_packed_sequence(
x, batch_first=True)
x = F.dropout(x, p=self.dropout, training=self.training)
x = x.transpose(-1, -2)
x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
x_pad[:, :, :x.shape[-1]] = x
x = x_pad.to(x.device)
return x.transpose(-1, -2)
def inference(self, x, style):
x = self.embedding(x.transpose(-1, -2)) * np.sqrt(self.d_model)
style = style.expand(x.shape[0], x.shape[1], -1)
x = torch.cat([x, style], axis=-1)
src = self.pos_encoder(x)
output = self.transformer_encoder(src).transpose(0, 1)
return output
def length_to_mask(self, lengths):
mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
mask = torch.gt(mask+1, lengths.unsqueeze(1))
return mask
# https://github.com/yl4579/StyleTTS2/blob/main/utils.py
def recursive_munch(d):
if isinstance(d, dict):
return Munch((k, recursive_munch(v)) for k, v in d.items())
elif isinstance(d, list):
return [recursive_munch(v) for v in d]
else:
return d
def build_model(path, device):
config = Path(__file__).parent / 'config.json'
assert config.exists(), f'Config path incorrect: config.json not found at {config}'
with open(config, 'r') as r:
args = recursive_munch(json.load(r))
assert args.decoder.type == 'istftnet', f'Unknown decoder type: {args.decoder.type}'
decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
upsample_rates = args.decoder.upsample_rates,
upsample_initial_channel=args.decoder.upsample_initial_channel,
resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
bert = load_plbert()
bert_encoder = nn.Linear(bert.config.hidden_size, args.hidden_dim)
for parent in [bert, bert_encoder, predictor, decoder, text_encoder]:
for child in parent.children():
if isinstance(child, nn.RNNBase):
child.flatten_parameters()
model = Munch(
bert=bert.to(device).eval(),
bert_encoder=bert_encoder.to(device).eval(),
predictor=predictor.to(device).eval(),
decoder=decoder.to(device).eval(),
text_encoder=text_encoder.to(device).eval(),
)
for key, state_dict in torch.load(path, map_location='cpu', weights_only=True)['net'].items():
assert key in model, key
try:
model[key].load_state_dict(state_dict)
except:
state_dict = {k[7:]: v for k, v in state_dict.items()}
model[key].load_state_dict(state_dict, strict=False)
return model

Some files were not shown because too many files have changed in this diff Show More