mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 00:59:28 -04:00
Compare commits
79 Commits
worktree-f
...
feat/llama
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4e9bb4f879 | ||
|
|
3b47122e54 | ||
|
|
379fa3e525 | ||
|
|
e47c58656f | ||
|
|
482314c623 | ||
|
|
e8ae88a2a0 | ||
|
|
e1994579f8 | ||
|
|
e5620989dd | ||
|
|
fc618dcee6 | ||
|
|
e6042080c0 | ||
|
|
0f3b24436d | ||
|
|
4b6f911835 | ||
|
|
a5e28942a6 | ||
|
|
dba9cd7ca4 | ||
|
|
c93190de50 | ||
|
|
4dbf69f889 | ||
|
|
deb430f3ec | ||
|
|
dd8c8778e2 | ||
|
|
06a7b6cadb | ||
|
|
67c8889866 | ||
|
|
1d49041c85 | ||
|
|
2edc4e25b3 | ||
|
|
7888067914 | ||
|
|
9eedbf537a | ||
|
|
69c16481c8 | ||
|
|
56f8a6623f | ||
|
|
4755d676a3 | ||
|
|
10184b5e28 | ||
|
|
fdf475ec5f | ||
|
|
9d54a599b0 | ||
|
|
63bcbf6c12 | ||
|
|
95b058e1c5 | ||
|
|
f2abcc7503 | ||
|
|
62c99c10b3 | ||
|
|
7226bb9f30 | ||
|
|
569d9bbd9e | ||
|
|
682fb2718c | ||
|
|
20c643e1f6 | ||
|
|
64a4351f3a | ||
|
|
b7d67f5779 | ||
|
|
600dafd20b | ||
|
|
ce8a3e9266 | ||
|
|
a88d9d2de3 | ||
|
|
1cf1bf32e1 | ||
|
|
f45c6acc54 | ||
|
|
1a1bd57469 | ||
|
|
1f29e96030 | ||
|
|
64560a974b | ||
|
|
32c47706ae | ||
|
|
e58870a573 | ||
|
|
8fab1d2e45 | ||
|
|
7b462a0d51 | ||
|
|
aed181e6c1 | ||
|
|
a556cd9afc | ||
|
|
b50b1fe418 | ||
|
|
b4c0dc67fe | ||
|
|
01fa12e0de | ||
|
|
cf7f9573a2 | ||
|
|
c6303104c7 | ||
|
|
3e96d811b7 | ||
|
|
23f225260c | ||
|
|
aef10723c9 | ||
|
|
9565db5f94 | ||
|
|
e19c43cf04 | ||
|
|
b081247d95 | ||
|
|
1be959ce30 | ||
|
|
518381278e | ||
|
|
93706fec57 | ||
|
|
11aee03a80 | ||
|
|
8915f2ab91 | ||
|
|
f143d7f688 | ||
|
|
dd928f0bdd | ||
|
|
c43a752afc | ||
|
|
079ac0e15a | ||
|
|
2e734bf560 | ||
|
|
72d46c1115 | ||
|
|
606128e4e9 | ||
|
|
59c7ad5153 | ||
|
|
78d682224a |
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
|
||||
- If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
|
||||
- Check similar backends to determine the correct context
|
||||
|
||||
## Documenting the backend (README + docs)
|
||||
|
||||
A backend is not "added" until it is discoverable. Update the user-facing docs:
|
||||
|
||||
- **`docs/content/features/backends.md`** - add the backend to the right
|
||||
category in the "LocalAI supports various types of backends" list (and add a
|
||||
new category if it introduces a new modality, e.g. sound classification).
|
||||
- If the backend introduces a **new API surface** (a new endpoint or a realtime
|
||||
capability), document it under `docs/content/` where its area lives (audio,
|
||||
vision, etc.) and follow the api-endpoints checklist in
|
||||
[api-endpoints-and-auth.md](api-endpoints-and-auth.md).
|
||||
|
||||
**If the backend is a native C/C++/GGML engine created and maintained by the
|
||||
LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
|
||||
`vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
|
||||
ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
|
||||
engines ... developed and maintained by the LocalAI project itself". Add a row
|
||||
linking the upstream engine repo with a one-line description. This is the
|
||||
project's showcase of its own engines; a new in-house backend that is missing
|
||||
from it is a documentation bug.
|
||||
|
||||
## 5. Verification Checklist
|
||||
|
||||
After adding a new backend, verify:
|
||||
@@ -211,6 +232,8 @@ After adding a new backend, verify:
|
||||
- [ ] No YAML syntax errors (check with linter)
|
||||
- [ ] No Makefile syntax errors (check with linter)
|
||||
- [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
|
||||
- [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
|
||||
- [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`
|
||||
|
||||
## Bundling runtime shared libraries (`package.sh`)
|
||||
|
||||
|
||||
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
|
||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
|
||||
# manifests. The LunarG SDK below only provides the loader and shader
|
||||
# tooling, not hardware drivers — without Mesa the packaged Vulkan backend
|
||||
# would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
|
||||
# .so files plus their deps into the backend so it stays self-contained.
|
||||
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||
if [ "amd64" = "${TARGETARCH:-}" ]; then
|
||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
|
||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
|
||||
|
||||
@@ -17,19 +17,25 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
|
||||
rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
|
||||
fi
|
||||
|
||||
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
|
||||
cd /LocalAI/backend/cpp/llama-cpp
|
||||
cd /LocalAI/backend/cpp/llama-cpp
|
||||
if [ "${BUILD_TYPE}" = "hipblas" ]; then
|
||||
# ROCm: the GPU does the compute, so a single fallback CPU build is enough.
|
||||
make llama-cpp-fallback
|
||||
make llama-cpp-grpc
|
||||
make llama-cpp-rpc-server
|
||||
else
|
||||
cd /LocalAI/backend/cpp/llama-cpp
|
||||
make llama-cpp-avx
|
||||
make llama-cpp-avx2
|
||||
make llama-cpp-avx512
|
||||
make llama-cpp-fallback
|
||||
make llama-cpp-grpc
|
||||
make llama-cpp-rpc-server
|
||||
# arm64: ggml's CPU_ALL_VARIANTS table includes armv9.2 SME variants whose
|
||||
# -march=...+sme is rejected by the Ubuntu 24.04 default gcc-13. gcc-14 accepts it, so
|
||||
# build the arm64 variants with gcc-14 (the host never *selects* SME unless it has it,
|
||||
# but every variant must still compile).
|
||||
if [ "${TARGETARCH}" = "arm64" ]; then
|
||||
apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
|
||||
export CC=gcc-14 CXX=g++-14
|
||||
fi
|
||||
# x86 and arm64: one build with ggml CPU_ALL_VARIANTS replaces the per-microarch
|
||||
# binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml dlopens the
|
||||
# best libggml-cpu-*.so at runtime by probing host CPU features.
|
||||
make llama-cpp-cpu-all
|
||||
fi
|
||||
make llama-cpp-grpc
|
||||
make llama-cpp-rpc-server
|
||||
|
||||
ccache -s || true
|
||||
|
||||
@@ -19,17 +19,19 @@ fi
|
||||
|
||||
cd /LocalAI/backend/cpp/turboquant
|
||||
|
||||
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
|
||||
if [ "${BUILD_TYPE}" = "hipblas" ]; then
|
||||
# ROCm: single fallback CPU build (GPU does the compute).
|
||||
make turboquant-fallback
|
||||
make turboquant-grpc
|
||||
make turboquant-rpc-server
|
||||
else
|
||||
make turboquant-avx
|
||||
make turboquant-avx2
|
||||
make turboquant-avx512
|
||||
make turboquant-fallback
|
||||
make turboquant-grpc
|
||||
make turboquant-rpc-server
|
||||
# arm64: the CPU_ALL_VARIANTS armv9.2 SME variants need gcc-14 (gcc-13 rejects +sme).
|
||||
if [ "${TARGETARCH}" = "arm64" ]; then
|
||||
apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
|
||||
export CC=gcc-14 CXX=g++-14
|
||||
fi
|
||||
# x86 and arm64: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries.
|
||||
make turboquant-cpu-all
|
||||
fi
|
||||
make turboquant-grpc
|
||||
make turboquant-rpc-server
|
||||
|
||||
ccache -s || true
|
||||
|
||||
152
.github/backend-matrix.yml
vendored
152
.github/backend-matrix.yml
vendored
@@ -3575,6 +3575,154 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# ced
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-12-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-13-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
skip-drivers: 'false'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
|
||||
base-image: "ubuntu:24.04"
|
||||
ubuntu-version: '2404'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-ced'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f32'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f32-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f16'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f16-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-ced'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
skip-drivers: 'false'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-arm64-ced'
|
||||
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2204'
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-rocm-hipblas-ced'
|
||||
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||
runs-on: 'ubuntu-latest'
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# acestep-cpp
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
@@ -4754,6 +4902,10 @@ includeDarwin:
|
||||
tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "ced"
|
||||
tag-suffix: "-metal-darwin-arm64-ced"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "acestep-cpp"
|
||||
tag-suffix: "-metal-darwin-arm64-acestep-cpp"
|
||||
build-type: "metal"
|
||||
|
||||
2
.github/workflows/backend.yml
vendored
2
.github/workflows/backend.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
||||
has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Setup Bun
|
||||
uses: oven-sh/setup-bun@v2
|
||||
|
||||
2
.github/workflows/backend_build.yml
vendored
2
.github/workflows/backend_build.yml
vendored
@@ -101,7 +101,7 @@ jobs:
|
||||
steps:
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
|
||||
2
.github/workflows/backend_build_darwin.yml
vendored
2
.github/workflows/backend_build_darwin.yml
vendored
@@ -57,7 +57,7 @@ jobs:
|
||||
HOMEBREW_NO_ANALYTICS: '1'
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
|
||||
2
.github/workflows/backend_merge.yml
vendored
2
.github/workflows/backend_merge.yml
vendored
@@ -49,7 +49,7 @@ jobs:
|
||||
# Sparse checkout: the merge job needs `.github/scripts/` (for the
|
||||
# keepalive cleanup script) but none of the source tree.
|
||||
- name: Checkout (.github/scripts only)
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
sparse-checkout: |
|
||||
.github/scripts
|
||||
|
||||
2
.github/workflows/backend_pr.yml
vendored
2
.github/workflows/backend_pr.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Setup Bun
|
||||
uses: oven-sh/setup-bun@v2
|
||||
|
||||
2
.github/workflows/base-images.yml
vendored
2
.github/workflows/base-images.yml
vendored
@@ -127,7 +127,7 @@ jobs:
|
||||
# the original l4t matrix entry which set skip-drivers: 'true'.
|
||||
skip-drivers: 'true'
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: false
|
||||
- name: Free disk space
|
||||
|
||||
6
.github/workflows/build-test.yaml
vendored
6
.github/workflows/build-test.yaml
vendored
@@ -11,7 +11,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Go
|
||||
@@ -25,7 +25,7 @@ jobs:
|
||||
runs-on: macos-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Go
|
||||
@@ -47,7 +47,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Configure apt mirror on runner
|
||||
|
||||
@@ -14,7 +14,7 @@ jobs:
|
||||
bump:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
|
||||
8
.github/workflows/bump_deps.yaml
vendored
8
.github/workflows/bump_deps.yaml
vendored
@@ -42,6 +42,10 @@ jobs:
|
||||
variable: "PARAKEET_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/go/parakeet-cpp/Makefile"
|
||||
- repository: "mudler/ced.cpp"
|
||||
variable: "CED_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/go/ced/Makefile"
|
||||
- repository: "mudler/depth-anything.cpp"
|
||||
variable: "DEPTHANYTHING_VERSION"
|
||||
branch: "master"
|
||||
@@ -88,7 +92,7 @@ jobs:
|
||||
file: "backend/go/vibevoice-cpp/Makefile"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Bump dependencies 🔧
|
||||
id: bump
|
||||
run: |
|
||||
@@ -124,7 +128,7 @@ jobs:
|
||||
if: github.repository == 'mudler/LocalAI'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Bump vLLM cu130 wheel pin 🔧
|
||||
id: bump
|
||||
run: |
|
||||
|
||||
2
.github/workflows/bump_docs.yaml
vendored
2
.github/workflows/bump_docs.yaml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
- repository: "mudler/LocalAI"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Bump dependencies 🔧
|
||||
run: |
|
||||
bash .github/bump_docs.sh ${{ matrix.repository }}
|
||||
|
||||
2
.github/workflows/checksum_checker.yaml
vendored
2
.github/workflows/checksum_checker.yaml
vendored
@@ -8,7 +8,7 @@ jobs:
|
||||
if: github.repository == 'mudler/LocalAI'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Configure apt mirror on runner
|
||||
uses: ./.github/actions/configure-apt-mirror
|
||||
- name: Install dependencies
|
||||
|
||||
2
.github/workflows/deploy-explorer.yaml
vendored
2
.github/workflows/deploy-explorer.yaml
vendored
@@ -16,7 +16,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- uses: actions/setup-go@v5
|
||||
|
||||
2
.github/workflows/gallery-agent.yaml
vendored
2
.github/workflows/gallery-agent.yaml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
2
.github/workflows/generate_intel_image.yaml
vendored
2
.github/workflows/generate_intel_image.yaml
vendored
@@ -44,7 +44,7 @@ jobs:
|
||||
uses: docker/setup-buildx-action@master
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Cache Intel images
|
||||
uses: docker/build-push-action@v7
|
||||
|
||||
2
.github/workflows/gh-pages.yml
vendored
2
.github/workflows/gh-pages.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
HUGO_VERSION: "0.146.3"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0 # needed for enableGitInfo
|
||||
submodules: true
|
||||
|
||||
2
.github/workflows/image_build.yml
vendored
2
.github/workflows/image_build.yml
vendored
@@ -80,7 +80,7 @@ jobs:
|
||||
steps:
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Configure apt mirror on runner
|
||||
id: apt_mirror
|
||||
|
||||
2
.github/workflows/image_merge.yml
vendored
2
.github/workflows/image_merge.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
# Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
|
||||
# script). Skips the rest of the source tree.
|
||||
- name: Checkout (.github/scripts only)
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
sparse-checkout: |
|
||||
.github/scripts
|
||||
|
||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -20,7 +20,7 @@ jobs:
|
||||
golangci-lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
with:
|
||||
# Full history so golangci-lint's new-from-merge-base can reach
|
||||
# origin/master and compute the diff against it.
|
||||
|
||||
6
.github/workflows/release.yaml
vendored
6
.github/workflows/release.yaml
vendored
@@ -10,7 +10,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Go
|
||||
@@ -28,7 +28,7 @@ jobs:
|
||||
runs-on: macos-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Go
|
||||
@@ -46,7 +46,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Configure apt mirror on runner
|
||||
|
||||
2
.github/workflows/secscan.yaml
vendored
2
.github/workflows/secscan.yaml
vendored
@@ -14,7 +14,7 @@ jobs:
|
||||
GO111MODULE: on
|
||||
steps:
|
||||
- name: Checkout Source
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
- name: Run Gosec Security Scanner
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
|
||||
86
.github/workflows/test-extra.yml
vendored
86
.github/workflows/test-extra.yml
vendored
@@ -50,7 +50,7 @@ jobs:
|
||||
parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
- name: Setup Bun
|
||||
uses: oven-sh/setup-bun@v2
|
||||
- name: Install dependencies
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -113,7 +113,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -158,7 +158,7 @@ jobs:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -178,7 +178,7 @@ jobs:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -240,7 +240,7 @@ jobs:
|
||||
# sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
|
||||
# df -h
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -265,7 +265,7 @@ jobs:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -288,7 +288,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -309,7 +309,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -330,7 +330,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -351,7 +351,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -373,7 +373,7 @@ jobs:
|
||||
# timeout-minutes: 45
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -394,7 +394,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -415,7 +415,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -436,7 +436,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -462,7 +462,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -484,7 +484,7 @@ jobs:
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -513,7 +513,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -530,7 +530,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -552,7 +552,7 @@ jobs:
|
||||
timeout-minutes: 20
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -579,7 +579,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -604,7 +604,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -625,7 +625,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -645,7 +645,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -664,7 +664,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -681,7 +681,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -698,7 +698,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -741,7 +741,7 @@ jobs:
|
||||
# timeout-minutes: 90
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -783,7 +783,7 @@ jobs:
|
||||
# timeout-minutes: 90
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -808,7 +808,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -840,7 +840,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -876,7 +876,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -915,7 +915,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -952,7 +952,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -987,7 +987,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -1013,7 +1013,7 @@ jobs:
|
||||
timeout-minutes: 150
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -1042,7 +1042,7 @@ jobs:
|
||||
timeout-minutes: 60
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -1058,7 +1058,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -1091,7 +1091,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -1114,7 +1114,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -1140,7 +1140,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
|
||||
4
.github/workflows/test.yml
vendored
4
.github/workflows/test.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
go-version: ['1.26.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Free disk space
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
go-version: ['1.26.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go ${{ matrix.go-version }}
|
||||
|
||||
2
.github/workflows/tests-aio.yml
vendored
2
.github/workflows/tests-aio.yml
vendored
@@ -62,7 +62,7 @@ jobs:
|
||||
sudo rm -rfv build || true
|
||||
df -h
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
|
||||
2
.github/workflows/tests-e2e.yml
vendored
2
.github/workflows/tests-e2e.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
go-version: ['1.25.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Configure apt mirror on runner
|
||||
|
||||
97
.github/workflows/tests-pii-ner-e2e.yml
vendored
Normal file
97
.github/workflows/tests-pii-ner-e2e.yml
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
---
|
||||
name: 'PII NER tier E2E (live GGUF, CPU)'
|
||||
|
||||
# Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
|
||||
# hermetic tests/e2e suite cannot cover (it only exercises the in-process
|
||||
# pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
|
||||
# GGUF), so it is path-filtered on PRs and otherwise runs nightly / on demand.
|
||||
#
|
||||
# This drives the container-level harness (tests/e2e-backends) via
|
||||
# `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
|
||||
# downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
|
||||
# TokenClassify spans. The complementary HTTP-path specs in tests/e2e
|
||||
# (e2e_pii_ner_test.go) skip themselves unless PII_NER_MODEL_GGUF is wired up.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 3 * * *'
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- 'backend/cpp/privacy-filter/**'
|
||||
- 'backend/Dockerfile.privacy-filter'
|
||||
- 'core/services/routing/pii/**'
|
||||
- 'core/services/routing/piidetector/**'
|
||||
- 'core/backend/token_classify.go'
|
||||
- 'core/http/endpoints/localai/pii.go'
|
||||
- 'core/schema/pii.go'
|
||||
- 'tests/e2e-backends/**'
|
||||
- 'tests/e2e/e2e_pii_ner_test.go'
|
||||
- 'tests/e2e/e2e_suite_test.go'
|
||||
- '.github/workflows/tests-pii-ner-e2e.yml'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'backend/cpp/privacy-filter/**'
|
||||
- 'backend/Dockerfile.privacy-filter'
|
||||
- 'core/services/routing/pii/**'
|
||||
- 'core/services/routing/piidetector/**'
|
||||
- 'core/backend/token_classify.go'
|
||||
- 'core/http/endpoints/localai/pii.go'
|
||||
- 'core/schema/pii.go'
|
||||
- 'tests/e2e-backends/**'
|
||||
- 'tests/e2e/e2e_pii_ner_test.go'
|
||||
- 'tests/e2e/e2e_suite_test.go'
|
||||
- '.github/workflows/tests-pii-ner-e2e.yml'
|
||||
|
||||
concurrency:
|
||||
group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
jobs:
|
||||
tests-pii-ner-e2e:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
go-version: ['1.25.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Free disk space
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
|
||||
sudo docker image prune --all --force || true
|
||||
df -h
|
||||
- name: Configure apt mirror on runner
|
||||
uses: ./.github/actions/configure-apt-mirror
|
||||
- name: Setup Go ${{ matrix.go-version }}
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: ${{ matrix.go-version }}
|
||||
cache: false
|
||||
- name: Proto Dependencies
|
||||
run: |
|
||||
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
|
||||
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||
rm protoc.zip
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
PATH="$PATH:$HOME/go/bin" make protogen-go
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential
|
||||
# Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
|
||||
# CPU and runs the token_classify capability spec (byte-offset contract).
|
||||
- name: Run live PII NER backend E2E
|
||||
run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.23
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
limit-access-to-actor: true
|
||||
2
.github/workflows/tests-ui-e2e.yml
vendored
2
.github/workflows/tests-ui-e2e.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
go-version: ['1.26.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Configure apt mirror on runner
|
||||
|
||||
2
.github/workflows/update_swagger.yaml
vendored
2
.github/workflows/update_swagger.yaml
vendored
@@ -10,7 +10,7 @@ jobs:
|
||||
fail-fast: false
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Configure apt mirror on runner
|
||||
uses: ./.github/actions/configure-apt-mirror
|
||||
- uses: actions/setup-go@v5
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -91,3 +91,6 @@ core/http/react-ui/test-results/
|
||||
|
||||
# Local worktrees
|
||||
.worktrees/
|
||||
|
||||
# SDD / brainstorm scratch (agent-driven development)
|
||||
.superpowers/
|
||||
|
||||
10
Makefile
10
Makefile
@@ -690,6 +690,16 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
|
||||
BACKEND_TEST_CTX_SIZE=2048 \
|
||||
$(MAKE) test-extra-backend
|
||||
|
||||
## privacy-filter: the PII/NER token-classification backend. Exercises the
|
||||
## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
|
||||
## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
|
||||
## active params). This is the live-backend coverage for the PII NER tier.
|
||||
test-extra-backend-privacy-filter: docker-build-privacy-filter
|
||||
BACKEND_IMAGE=local-ai-backend:privacy-filter \
|
||||
BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
|
||||
BACKEND_TEST_CAPS=health,load,token_classify \
|
||||
$(MAKE) test-extra-backend
|
||||
|
||||
## vllm is resolved from a HuggingFace model id (no file download) and
|
||||
## exercises Predict + streaming + tool-call extraction via the hermes parser.
|
||||
## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
|
||||
|
||||
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
|
||||
| Backend | What it does |
|
||||
|---------|-------------|
|
||||
| [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
|
||||
| [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
|
||||
| [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
|
||||
| [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
|
||||
| [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
|
||||
@@ -240,6 +241,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
|
||||
| [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
|
||||
| [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
|
||||
|
||||
We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
|
||||
|
||||
## Resources
|
||||
|
||||
- [Documentation](https://localai.io/)
|
||||
|
||||
@@ -65,7 +65,12 @@ RUN <<EOT bash
|
||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
|
||||
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
|
||||
# LunarG SDK below only provides the loader and shader tooling, not
|
||||
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
|
||||
# bundle and the packaged backend finds no GPU at runtime.
|
||||
if [ "amd64" = "$TARGETARCH" ]; then
|
||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||
|
||||
@@ -66,7 +66,12 @@ RUN <<EOT bash
|
||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
|
||||
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
|
||||
# LunarG SDK below only provides the loader and shader tooling, not
|
||||
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
|
||||
# bundle and the packaged backend finds no GPU at runtime.
|
||||
if [ "amd64" = "$TARGETARCH" ]; then
|
||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||
|
||||
@@ -24,6 +24,9 @@ service Backend {
|
||||
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
|
||||
rpc Status(HealthMessage) returns (StatusResponse) {}
|
||||
rpc Detect(DetectOptions) returns (DetectResponse) {}
|
||||
// SoundDetection runs an audio-tagging / sound-event-classification model
|
||||
// (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
|
||||
rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
|
||||
rpc Depth(DepthRequest) returns (DepthResponse) {}
|
||||
rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
|
||||
rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
|
||||
@@ -671,6 +674,24 @@ message DetectResponse {
|
||||
repeated Detection Detections = 1;
|
||||
}
|
||||
|
||||
// --- Sound-event classification / audio tagging messages (CED) ---
|
||||
|
||||
message SoundDetectionRequest {
|
||||
string src = 1; // audio file path (LocalAI writes the upload to disk)
|
||||
int32 top_k = 2; // number of top tags to return (0 = all classes)
|
||||
float threshold = 3; // optional: drop tags scoring below this
|
||||
}
|
||||
|
||||
message SoundClass {
|
||||
string label = 1; // AudioSet class name, e.g. "Baby cry, infant cry"
|
||||
float score = 2; // per-class probability (multi-label, independent)
|
||||
int32 index = 3; // class index in the model ontology
|
||||
}
|
||||
|
||||
message SoundDetectionResponse {
|
||||
repeated SoundClass detections = 1; // score-descending
|
||||
}
|
||||
|
||||
// --- Depth estimation messages (Depth Anything 3) ---
|
||||
|
||||
message DepthRequest {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
|
||||
IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
|
||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -50,8 +50,13 @@ add_custom_command(
|
||||
"${hw_proto}"
|
||||
DEPENDS "${hw_proto}")
|
||||
|
||||
# hw_grpc_proto
|
||||
add_library(hw_grpc_proto
|
||||
# hw_grpc_proto: force STATIC. Under the CPU_ALL_VARIANTS build BUILD_SHARED_LIBS=ON
|
||||
# (ggml/llama become shared), which would otherwise make this glue library a DSO. As a
|
||||
# DSO it references the hidden-visibility symbols in the static libprotobuf.a, which the
|
||||
# linker cannot satisfy ("hidden symbol ... in libprotobuf.a is referenced by DSO").
|
||||
# Keeping it STATIC links protobuf/gRPC directly into the grpc-server executable while
|
||||
# only ggml/llama stay shared. No effect on the static variants (already BUILD_SHARED_LIBS=OFF).
|
||||
add_library(hw_grpc_proto STATIC
|
||||
${hw_grpc_srcs}
|
||||
${hw_grpc_hdrs}
|
||||
${hw_proto_srcs}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
|
||||
LLAMA_VERSION?=be4a6a63eb2b848e19c277bdcf2bd399e8af76d9
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
@@ -10,8 +10,16 @@ TARGET?=--target grpc-server
|
||||
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
|
||||
ARCH?=$(shell uname -m)
|
||||
|
||||
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
|
||||
# Shared libs default to OFF: we link static gRPC and the avx/avx2/avx512/fallback
|
||||
# variants are fully static. The CPU_ALL_VARIANTS build flips SHARED_LIBS=ON (ggml/llama
|
||||
# become shared so the dynamic CPU backends work; gRPC stays static via its imported
|
||||
# targets). SHARED_LIBS is a make variable, not an appended -D, so it survives the
|
||||
# recursive sub-make into the VARIANT build dir (which re-parses this Makefile) instead
|
||||
# of being re-clobbered by a second -DBUILD_SHARED_LIBS=OFF. EXTRA_CMAKE_ARGS is the hook
|
||||
# the CPU_ALL_VARIANTS target uses to inject -DGGML_BACKEND_DL/-DGGML_CPU_ALL_VARIANTS.
|
||||
SHARED_LIBS?=OFF
|
||||
EXTRA_CMAKE_ARGS?=
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=$(SHARED_LIBS) -DLLAMA_CURL=OFF $(EXTRA_CMAKE_ARGS)
|
||||
|
||||
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
ifeq ($(NATIVE),false)
|
||||
@@ -120,6 +128,30 @@ llama-cpp-fallback: llama.cpp
|
||||
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
|
||||
|
||||
# Single-build CPU backend using ggml's CPU_ALL_VARIANTS. Produces ONE grpc-server
|
||||
# plus a set of dlopen-able libggml-cpu-*.so (sandybridge/haswell/skylakex/...) that
|
||||
# ggml's backend registry selects from at runtime by probing host CPU features.
|
||||
# Replaces the avx/avx2/avx512/fallback multi-binary build on x86.
|
||||
#
|
||||
# CPU_ALL_VARIANTS requires GGML_BACKEND_DL, which requires BUILD_SHARED_LIBS=ON, so we
|
||||
# pass SHARED_LIBS=ON and the DL flags as make variables (NOT pre-expanded into the
|
||||
# CMAKE_ARGS env string): command-line make variables propagate through every recursive
|
||||
# sub-make, so the deepest VARIANT-dir build computes BUILD_SHARED_LIBS=ON consistently.
|
||||
# Only ggml/llama go shared - gRPC is found via its static imported targets, so the
|
||||
# grpc-server binary keeps static gRPC and only dynamically links ggml.
|
||||
#
|
||||
# TARGET adds "ggml": the per-microarch backends are runtime-dlopened, not link deps of
|
||||
# grpc-server, so they only build because each is an add_dependencies() of the ggml target.
|
||||
llama-cpp-cpu-all: llama.cpp
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build purge
|
||||
$(info ${GREEN}I llama-cpp build info:cpu-all-variants${RESET})
|
||||
$(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all
|
||||
rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
|
||||
find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
|
||||
@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
|
||||
|
||||
llama-cpp-grpc: llama.cpp
|
||||
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
|
||||
|
||||
@@ -18,6 +18,18 @@
|
||||
#if __has_include("server-chat.cpp")
|
||||
#include "server-chat.cpp"
|
||||
#endif
|
||||
// server-schema.cpp exists only in llama.cpp after the upstream refactor that
|
||||
// extracted the JSON request-schema evaluation (previously the static
|
||||
// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
|
||||
// server-context.cpp and grpc-server.cpp both call into it, so its definitions
|
||||
// must be part of this translation unit or the link fails. __has_include keeps
|
||||
// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
|
||||
// predate the split and still expose params_from_json_cmpl (see the guarded
|
||||
// call sites below).
|
||||
#if __has_include("server-schema.cpp")
|
||||
#define LOCALAI_HAS_SERVER_SCHEMA 1
|
||||
#include "server-schema.cpp"
|
||||
#endif
|
||||
#include "server-context.cpp"
|
||||
|
||||
// LocalAI
|
||||
@@ -2102,7 +2114,11 @@ public:
|
||||
task.index = i;
|
||||
|
||||
task.tokens = std::move(inputs[i]);
|
||||
#ifdef LOCALAI_HAS_SERVER_SCHEMA
|
||||
task.params = server_schema::eval_llama_cmpl_schema(
|
||||
#else
|
||||
task.params = server_task::params_from_json_cmpl(
|
||||
#endif
|
||||
ctx_server.impl->vocab,
|
||||
params_base,
|
||||
ctx_server.get_meta().slot_n_ctx,
|
||||
@@ -2116,7 +2132,7 @@ public:
|
||||
// cannot detect tool calls or separate reasoning from content.
|
||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
// oaicompat_model is already populated by eval_llama_cmpl_schema
|
||||
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
@@ -2940,7 +2956,11 @@ public:
|
||||
task.index = i;
|
||||
|
||||
task.tokens = std::move(inputs[i]);
|
||||
#ifdef LOCALAI_HAS_SERVER_SCHEMA
|
||||
task.params = server_schema::eval_llama_cmpl_schema(
|
||||
#else
|
||||
task.params = server_task::params_from_json_cmpl(
|
||||
#endif
|
||||
ctx_server.impl->vocab,
|
||||
params_base,
|
||||
ctx_server.get_meta().slot_n_ctx,
|
||||
@@ -2952,7 +2972,7 @@ public:
|
||||
// reasoning, tool calls, and content are classified into ChatDeltas.
|
||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
// oaicompat_model is already populated by eval_llama_cmpl_schema
|
||||
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
@@ -14,6 +14,22 @@ mkdir -p $CURDIR/package/lib
|
||||
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
|
||||
cp -rfv $CURDIR/run.sh $CURDIR/package/
|
||||
|
||||
# Bundle the ggml shared backends produced by the CPU_ALL_VARIANTS build (libggml-base.so,
|
||||
# libggml.so, libllama.so and the per-microarch libggml-cpu-*.so), all into package/lib.
|
||||
#
|
||||
# Two distinct resolution mechanisms both land here:
|
||||
# - NEEDED deps (libggml-base/libggml/libllama): resolved by the dynamic linker via the
|
||||
# LD_LIBRARY_PATH=$CURDIR/lib that run.sh exports.
|
||||
# - The per-microarch libggml-cpu-*.so are NOT linked; ggml *discovers* them at runtime by
|
||||
# scanning the executable's own directory (readlink /proc/self/exe). run.sh launches via
|
||||
# the bundled $CURDIR/lib/ld.so, so /proc/self/exe -> .../lib/ld.so and ggml scans lib/.
|
||||
# That is why the variants must sit in lib/ (next to ld.so), not just on the link path.
|
||||
# No-op on builds (arm64/darwin) that don't produce the all-variants set.
|
||||
if [ -d "$CURDIR/ggml-shared-libs" ]; then
|
||||
echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
|
||||
cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
|
||||
fi
|
||||
|
||||
# Detect architecture and copy appropriate libraries
|
||||
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||
# x86_64 architecture
|
||||
|
||||
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1
|
||||
|
||||
BINARY=llama-cpp-fallback
|
||||
|
||||
if grep -q -e "\savx\s" /proc/cpuinfo ; then
|
||||
echo "CPU: AVX found OK"
|
||||
if [ -e $CURDIR/llama-cpp-avx ]; then
|
||||
BINARY=llama-cpp-avx
|
||||
fi
|
||||
fi
|
||||
|
||||
if grep -q -e "\savx2\s" /proc/cpuinfo ; then
|
||||
echo "CPU: AVX2 found OK"
|
||||
if [ -e $CURDIR/llama-cpp-avx2 ]; then
|
||||
BINARY=llama-cpp-avx2
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check avx 512
|
||||
if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
|
||||
echo "CPU: AVX512F found OK"
|
||||
if [ -e $CURDIR/llama-cpp-avx512 ]; then
|
||||
BINARY=llama-cpp-avx512
|
||||
fi
|
||||
# x86 ships a single llama-cpp-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's backend
|
||||
# registry dlopens the best libggml-cpu-*.so for this host, so no shell-side AVX probing.
|
||||
# arm64/darwin builds ship only llama-cpp-fallback, so fall back to it when cpu-all absent.
|
||||
if [ -e $CURDIR/llama-cpp-cpu-all ]; then
|
||||
BINARY=llama-cpp-cpu-all
|
||||
fi
|
||||
|
||||
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
# Local development: point at a working checkout instead of cloning, e.g.
|
||||
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
||||
|
||||
PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
|
||||
PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
|
||||
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
||||
PRIVACY_FILTER_SRC?=
|
||||
|
||||
|
||||
@@ -65,6 +65,29 @@ turboquant-avx:
|
||||
turboquant-fallback:
|
||||
$(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
||||
|
||||
# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
|
||||
# turboquant reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
|
||||
# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same overrides
|
||||
# through to the copied build: SHARED_LIBS=ON, the DL flags, and --target ggml (which
|
||||
# pulls in the per-microarch libggml-cpu-*.so via ggml's add_dependencies). The .so set
|
||||
# is collected for package.sh to bundle into package/lib.
|
||||
turboquant-cpu-all:
|
||||
rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
|
||||
cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build purge
|
||||
bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server.cpp
|
||||
$(info $(GREEN)I turboquant build info:cpu-all-variants$(RESET))
|
||||
LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build llama.cpp
|
||||
bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp $(PATCHES_DIR)
|
||||
SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" \
|
||||
LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
|
||||
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build grpc-server
|
||||
cp -rfv $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server turboquant-cpu-all
|
||||
rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
|
||||
find $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
|
||||
@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
|
||||
|
||||
turboquant-grpc:
|
||||
$(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)
|
||||
|
||||
|
||||
@@ -14,6 +14,15 @@ mkdir -p $CURDIR/package/lib
|
||||
cp -avrf $CURDIR/turboquant-* $CURDIR/package/
|
||||
cp -rfv $CURDIR/run.sh $CURDIR/package/
|
||||
|
||||
# Bundle the ggml shared backends from the CPU_ALL_VARIANTS build into package/lib. ggml
|
||||
# discovers the per-microarch libggml-cpu-*.so by scanning the executable directory, which
|
||||
# (via the bundled lib/ld.so that run.sh launches through) resolves to lib/. See the
|
||||
# matching comment in backend/cpp/llama-cpp/package.sh. No-op on the fallback/ROCm builds.
|
||||
if [ -d "$CURDIR/ggml-shared-libs" ]; then
|
||||
echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
|
||||
cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
|
||||
fi
|
||||
|
||||
# Detect architecture and copy appropriate libraries
|
||||
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||
# x86_64 architecture
|
||||
|
||||
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1
|
||||
|
||||
BINARY=turboquant-fallback
|
||||
|
||||
if grep -q -e "\savx\s" /proc/cpuinfo ; then
|
||||
echo "CPU: AVX found OK"
|
||||
if [ -e $CURDIR/turboquant-avx ]; then
|
||||
BINARY=turboquant-avx
|
||||
fi
|
||||
fi
|
||||
|
||||
if grep -q -e "\savx2\s" /proc/cpuinfo ; then
|
||||
echo "CPU: AVX2 found OK"
|
||||
if [ -e $CURDIR/turboquant-avx2 ]; then
|
||||
BINARY=turboquant-avx2
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check avx 512
|
||||
if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
|
||||
echo "CPU: AVX512F found OK"
|
||||
if [ -e $CURDIR/turboquant-avx512 ]; then
|
||||
BINARY=turboquant-avx512
|
||||
fi
|
||||
# x86/arm64 ship a single turboquant-cpu-all built with ggml CPU_ALL_VARIANTS: ggml's
|
||||
# backend registry dlopens the best libggml-cpu-*.so for this host, so no shell-side
|
||||
# probing. ROCm ships only turboquant-fallback, so fall back to it when cpu-all is absent.
|
||||
if [ -e $CURDIR/turboquant-cpu-all ]; then
|
||||
BINARY=turboquant-cpu-all
|
||||
fi
|
||||
|
||||
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
|
||||
|
||||
11
backend/go/ced/.gitignore
vendored
Normal file
11
backend/go/ced/.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
.cache/
|
||||
sources/
|
||||
build/
|
||||
package/
|
||||
ced-grpc
|
||||
# build artifacts staged in-tree by the Makefile (cp from sources/) or
|
||||
# symlinked for local dev; the real sources live in ced.cpp upstream.
|
||||
*.so
|
||||
*.so.*
|
||||
ced_capi.h
|
||||
compile_commands.json
|
||||
77
backend/go/ced/Makefile
Normal file
77
backend/go/ced/Makefile
Normal file
@@ -0,0 +1,77 @@
|
||||
# ced sound-classification backend Makefile.
|
||||
#
|
||||
# Upstream pin lives below as CED_VERSION?=<sha> so .github/bump_deps.sh can find
|
||||
# and update it (matches the parakeet-cpp / whisper.cpp convention).
|
||||
#
|
||||
# Local dev shortcut: symlink an out-of-tree ced.cpp shared build + header and
|
||||
# skip the clone/cmake steps entirely:
|
||||
# ln -sf /path/to/ced.cpp/build-shared/libced.so .
|
||||
# ln -sf /path/to/ced.cpp/include/ced_capi.h .
|
||||
# go build -o ced-grpc .
|
||||
|
||||
CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
|
||||
CED_REPO?=https://github.com/mudler/ced.cpp
|
||||
|
||||
GOCMD?=go
|
||||
GO_TAGS?=
|
||||
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||
|
||||
BUILD_TYPE?=
|
||||
NATIVE?=false
|
||||
|
||||
# Static-link ggml into libced.so (PIC) so the shared lib is self-contained:
|
||||
# dlopen needs no libggml*.so alongside it, only system libs the runtime image
|
||||
# already provides.
|
||||
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||
|
||||
ifeq ($(NATIVE),false)
|
||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||
endif
|
||||
|
||||
# ced.cpp gates its ggml backends behind CED_GGML_* options (set(... CACHE BOOL
|
||||
# "" FORCE)), so forward those instead of a bare -DGGML_CUDA=ON.
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DCED_GGML_HIP=ON
|
||||
else ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
|
||||
endif
|
||||
|
||||
.PHONY: ced-grpc package build clean purge test all
|
||||
|
||||
all: ced-grpc
|
||||
|
||||
sources/ced.cpp:
|
||||
mkdir -p sources/ced.cpp
|
||||
cd sources/ced.cpp && \
|
||||
git init -q && \
|
||||
git remote add origin $(CED_REPO) && \
|
||||
git fetch --depth 1 origin $(CED_VERSION) && \
|
||||
git checkout FETCH_HEAD && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
libced.so: sources/ced.cpp
|
||||
cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
|
||||
cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
|
||||
cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
|
||||
cp -fv sources/ced.cpp/include/ced_capi.h ./
|
||||
|
||||
ced-grpc: libced.so main.go goced.go
|
||||
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .
|
||||
|
||||
package: ced-grpc
|
||||
bash package.sh
|
||||
|
||||
build: package
|
||||
|
||||
test:
|
||||
LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
|
||||
|
||||
clean: purge
|
||||
rm -rf libced.so* ced_capi.h package ced-grpc
|
||||
|
||||
purge:
|
||||
rm -rf sources/ced.cpp
|
||||
130
backend/go/ced/goced.go
Normal file
130
backend/go/ced/goced.go
Normal file
@@ -0,0 +1,130 @@
|
||||
package main
|
||||
|
||||
// Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
|
||||
// SoundDetection implementation.
|
||||
//
|
||||
// SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
|
||||
// `make protogen-go`). The C side is single-threaded per ctx, so we guard the
|
||||
// engine with engineMu; LocalAI also serializes via base.SingleThread.
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
// Function variables for the flat C API exported by libced.so. Each one is
// named after the ced_capi_* entry point it wraps (see ced_capi.h) and stays
// nil until it has been registered against the loaded library.
var (
	// Library identification.
	CppAbiVersion func() int32

	// Context lifecycle and diagnostics. A context is an opaque handle that
	// is carried around as a uintptr.
	CppLoad      func(ggufPath string) uintptr
	CppFree      func(ctx uintptr)
	CppLastError func(ctx uintptr) string

	// Model metadata.
	CppNumClasses func(ctx uintptr) int32
	CppSampleRate func(ctx uintptr) int32

	// Classification. The JSON result comes back as a C string pointer
	// (uintptr) that has to be released with CppFreeString; see cstr.
	CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr
	CppClassifyPcmJSON  func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr
	CppFreeString       func(s uintptr)
)
|
||||
|
||||
// cstr copies a malloc'd C string (returned as uintptr) into a Go string and
|
||||
// frees the original via ced_capi_free_string. Empty/0 -> "".
|
||||
func cstr(p uintptr) string {
|
||||
if p == 0 {
|
||||
return ""
|
||||
}
|
||||
defer CppFreeString(p)
|
||||
var b []byte
|
||||
for i := 0; ; i++ {
|
||||
ch := *(*byte)(unsafe.Pointer(p + uintptr(i))) //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
|
||||
if ch == 0 {
|
||||
break
|
||||
}
|
||||
b = append(b, ch)
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
// Ced is the gRPC backend. One loaded CED model per instance.
|
||||
type Ced struct {
|
||||
base.Base
|
||||
ctxPtr uintptr
|
||||
engineMu sync.Mutex
|
||||
}
|
||||
|
||||
// Load resolves the GGUF and opens the C-API context.
|
||||
func (c *Ced) Load(opts *pb.ModelOptions) error {
|
||||
if opts.ModelFile == "" {
|
||||
return errors.New("ced: ModelFile is required")
|
||||
}
|
||||
ctx := CppLoad(opts.ModelFile)
|
||||
if ctx == 0 {
|
||||
return fmt.Errorf("ced: ced_capi_load failed for %q: %s", opts.ModelFile, CppLastError(0))
|
||||
}
|
||||
c.ctxPtr = ctx
|
||||
return nil
|
||||
}
|
||||
|
||||
// jsonTag is the Go shape of one element of the JSON array produced by the
// ced_capi classify calls.
type jsonTag struct {
	// Index is decoded from the "index" key.
	Index int `json:"index"`
	// Score is decoded from the "score" key.
	Score float32 `json:"score"`
	// Label is decoded from the "label" key.
	Label string `json:"label"`
}
|
||||
|
||||
// SoundDetection classifies the clip at req.Src and returns scored AudioSet tags.
|
||||
func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
|
||||
if c.ctxPtr == 0 {
|
||||
return nil, errors.New("ced: model not loaded")
|
||||
}
|
||||
if req.GetSrc() == "" {
|
||||
return nil, errors.New("ced: SoundDetectionRequest.src (audio path) is required")
|
||||
}
|
||||
topK := req.GetTopK()
|
||||
if topK <= 0 {
|
||||
topK = 10 // sensible default for a tagging response
|
||||
}
|
||||
|
||||
c.engineMu.Lock()
|
||||
out := cstr(CppClassifyPathJSON(c.ctxPtr, req.GetSrc(), topK))
|
||||
lastErr := CppLastError(c.ctxPtr)
|
||||
c.engineMu.Unlock()
|
||||
|
||||
if out == "" {
|
||||
return nil, fmt.Errorf("ced: classification failed: %s", lastErr)
|
||||
}
|
||||
var tags []jsonTag
|
||||
if err := json.Unmarshal([]byte(out), &tags); err != nil {
|
||||
return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
|
||||
}
|
||||
|
||||
thr := req.GetThreshold()
|
||||
resp := &pb.SoundDetectionResponse{}
|
||||
for _, t := range tags {
|
||||
if t.Score < thr {
|
||||
continue
|
||||
}
|
||||
resp.Detections = append(resp.Detections, &pb.SoundClass{
|
||||
Label: t.Label, Score: t.Score, Index: int32(t.Index),
|
||||
})
|
||||
}
|
||||
sort.Slice(resp.Detections, func(i, j int) bool {
|
||||
return resp.Detections[i].Score > resp.Detections[j].Score
|
||||
})
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func (c *Ced) Free() error {
|
||||
c.engineMu.Lock()
|
||||
defer c.engineMu.Unlock()
|
||||
if c.ctxPtr != 0 {
|
||||
CppFree(c.ctxPtr)
|
||||
c.ctxPtr = 0
|
||||
}
|
||||
return nil
|
||||
}
|
||||
59
backend/go/ced/main.go
Normal file
59
backend/go/ced/main.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package main
|
||||
|
||||
// ced sound-classification backend. Started internally by LocalAI: one gRPC
|
||||
// server per loaded model. Loads libced.so via purego and registers the flat
|
||||
// C-API declared in ced_capi.h. The library name can be overridden with
|
||||
// CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default looks
|
||||
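// for libced.so on the dynamic loader's search path (run.sh puts the package's lib/ directory and the binary's own directory on LD_LIBRARY_PATH).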
|
||||
//
|
||||
// SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
|
||||
// addition, and a built libced.so (see Makefile). See DESIGN.md.
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/ebitengine/purego"
|
||||
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
// addr is the value of the -addr command-line flag; it is handed to the gRPC
// server on startup.
var addr = flag.String("addr", "localhost:50051", "the address to connect to")

// libFunc pairs a pointer to a Go function variable with the name of the
// library symbol that should be bound to it.
type libFunc struct {
	ptr  any    // pointer to the function variable to fill in
	name string // exported symbol name in the shared library
}
|
||||
|
||||
func main() {
|
||||
libName := os.Getenv("CED_LIBRARY")
|
||||
if libName == "" {
|
||||
libName = "libced.so"
|
||||
}
|
||||
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
|
||||
}
|
||||
|
||||
// Bound 1:1 to ced_capi.h. char*-returning functions are declared uintptr
|
||||
// so we can free the same pointer with ced_capi_free_string after copying
|
||||
// (purego's string return would copy and leak the original).
|
||||
for _, lf := range []libFunc{
|
||||
{&CppAbiVersion, "ced_capi_abi_version"},
|
||||
{&CppLoad, "ced_capi_load"},
|
||||
{&CppFree, "ced_capi_free"},
|
||||
{&CppLastError, "ced_capi_last_error"},
|
||||
{&CppNumClasses, "ced_capi_num_classes"},
|
||||
{&CppSampleRate, "ced_capi_sample_rate"},
|
||||
{&CppClassifyPathJSON, "ced_capi_classify_path_json"},
|
||||
{&CppClassifyPcmJSON, "ced_capi_classify_pcm_json"},
|
||||
{&CppFreeString, "ced_capi_free_string"},
|
||||
} {
|
||||
purego.RegisterLibFunc(lf.ptr, lib, lf.name)
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
|
||||
flag.Parse()
|
||||
if err := grpc.StartServer(*addr, &Ced{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
60
backend/go/ced/package.sh
Executable file
60
backend/go/ced/package.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
#
# Assemble a self-contained package/ directory for the ced backend:
#   package/ced-grpc, package/run.sh   - the binary and its launcher
#   package/lib/libced.so*             - the native library
#   package/lib/ld.so + core libs      - dynamic loader, libc, libstdc++, libgomp, ...
#   GPU runtime for the active BUILD_TYPE (via scripts/build/package-gpu-libs.sh)
# Mirrors backend/go/parakeet-cpp/package.sh; run.sh routes the (CGO_ENABLED=0)
# binary through lib/ld.so so the packaged libc is used.

set -e

CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

# ced_copy_runtime_libs LOADER LIBDIR
# Copies the dynamic loader LOADER to package/lib/ld.so and the core runtime
# libraries found in LIBDIR into package/lib/, dereferencing symlinks.
ced_copy_runtime_libs() {
    local loader="$1"
    local libdir="$2"
    local lib

    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 \
               libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
        cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
    done
}

mkdir -p "$CURDIR/package/lib"

cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

# libced.so plus any versioned siblings; a missing library is a hard error.
cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || {
    echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
    exit 1
}

# Pick the runtime libraries by looking for the architecture's loader.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    ced_copy_runtime_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    ced_copy_runtime_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
elif [ "$(uname -s)" = "Darwin" ]; then
    # Nothing is bundled on Darwin.
    echo "Detected Darwin"
else
    echo "Error: Could not detect architecture"
    exit 1
fi

# GPU runtime libraries are packaged only when the shared helper script exists.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
||||
15
backend/go/ced/run.sh
Executable file
15
backend/go/ced/run.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
# Launcher for the ced backend: puts the packaged libraries on the loader path
# and replaces this shell with ced-grpc, forwarding all arguments.
set -e

CURDIR=$(dirname "$(realpath "$0")")

export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"

# Without a packaged loader, start the binary directly.
if [ ! -f "$CURDIR/lib/ld.so" ]; then
    exec "$CURDIR/ced-grpc" "$@"
fi

# A self-contained ld.so was packaged: route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the sibling backends).
echo "Using lib/ld.so"
exec "$CURDIR/lib/ld.so" "$CURDIR/ced-grpc" "$@"
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# CrispASR version (release tag)
|
||||
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
|
||||
CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
|
||||
CRISPASR_VERSION?=96b2a6ee31d30389fed8a7ef1a54239b75231ddc
|
||||
SO_TARGET?=libgocrispasr.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
@@ -67,7 +67,7 @@ sources/CrispASR:
|
||||
# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
|
||||
# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
|
||||
# which is correct both standalone and as a subproject. Idempotent.
|
||||
sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
|
||||
sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak
|
||||
|
||||
# Detect OS
|
||||
UNAME_S := $(shell uname -s)
|
||||
|
||||
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
|
||||
g_abort.store(v, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
// --- word-level timestamp accessors ---
|
||||
extern "C" {
|
||||
int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
|
||||
const char *crispasr_session_result_word_text(crispasr_session_result *r,
|
||||
int seg_i, int word_i);
|
||||
int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
|
||||
int word_i);
|
||||
int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
|
||||
int word_i);
|
||||
|
||||
// Parakeet-specific word accessors
|
||||
int crispasr_parakeet_result_n_words(void *r);
|
||||
const char *crispasr_parakeet_result_word_text(void *r, int word_i);
|
||||
int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
|
||||
int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
|
||||
}
|
||||
|
||||
// Returns the current global result pointer as an untyped handle.
void *get_result(void) {
  return g_result;
}
|
||||
|
||||
int get_word_count(int seg_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_n_words(g_result, seg_i);
|
||||
}
|
||||
|
||||
const char *get_word_text(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return "";
|
||||
return crispasr_session_result_word_text(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
int64_t get_word_t0(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_word_t0(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
int64_t get_word_t1(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_word_t1(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
// Parakeet-specific word accessors
|
||||
int get_parakeet_word_count(void) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_n_words(g_result);
|
||||
}
|
||||
|
||||
const char *get_parakeet_word_text(int word_i) {
|
||||
if (!g_result)
|
||||
return "";
|
||||
return crispasr_parakeet_result_word_text(g_result, word_i);
|
||||
}
|
||||
|
||||
int64_t get_parakeet_word_t0(int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_word_t0(g_result, word_i);
|
||||
}
|
||||
|
||||
int64_t get_parakeet_word_t1(int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_word_t1(g_result, word_i);
|
||||
}
|
||||
|
||||
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
||||
void *data) {
|
||||
const char *level_str;
|
||||
|
||||
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
|
||||
void tts_free(float *pcm);
|
||||
int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
|
||||
int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
|
||||
|
||||
// --- word-level timestamp accessors ---
|
||||
// Session-based (works for whisper-like backends)
|
||||
void *get_result(void);
|
||||
int get_word_count(int seg_i);
|
||||
const char *get_word_text(int seg_i, int word_i);
|
||||
int64_t get_word_t0(int seg_i, int word_i);
|
||||
int64_t get_word_t1(int seg_i, int word_i);
|
||||
|
||||
// Parakeet-specific (global word list, no segment index)
|
||||
int get_parakeet_word_count(void);
|
||||
const char *get_parakeet_word_text(int word_i);
|
||||
int64_t get_parakeet_word_t0(int word_i);
|
||||
int64_t get_parakeet_word_t1(int word_i);
|
||||
}
|
||||
|
||||
@@ -34,6 +34,18 @@ var (
|
||||
CppTTSFree func(ptr uintptr)
|
||||
CppTTSSetVoice func(name string) int
|
||||
CppTTSSetVoiceFile func(path string, refText string) int
|
||||
|
||||
// Word-level timestamp accessors (session-based, per-segment)
|
||||
CppGetWordCount func(segI int) int
|
||||
CppGetWordText func(segI int, wordI int) string
|
||||
CppGetWordT0 func(segI int, wordI int) int64
|
||||
CppGetWordT1 func(segI int, wordI int) int64
|
||||
|
||||
// Parakeet-specific word accessors (global, no segment index)
|
||||
CppGetParakeetWordCount func() int
|
||||
CppGetParakeetWordText func(wordI int) string
|
||||
CppGetParakeetWordT0 func(wordI int) int64
|
||||
CppGetParakeetWordT1 func(wordI int) int64
|
||||
)
|
||||
|
||||
type CrispASR struct {
|
||||
@@ -212,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isValidWord reports whether a TranscriptWord contains recognisable speech
|
||||
// content. The parakeet-specific word accessors can return stale initialisation
|
||||
// data (model name, binary blobs) when a segment has no real speech. A word is
|
||||
// considered valid only when:
|
||||
// - the text is non-empty after trimming,
|
||||
// - it contains no U+FFFD replacement characters (from binary data scrubbing),
|
||||
// - both timestamps are non-negative,
|
||||
// - the word has positive duration (end > start).
|
||||
func isValidWord(w *pb.TranscriptWord) bool {
|
||||
txt := strings.TrimSpace(w.Text)
|
||||
if txt == "" {
|
||||
return false
|
||||
}
|
||||
if strings.ContainsRune(txt, '\uFFFD') {
|
||||
return false
|
||||
}
|
||||
if w.Start < 0 || w.End < 0 || w.End <= w.Start {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
|
||||
@@ -290,15 +324,54 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
|
||||
// IDs, so Tokens is left empty.
|
||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
||||
|
||||
// Populate word-level timestamps. Try session-based functions first
|
||||
// (per-segment); fall back to parakeet-specific functions (global word
|
||||
// list with no segment index — only populated on the first segment to
|
||||
// avoid duplication).
|
||||
words := []*pb.TranscriptWord{}
|
||||
wordCount := CppGetWordCount(i)
|
||||
if wordCount == 0 && i == 0 {
|
||||
wordCount = CppGetParakeetWordCount()
|
||||
for j := 0; j < wordCount; j++ {
|
||||
w := &pb.TranscriptWord{
|
||||
Start: CppGetParakeetWordT0(j) * (10000000),
|
||||
End: CppGetParakeetWordT1(j) * (10000000),
|
||||
Text: strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "<22>"),
|
||||
}
|
||||
if isValidWord(w) {
|
||||
words = append(words, w)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for j := 0; j < wordCount; j++ {
|
||||
w := &pb.TranscriptWord{
|
||||
Start: CppGetWordT0(i, j) * (10000000),
|
||||
End: CppGetWordT1(i, j) * (10000000),
|
||||
Text: strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "<22>"),
|
||||
}
|
||||
if isValidWord(w) {
|
||||
words = append(words, w)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Skip empty segments with no recognisable content (e.g. trailing
|
||||
// silence segments that parakeet emits with stale init data).
|
||||
trimmed := strings.TrimSpace(txt)
|
||||
if trimmed == "" && len(words) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
segment := &pb.TranscriptSegment{
|
||||
Id: int32(i),
|
||||
Text: txt,
|
||||
Start: s, End: t,
|
||||
Words: words,
|
||||
}
|
||||
|
||||
segments = append(segments, segment)
|
||||
|
||||
text += " " + strings.TrimSpace(txt)
|
||||
text += " " + trimmed
|
||||
}
|
||||
|
||||
return pb.TranscriptResult{
|
||||
@@ -390,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
|
||||
s := CppGetSegmentStart(i) * 10000000
|
||||
t := CppGetSegmentEnd(i) * 10000000
|
||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
||||
|
||||
// Skip empty segments (e.g. trailing silence that parakeet emits
|
||||
// with stale init data).
|
||||
trimmed := strings.TrimSpace(txt)
|
||||
if trimmed == "" && s == t {
|
||||
continue
|
||||
}
|
||||
|
||||
segments = append(segments, &pb.TranscriptSegment{
|
||||
Id: int32(i),
|
||||
Text: txt,
|
||||
Start: s, End: t,
|
||||
})
|
||||
|
||||
trimmed := strings.TrimSpace(txt)
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -44,6 +44,14 @@ func main() {
|
||||
{&CppTTSFree, "tts_free"},
|
||||
{&CppTTSSetVoice, "tts_set_voice"},
|
||||
{&CppTTSSetVoiceFile, "tts_set_voice_file"},
|
||||
{&CppGetWordCount, "get_word_count"},
|
||||
{&CppGetWordText, "get_word_text"},
|
||||
{&CppGetWordT0, "get_word_t0"},
|
||||
{&CppGetWordT1, "get_word_t1"},
|
||||
{&CppGetParakeetWordCount, "get_parakeet_word_count"},
|
||||
{&CppGetParakeetWordText, "get_parakeet_word_text"},
|
||||
{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
|
||||
{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
|
||||
}
|
||||
|
||||
for _, lf := range libFuncs {
|
||||
|
||||
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# depth-anything.cpp. Pin to a specific commit for a stable build; a squash
|
||||
# merge upstream can orphan a branch, so the native version is pinned by SHA.
|
||||
# This SHA adds the nested two-file metric C-API (abi_version 4,
|
||||
# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
|
||||
# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
|
||||
# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
|
||||
# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
|
||||
# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
|
||||
# It is kept alive by the upstream tag da2-support (survives a squash-merge);
|
||||
# repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
|
||||
DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
|
||||
DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
|
||||
DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118
|
||||
|
||||
ifeq ($(NATIVE),false)
|
||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# omnivoice.cpp version
|
||||
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
||||
OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
|
||||
OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
|
||||
SO_TARGET?=libgomnivoicecpp.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# parakeet-cpp backend Makefile.
|
||||
#
|
||||
# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
|
||||
# Upstream pin lives below as PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
|
||||
# (.github/bump_deps.sh) can find and update it - matches the
|
||||
# whisper.cpp / ds4 / vibevoice-cpp convention.
|
||||
#
|
||||
@@ -15,7 +15,7 @@
|
||||
# That's what the L0 smoke test uses. The default target below does the
|
||||
# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
|
||||
|
||||
PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
|
||||
PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a
|
||||
PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
|
||||
|
||||
GOCMD?=go
|
||||
|
||||
@@ -1,23 +1,68 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
|
||||
# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
|
||||
# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
|
||||
# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
|
||||
# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
|
||||
# BUILD_TYPE so the package is self-contained. Mirrors
|
||||
# backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
|
||||
# through lib/ld.so so the packaged libc is used instead of the host's.
|
||||
|
||||
set -e
|
||||
|
||||
CURDIR=$(dirname "$(realpath "$0")")
|
||||
REPO_ROOT="${CURDIR}/../../.."
|
||||
|
||||
mkdir -p "$CURDIR/package/lib"
|
||||
|
||||
cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
|
||||
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
|
||||
|
||||
# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
|
||||
# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
|
||||
# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
|
||||
cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
|
||||
echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "L0 package layout (full ldd walk lands in L3):"
|
||||
# Detect architecture and copy the core runtime libs libparakeet.so links
|
||||
# against, plus the matching dynamic loader as lib/ld.so.
|
||||
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||
echo "Detected x86_64 architecture, copying x86_64 libraries..."
|
||||
cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
|
||||
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
|
||||
echo "Detected ARM64 architecture, copying ARM64 libraries..."
|
||||
cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
|
||||
elif [ "$(uname -s)" = "Darwin" ]; then
|
||||
echo "Detected Darwin"
|
||||
else
|
||||
echo "Error: Could not detect architecture"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
|
||||
# based on BUILD_TYPE so the backend can reach the GPU without the runtime
|
||||
# base image shipping those drivers.
|
||||
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
|
||||
if [ -f "$GPU_LIB_SCRIPT" ]; then
|
||||
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
|
||||
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
|
||||
package_gpu_libs
|
||||
fi
|
||||
|
||||
echo "Packaging completed successfully"
|
||||
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# qwentts.cpp version
|
||||
QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
|
||||
QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
|
||||
QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
|
||||
SO_TARGET?=libgoqwen3ttscpp.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# stablediffusion.cpp (ggml)
|
||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||
STABLEDIFFUSION_GGML_VERSION?=7f0e728b7d42f2490dfa5dd9539082d904f2f6b2
|
||||
STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f
|
||||
|
||||
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84
|
||||
WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
|
||||
SO_TARGET?=libgowhisper.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -178,6 +178,37 @@
|
||||
nvidia-cuda-12: "cuda12-parakeet-cpp"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
|
||||
- &ced
|
||||
name: "ced"
|
||||
alias: "ced"
|
||||
license: mit
|
||||
icon: https://avatars.githubusercontent.com/u/95302084
|
||||
description: |
|
||||
CED sound-event classification / audio tagging (527-class AudioSet).
|
||||
ced.cpp is a C++/ggml port that performs audio tagging over the AudioSet
|
||||
taxonomy, exposed through the SoundDetection gRPC rpc and the
|
||||
/v1/audio/classification REST endpoint. It runs on CPU, NVIDIA CUDA,
|
||||
AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
|
||||
urls:
|
||||
- https://github.com/mudler/ced.cpp
|
||||
tags:
|
||||
- audio-classification
|
||||
- CPU
|
||||
- GPU
|
||||
- CUDA
|
||||
- HIP
|
||||
capabilities:
|
||||
default: "cpu-ced"
|
||||
nvidia: "cuda12-ced"
|
||||
intel: "intel-sycl-f16-ced"
|
||||
metal: "metal-ced"
|
||||
amd: "rocm-ced"
|
||||
vulkan: "vulkan-ced"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-ced"
|
||||
nvidia-cuda-13: "cuda13-ced"
|
||||
nvidia-cuda-12: "cuda12-ced"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
|
||||
- &voxtral
|
||||
name: "voxtral"
|
||||
alias: "voxtral"
|
||||
@@ -2650,6 +2681,121 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
|
||||
## ced
|
||||
- !!merge <<: *ced
|
||||
name: "ced-development"
|
||||
capabilities:
|
||||
default: "cpu-ced-development"
|
||||
nvidia: "cuda12-ced-development"
|
||||
intel: "intel-sycl-f16-ced-development"
|
||||
metal: "metal-ced-development"
|
||||
amd: "rocm-ced-development"
|
||||
vulkan: "vulkan-ced-development"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-ced-development"
|
||||
nvidia-cuda-13: "cuda13-ced-development"
|
||||
nvidia-cuda-12: "cuda12-ced-development"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced-development"
|
||||
- !!merge <<: *ced
|
||||
name: "nvidia-l4t-arm64-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "nvidia-l4t-arm64-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda13-nvidia-l4t-arm64-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda13-nvidia-l4t-arm64-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cpu-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cpu-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-ced
|
||||
- !!merge <<: *ced
|
||||
name: "metal-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "metal-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda12-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-12-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda12-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-12-ced
|
||||
- !!merge <<: *ced
|
||||
name: "rocm-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-rocm-hipblas-ced
|
||||
- !!merge <<: *ced
|
||||
name: "rocm-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-rocm-hipblas-ced
|
||||
- !!merge <<: *ced
|
||||
name: "intel-sycl-f32-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sycl-f32-ced
|
||||
- !!merge <<: *ced
|
||||
name: "intel-sycl-f32-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sycl-f32-ced
|
||||
- !!merge <<: *ced
|
||||
name: "intel-sycl-f16-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sycl-f16-ced
|
||||
- !!merge <<: *ced
|
||||
name: "intel-sycl-f16-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sycl-f16-ced
|
||||
- !!merge <<: *ced
|
||||
name: "vulkan-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-vulkan-ced
|
||||
- !!merge <<: *ced
|
||||
name: "vulkan-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-vulkan-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda13-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-13-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda13-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-ced
|
||||
## stablediffusion-ggml
|
||||
- !!merge <<: *stablediffusionggml
|
||||
name: "cpu-stablediffusion-ggml"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
torchvision==0.22.1
|
||||
accelerate
|
||||
git+https://github.com/xhinker/sd_embed
|
||||
@@ -10,9 +10,15 @@ sentencepiece
|
||||
torch==2.7.1
|
||||
optimum-quanto
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu121
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
torchvision
|
||||
accelerate
|
||||
git+https://github.com/xhinker/sd_embed
|
||||
@@ -10,9 +10,15 @@ sentencepiece
|
||||
torch
|
||||
ftfy
|
||||
optimum-quanto
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu130
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
torchvision
|
||||
accelerate
|
||||
git+https://github.com/xhinker/sd_embed
|
||||
@@ -10,9 +10,15 @@ sentencepiece
|
||||
torch
|
||||
ftfy
|
||||
optimum-quanto
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
|
||||
@@ -1,17 +1,23 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm7.0
|
||||
torch==2.10.0+rocm7.0
|
||||
torchvision==0.25.0+rocm7.0
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
peft
|
||||
sentencepiece
|
||||
optimum-quanto
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
@@ -3,18 +3,24 @@ torch
|
||||
torchvision
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
git+https://github.com/xhinker/sd_embed
|
||||
peft
|
||||
sentencepiece
|
||||
optimum-quanto
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
|
||||
torch
|
||||
git+https://github.com/huggingface/diffusers
|
||||
transformers
|
||||
diffusers==0.38.0
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
peft
|
||||
optimum-quanto
|
||||
@@ -9,9 +9,15 @@ numpy<2
|
||||
sentencepiece
|
||||
torchvision
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu130
|
||||
torch
|
||||
git+https://github.com/huggingface/diffusers
|
||||
transformers
|
||||
diffusers==0.38.0
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
peft
|
||||
optimum-quanto
|
||||
@@ -10,9 +10,15 @@ sentencepiece
|
||||
torchvision
|
||||
ftfy
|
||||
chardet
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
|
||||
@@ -1,16 +1,22 @@
|
||||
torch==2.7.1
|
||||
torchvision==0.22.1
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
peft
|
||||
sentencepiece
|
||||
optimum-quanto
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def _get_stride_seconds(self):
|
||||
"""Compute the seconds-per-frame stride for the loaded model.
|
||||
|
||||
stride = preprocessor_window_stride * encoder_subsampling_factor
|
||||
"""
|
||||
try:
|
||||
preprocessor = self.model.preprocessor
|
||||
window_stride = preprocessor._cfg.get('window_stride', 0.01)
|
||||
subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
|
||||
return window_stride * subsampling_factor
|
||||
except (AttributeError, KeyError, TypeError) as err:
|
||||
print(
|
||||
f"Warning: could not compute stride from model config ({err}), "
|
||||
f"falling back to 0.08s/frame",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 0.08
|
||||
|
||||
def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
|
||||
"""Build TranscriptSegment list from a NeMo Hypothesis with timestamps.
|
||||
|
||||
Supports two granularity modes:
|
||||
- "word": one TranscriptSegment per word, each with a single TranscriptWord entry
|
||||
- "segment" (default): merge consecutive words into sentence-level segments,
|
||||
splitting at word-level time gaps that exceed a dynamic threshold.
|
||||
"""
|
||||
if not hypothesis or not isinstance(hypothesis.timestamp, dict):
|
||||
return []
|
||||
|
||||
word_offsets = hypothesis.timestamp.get('word', [])
|
||||
if not word_offsets:
|
||||
return []
|
||||
|
||||
granularities = list(timestamp_granularities) if timestamp_granularities else []
|
||||
granularity = "word" if "word" in granularities else "segment"
|
||||
|
||||
# Build a flat list of (text, start_ns, end_ns) from NeMo word offsets
|
||||
transcript_words = []
|
||||
for wo in word_offsets:
|
||||
word_text = wo.get('word', '')
|
||||
if not word_text:
|
||||
continue
|
||||
start_offset = wo.get('start_offset', 0)
|
||||
end_offset = wo.get('end_offset', start_offset)
|
||||
start_ns = int(start_offset * stride * 1_000_000_000)
|
||||
end_ns = int(end_offset * stride * 1_000_000_000)
|
||||
transcript_words.append({
|
||||
'text': word_text,
|
||||
'start': start_ns,
|
||||
'end': end_ns,
|
||||
})
|
||||
|
||||
if not transcript_words:
|
||||
return []
|
||||
|
||||
if granularity == "word":
|
||||
# One segment per word
|
||||
result = []
|
||||
for idx, tw in enumerate(transcript_words):
|
||||
word = backend_pb2.TranscriptWord(
|
||||
start=tw['start'], end=tw['end'], text=tw['text']
|
||||
)
|
||||
result.append(backend_pb2.TranscriptSegment(
|
||||
id=idx,
|
||||
start=tw['start'],
|
||||
end=tw['end'],
|
||||
text=tw['text'],
|
||||
words=[word],
|
||||
))
|
||||
return result
|
||||
|
||||
# segment mode — merge at word-level time-gap boundaries
|
||||
# Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s
|
||||
gaps = []
|
||||
for i in range(1, len(transcript_words)):
|
||||
gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
|
||||
if gap > 0:
|
||||
gaps.append(gap)
|
||||
if gaps:
|
||||
gaps.sort()
|
||||
median_gap = gaps[len(gaps) // 2]
|
||||
threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
|
||||
else:
|
||||
threshold_ns = int(0.5 * 1_000_000_000)
|
||||
|
||||
result = []
|
||||
buf_words = [] # list of TranscriptWord protobuf
|
||||
buf_start = None
|
||||
buf_end = 0
|
||||
buf_text = []
|
||||
prev_end = None
|
||||
|
||||
for tw in transcript_words:
|
||||
# Detect word-level time gap
|
||||
if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
|
||||
seg_text = ' '.join(buf_text)
|
||||
result.append(backend_pb2.TranscriptSegment(
|
||||
id=len(result),
|
||||
start=buf_start,
|
||||
end=buf_end,
|
||||
text=seg_text,
|
||||
words=list(buf_words),
|
||||
))
|
||||
buf_words = []
|
||||
buf_text = []
|
||||
buf_start = None
|
||||
|
||||
if buf_start is None:
|
||||
buf_start = tw['start']
|
||||
buf_end = tw['end']
|
||||
buf_text.append(tw['text'])
|
||||
buf_words.append(backend_pb2.TranscriptWord(
|
||||
start=tw['start'], end=tw['end'], text=tw['text']
|
||||
))
|
||||
prev_end = tw['end']
|
||||
|
||||
# flush remaining
|
||||
if buf_text and buf_start is not None:
|
||||
seg_text = ' '.join(buf_text)
|
||||
result.append(backend_pb2.TranscriptSegment(
|
||||
id=len(result),
|
||||
start=buf_start,
|
||||
end=buf_end,
|
||||
text=seg_text,
|
||||
words=list(buf_words),
|
||||
))
|
||||
|
||||
return result
|
||||
|
||||
def AudioTranscription(self, request, context):
|
||||
result_segments = []
|
||||
text = ""
|
||||
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
|
||||
return backend_pb2.TranscriptResult(segments=[], text="")
|
||||
|
||||
# NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
|
||||
results = self.model.transcribe([audio_path])
|
||||
# Determine requested timestamp granularity
|
||||
timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
|
||||
want_timestamps = bool(timestamp_granularities)
|
||||
|
||||
if not results or len(results) == 0:
|
||||
return backend_pb2.TranscriptResult(segments=[], text="")
|
||||
if want_timestamps:
|
||||
# Request timestamps from NeMo.
|
||||
# timestamps=True forces NeMo to return Hypothesis objects with
|
||||
# the timestamp dict populated, so we omit return_hypotheses to
|
||||
# let NeMo choose the correct return type.
|
||||
results = self.model.transcribe([audio_path], timestamps=True)
|
||||
|
||||
# Get the transcript text from the first result.
|
||||
# CTC models return List[str], TDT/RNNT models return List[Hypothesis]
|
||||
# where the actual text lives in Hypothesis.text.
|
||||
result = results[0]
|
||||
if isinstance(result, str):
|
||||
text = result
|
||||
if results and len(results) > 0:
|
||||
hypotheses = results[0] if isinstance(results[0], list) else results
|
||||
if hypotheses and len(hypotheses) > 0:
|
||||
hypothesis = hypotheses[0]
|
||||
|
||||
# Hypothesis object should have .timestamp populated
|
||||
if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
|
||||
print(
|
||||
"Warning: timestamps were requested but NeMo did not return "
|
||||
"Hypothesis objects; falling back to untimestamped output",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
# Extract text
|
||||
if hasattr(hypothesis, 'text'):
|
||||
text = hypothesis.text or ""
|
||||
elif isinstance(hypothesis, str):
|
||||
text = hypothesis
|
||||
|
||||
# Build segments with word-level timestamps
|
||||
stride = self._get_stride_seconds()
|
||||
result_segments = self._build_segments_with_words(
|
||||
hypothesis, stride, timestamp_granularities
|
||||
)
|
||||
|
||||
# If no word offsets but we have text, fall back to single segment
|
||||
if not result_segments and text:
|
||||
result_segments.append(backend_pb2.TranscriptSegment(
|
||||
id=0, start=0, end=0, text=text
|
||||
))
|
||||
else:
|
||||
text = getattr(result, 'text', None) or ""
|
||||
# Simple transcription without timestamps
|
||||
# NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
|
||||
results = self.model.transcribe([audio_path])
|
||||
|
||||
if text:
|
||||
# Create a single segment with the full transcription
|
||||
result_segments.append(backend_pb2.TranscriptSegment(
|
||||
id=0, start=0, end=0, text=text
|
||||
))
|
||||
if results and len(results) > 0:
|
||||
# Get the transcript text from the first result.
|
||||
# CTC models return List[str], TDT/RNNT models return List[Hypothesis]
|
||||
# where the actual text lives in Hypothesis.text.
|
||||
result = results[0]
|
||||
if isinstance(result, str):
|
||||
text = result
|
||||
else:
|
||||
text = getattr(result, 'text', None) or ""
|
||||
|
||||
if text:
|
||||
# Create a single segment with the full transcription
|
||||
result_segments.append(backend_pb2.TranscriptSegment(
|
||||
id=0, start=0, end=0, text=text
|
||||
))
|
||||
|
||||
except Exception as err:
|
||||
print(f"Error in AudioTranscription: {err}", file=sys.stderr)
|
||||
|
||||
@@ -309,6 +309,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
dataset_split = request.dataset_split or "train"
|
||||
if os.path.exists(request.dataset_source):
|
||||
_allowed_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_DATASET_DIR", os.getcwd())))
|
||||
_real_path = os.path.realpath(os.path.abspath(request.dataset_source))
|
||||
if not (_real_path == _allowed_dir or _real_path.startswith(_allowed_dir + os.sep)):
|
||||
raise ValueError("Dataset source path is outside the allowed directory")
|
||||
if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
|
||||
dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
|
||||
elif request.dataset_source.endswith('.csv'):
|
||||
@@ -687,6 +691,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def ExportModel(self, request, context):
|
||||
export_format = request.export_format or "lora"
|
||||
output_path = request.output_path
|
||||
_allowed_output_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_OUTPUT_DIR", os.getcwd())))
|
||||
_real_output_path = os.path.realpath(os.path.abspath(output_path))
|
||||
if not (_real_output_path == _allowed_output_dir or _real_output_path.startswith(_allowed_output_dir + os.sep)):
|
||||
raise ValueError("Output path is outside the allowed directory")
|
||||
output_path = _real_output_path
|
||||
checkpoint_path = request.checkpoint_path
|
||||
|
||||
# Extract HF token for gated model access
|
||||
@@ -807,7 +816,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
env = os.environ.copy()
|
||||
env["NO_LOCAL_GGUF"] = "1"
|
||||
cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
|
||||
conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env)
|
||||
conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env, shell=False) # nosemgrep: python.django.security.injection.command.subprocess-injection.subprocess-injection
|
||||
if conv_result.returncode != 0:
|
||||
diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
|
||||
return backend_pb2.Result(success=False,
|
||||
|
||||
@@ -48,8 +48,10 @@ try:
|
||||
except ImportError:
|
||||
HAS_REASONING_PARSERS = False
|
||||
|
||||
# vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the
|
||||
# SamplingParams field guided_decoding -> structured_outputs.
|
||||
try:
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
HAS_GUIDED_DECODING = True
|
||||
except ImportError:
|
||||
HAS_GUIDED_DECODING = False
|
||||
@@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if value not in (None, 0, [], False):
|
||||
setattr(sampling_params, param_field, value)
|
||||
|
||||
# Guided decoding: use Grammar field to pass JSON schema or BNF
|
||||
# Structured-output decoding: use Grammar field to pass JSON schema or BNF
|
||||
if HAS_GUIDED_DECODING and request.Grammar:
|
||||
try:
|
||||
json.loads(request.Grammar) # valid JSON = JSON schema
|
||||
sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
|
||||
sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar)
|
||||
except json.JSONDecodeError:
|
||||
sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
|
||||
sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar)
|
||||
|
||||
# Extract image paths and process images
|
||||
prompt = request.Prompt
|
||||
@@ -596,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
# Stream the results
|
||||
generated_text = ""
|
||||
generated_token_ids: list[int] = []
|
||||
last_output = None
|
||||
|
||||
# Tool-parsing strategy decision (made once, before the loop):
|
||||
#
|
||||
# When a tool parser is active, the model's raw tool-call markup
|
||||
# (e.g. <tool_call>...) must not be streamed verbatim as delta.content
|
||||
# — clients would see the unparsed syntax. Two paths:
|
||||
#
|
||||
# (A) native streaming via parser.extract_tool_calls_streaming. All
|
||||
# concrete tool parsers shipped with vLLM 0.23+ implement this
|
||||
# (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes,
|
||||
# llama3_json, mistral, …). The parser decides per-delta whether
|
||||
# to emit content or suppress tool-call markup, and emits a
|
||||
# structured DeltaMessage(tool_calls=[...]) when a call is ready.
|
||||
# (B) buffer fallback — used only when the parser surprisingly lacks
|
||||
# the streaming method or it raises mid-stream. The post-loop
|
||||
# extract_tool_calls assembles the final chat_delta. Same correctness
|
||||
# guarantee as a non-streaming response, at the cost of a delayed
|
||||
# final chunk.
|
||||
has_tool_parser = bool(self.tool_parser_cls and request.Tools)
|
||||
tp_instance = None
|
||||
tp_request = None
|
||||
native_streaming = False
|
||||
native_streaming_error = False
|
||||
if has_tool_parser:
|
||||
try:
|
||||
tools_for_parser = json.loads(request.Tools)
|
||||
except json.JSONDecodeError:
|
||||
tools_for_parser = []
|
||||
try:
|
||||
tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser)
|
||||
except TypeError:
|
||||
tp_instance = self.tool_parser_cls(self.tokenizer)
|
||||
# Build a minimal ChatCompletionRequest so the streaming method
|
||||
# sees the tools list. We do not need any other request fields —
|
||||
# parsers only read .tools (and sometimes .tool_choice, which we
|
||||
# leave at default).
|
||||
try:
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionRequest as _CCR,
|
||||
)
|
||||
tp_request = _CCR(
|
||||
model="local",
|
||||
messages=[{"role": "user", "content": ""}],
|
||||
tools=tools_for_parser or None,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Could not build ChatCompletionRequest for streaming parser: {e}",
|
||||
file=sys.stderr)
|
||||
tp_request = None
|
||||
native_streaming = (
|
||||
tp_request is not None
|
||||
and hasattr(tp_instance, "extract_tool_calls_streaming")
|
||||
)
|
||||
|
||||
try:
|
||||
async for request_output in outputs:
|
||||
iteration_text = request_output.outputs[0].text
|
||||
last_output = request_output
|
||||
|
||||
if streaming:
|
||||
# Remove text already sent as vllm concatenates the text from previous yields
|
||||
delta_iteration_text = iteration_text.removeprefix(generated_text)
|
||||
# Send the partial result
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(delta_iteration_text, encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
|
||||
)
|
||||
new_token_ids = list(request_output.outputs[0].token_ids)
|
||||
delta_token_ids = new_token_ids[len(generated_token_ids):]
|
||||
|
||||
# Keep track of text generated
|
||||
if not has_tool_parser:
|
||||
# Plain streaming — unchanged from pre-tool-parser path.
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(delta_iteration_text, encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
|
||||
)
|
||||
elif native_streaming and not native_streaming_error:
|
||||
# (A) Native vLLM extract_tool_calls_streaming.
|
||||
try:
|
||||
msg = tp_instance.extract_tool_calls_streaming(
|
||||
previous_text=generated_text,
|
||||
current_text=iteration_text,
|
||||
delta_text=delta_iteration_text,
|
||||
previous_token_ids=generated_token_ids,
|
||||
current_token_ids=new_token_ids,
|
||||
delta_token_ids=delta_token_ids,
|
||||
request=tp_request,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Streaming tool parser error (falling back to "
|
||||
f"buffer for the rest of the stream): {e}",
|
||||
file=sys.stderr)
|
||||
native_streaming_error = True
|
||||
msg = None
|
||||
if msg is not None:
|
||||
tc_protos = []
|
||||
for tc in (msg.tool_calls or []):
|
||||
fn = tc.function or None
|
||||
tc_protos.append(backend_pb2.ToolCallDelta(
|
||||
index=tc.index,
|
||||
id=tc.id or "",
|
||||
name=(fn.name if fn and fn.name else "") or "",
|
||||
arguments=(fn.arguments if fn and fn.arguments else "") or "",
|
||||
))
|
||||
cd_kwargs = {}
|
||||
if msg.content:
|
||||
cd_kwargs["content"] = msg.content
|
||||
if msg.reasoning:
|
||||
cd_kwargs["reasoning_content"] = msg.reasoning
|
||||
if tc_protos:
|
||||
cd_kwargs["tool_calls"] = tc_protos
|
||||
if cd_kwargs:
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(msg.content or "", encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)],
|
||||
)
|
||||
# (B) buffer fallback — emit nothing during the stream.
|
||||
# The post-loop extract_tool_calls block builds the final chunk.
|
||||
|
||||
# Keep track of text + token_ids generated
|
||||
generated_text = iteration_text
|
||||
generated_token_ids = list(request_output.outputs[0].token_ids)
|
||||
finally:
|
||||
await outputs.aclose()
|
||||
|
||||
@@ -637,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
except Exception as e:
|
||||
print(f"Reasoning parser error: {e}", file=sys.stderr)
|
||||
|
||||
if self.tool_parser_cls and request.Tools:
|
||||
# When (A) native streaming ran cleanly, per-delta yields above already
|
||||
# delivered everything — do NOT extract again on the full text or we'd
|
||||
# duplicate content/tool_calls into the final chunk.
|
||||
if has_tool_parser and not (native_streaming and not native_streaming_error):
|
||||
try:
|
||||
tools = json.loads(request.Tools)
|
||||
# Some concrete parsers only accept the tokenizer; only the
|
||||
# abstract base declares the tools kwarg. Try with tools first,
|
||||
# fall back to tokenizer-only.
|
||||
try:
|
||||
tp = self.tool_parser_cls(self.tokenizer, tools=tools)
|
||||
except TypeError:
|
||||
tp = self.tool_parser_cls(self.tokenizer)
|
||||
tp = tp_instance
|
||||
if tp is None:
|
||||
# Defensive: tp_instance build failed earlier; reconstruct.
|
||||
tools = json.loads(request.Tools)
|
||||
try:
|
||||
tp = self.tool_parser_cls(self.tokenizer, tools=tools)
|
||||
except TypeError:
|
||||
tp = self.tool_parser_cls(self.tokenizer)
|
||||
info = tp.extract_tool_calls(content, request=None)
|
||||
if info.tools_called:
|
||||
content = info.content or ""
|
||||
@@ -659,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
))
|
||||
except Exception as e:
|
||||
print(f"Tool parser error: {e}", file=sys.stderr)
|
||||
elif native_streaming and not native_streaming_error:
|
||||
# Per-delta path already emitted content + tool_calls; the final
|
||||
# chat_delta should carry only metadata (token counts, logprobs).
|
||||
content = ""
|
||||
|
||||
# Extract token counts
|
||||
prompt_tokens = 0
|
||||
@@ -698,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
)
|
||||
|
||||
if streaming:
|
||||
# Final chunk with structured data
|
||||
# Final chunk with structured data.
|
||||
#
|
||||
# If we used the buffer fallback (has_tool_parser=True AND native
|
||||
# streaming did NOT run cleanly) and the parser found no tool call,
|
||||
# flush the buffered content as ONE content delta — and clear the
|
||||
# final chat_delta's content so the metadata chunk does not repeat
|
||||
# what we just sent. This is the plain-text-with-tool-parser path.
|
||||
buffered_fallback = (
|
||||
has_tool_parser
|
||||
and not (native_streaming and not native_streaming_error)
|
||||
)
|
||||
if buffered_fallback and not tool_calls_proto and content:
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(content, encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(content=content)],
|
||||
)
|
||||
chat_delta = backend_pb2.ChatDelta(
|
||||
reasoning_content=reasoning_content,
|
||||
tool_calls=tool_calls_proto,
|
||||
)
|
||||
yield backend_pb2.Reply(
|
||||
message=b"",
|
||||
prompt_tokens=prompt_tokens,
|
||||
|
||||
@@ -278,4 +278,261 @@ class TestBackendServicer(unittest.TestCase):
|
||||
print(err)
|
||||
self.fail("Embedding service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
self.tearDown()
|
||||
|
||||
|
||||
class TestStreamingToolParser(unittest.TestCase):
    """
    Server-less unit tests for the streaming + tool-parser machinery in
    BackendServicer._predict. These tests instantiate BackendServicer
    directly and mock the vLLM engine + tool parser, so they do not need
    a GPU, a model, or a running gRPC server. Kept in a separate class to
    avoid the parent setUp() which spawns a subprocess.

    Covers #582 (follow-up to #10346):
      1. Markup-leak prevention with a non-streaming parser (buffer fallback)
      2. No content duplication on the plain-text path with the buffer fallback
      3. Native streaming progressive plain-text emission
      4. Native streaming structured tool_call, no markup leak
      5. Parser exception → graceful fallback to buffer, still no markup
      6. No-tool-parser regression: unchanged per-delta content stream
    """

    # Non-empty request.Tools payload shared by every tool-parser test.
    TOOLS_JSON = '[{"type":"function","function":{"name":"calc","parameters":{}}}]'

    class _DeltaMsg:
        """Minimal stand-in for the message a streaming tool parser returns.

        Exposes only the attributes the tests' fake parsers populate:
        content, reasoning and tool_calls (always a list, never None).
        """

        def __init__(self, content=None, reasoning=None, tool_calls=None):
            self.content = content
            self.reasoning = reasoning
            self.tool_calls = tool_calls or []

    @staticmethod
    def _make_generate(chunks):
        """Build a fake vLLM engine.generate that yields cumulative chunks.

        Each yielded object mimics a request output: one entry in ``outputs``
        carrying the cumulative text, ``i + 1`` fake token ids and no
        logprobs, plus a single-token prompt.
        """
        from types import SimpleNamespace

        async def gen(*a, **k):
            for i, t in enumerate(chunks):
                yield SimpleNamespace(
                    outputs=[SimpleNamespace(
                        text=t,
                        token_ids=list(range(i + 1)),
                        logprobs=None,
                    )],
                    prompt_token_ids=[0],
                )

        # The caller's arguments are ignored; a fresh generator is returned
        # on every call.
        return lambda *a, **k: gen()

    @staticmethod
    def _collect(servicer, req):
        """Run servicer._predict in streaming mode and return all replies."""
        import asyncio

        async def run():
            return [r async for r in servicer._predict(req, None, streaming=True)]

        return asyncio.run(run())

    def _new_servicer(self):
        """Return a BackendServicer with no reasoning parser, tool parser or tokenizer."""
        import os
        import sys

        here = os.path.dirname(os.path.abspath(__file__))
        # Add the test directory once; an unconditional insert would grow
        # sys.path by one duplicate entry for every test that runs.
        if here not in sys.path:
            sys.path.insert(0, here)
        from backend import BackendServicer

        s = BackendServicer()
        s.reasoning_parser_cls = None
        s.tool_parser_cls = None
        s.tokenizer = None
        return s

    # ── Case 1+2: parser without streaming method → buffer fallback ──
    def test_buffer_path_no_markup_no_duplication(self):
        from types import SimpleNamespace

        def parser_cls(called, content_text, calls):
            class _P:
                def __init__(self, tokenizer, tools=None):
                    pass

                # NOTE: NO extract_tool_calls_streaming → takes the buffer path
                def extract_tool_calls(self, c, request=None):
                    return SimpleNamespace(
                        tools_called=called, content=content_text, tool_calls=calls,
                    )
            return _P

        # Tool-call case: no raw markup in any delta.content
        s = self._new_servicer()
        s.llm = SimpleNamespace(generate=self._make_generate([
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]))
        call = SimpleNamespace(id="call_1",
                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
        s.tool_parser_cls = parser_cls(True, "", [call])
        req = backend_pb2.PredictOptions(Prompt="x", Tools=self.TOOLS_JSON)
        replies = self._collect(s, req)
        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
        self.assertFalse(
            any("<tool_call" in c for c in contents),
            f"markup leaked: {contents!r}",
        )
        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
        self.assertIn("calc", names, "tool_call missing from final chunk")

        # Plain-text-with-tools case: full content delivered exactly once
        s2 = self._new_servicer()
        s2.llm = SimpleNamespace(generate=self._make_generate([
            "The capital ",
            "The capital of France is Paris.",
        ]))
        s2.tool_parser_cls = parser_cls(False, "", [])
        req2 = backend_pb2.PredictOptions(Prompt="x", Tools=self.TOOLS_JSON)
        joined = "".join(
            cd.content for r in self._collect(s2, req2)
            for cd in r.chat_deltas if cd.content
        )
        self.assertEqual(
            joined.count("The capital of France is Paris."), 1,
            f"buffered content duplicated: {joined!r}",
        )

    # ── Case 3: native streaming, progressive plain text ──
    def test_native_streaming_progressive_plain_text(self):
        from types import SimpleNamespace

        delta_msg = self._DeltaMsg

        class StreamingParser:
            def __init__(self, tokenizer, tools=None):
                pass

            def extract_tool_calls(self, c, request=None):
                # Should NOT be called when native streaming runs successfully.
                raise AssertionError("extract_tool_calls invoked on native-streaming path")

            def extract_tool_calls_streaming(
                self, previous_text, current_text, delta_text,
                previous_token_ids, current_token_ids, delta_token_ids, request,
            ):
                if not delta_text:
                    return None
                return delta_msg(content=delta_text)

        s = self._new_servicer()
        s.llm = SimpleNamespace(generate=self._make_generate([
            "Paris ",
            "Paris is ",
            "Paris is the capital of France.",
        ]))
        s.tool_parser_cls = StreamingParser
        req = backend_pb2.PredictOptions(Prompt="x", Tools=self.TOOLS_JSON)
        replies = self._collect(s, req)

        # Everything except the last reply is an in-stream chunk.
        intermediate_content = [
            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
        ]
        self.assertTrue(
            len(intermediate_content) > 0,
            "Plain-text response not streamed progressively (native streaming inactive?)",
        )
        assembled = "".join(
            cd.content for r in replies for cd in r.chat_deltas if cd.content
        )
        self.assertEqual(
            assembled, "Paris is the capital of France.",
            f"Assembled content wrong: {assembled!r}",
        )

    # ── Case 4: native streaming, structured tool_call, no markup ──
    def test_native_streaming_tool_call_no_markup_leak(self):
        from types import SimpleNamespace

        delta_msg = self._DeltaMsg

        class _ToolCallStreamer:
            def __init__(self, tokenizer, tools=None):
                self._emitted = False

            def extract_tool_calls(self, c, request=None):
                raise AssertionError("extract_tool_calls invoked on native-streaming path")

            def extract_tool_calls_streaming(
                self, previous_text, current_text, delta_text,
                previous_token_ids, current_token_ids, delta_token_ids, request,
            ):
                # Emit the structured call exactly once, as soon as the
                # closing tag has been generated.
                if "</tool_call>" in current_text and not self._emitted:
                    self._emitted = True
                    fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
                    tc = SimpleNamespace(id="call_1", type="function", index=0, function=fn)
                    return delta_msg(tool_calls=[tc])
                return None

        s = self._new_servicer()
        s.llm = SimpleNamespace(generate=self._make_generate([
            '<tool_call>\n',
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]))
        s.tool_parser_cls = _ToolCallStreamer
        req = backend_pb2.PredictOptions(Prompt="x", Tools=self.TOOLS_JSON)
        replies = self._collect(s, req)

        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
        self.assertFalse(
            any("<tool_call" in c or "</tool_call>" in c for c in contents),
            f"markup leaked as content: {contents!r}",
        )
        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.name]
        args = [tc.arguments for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.arguments]
        self.assertIn("calc", names, f"tool_call name missing; got {names!r}")
        self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}")

    # ── Case 5: parser exception → fallback to buffer, no leak ──
    def test_native_streaming_parser_exception_falls_back_to_buffer(self):
        from types import SimpleNamespace

        call = SimpleNamespace(id="call_1",
                               function=SimpleNamespace(name="calc", arguments='{"x": 1}'))

        class _BrokenStreamer:
            def __init__(self, tokenizer, tools=None):
                pass

            def extract_tool_calls(self, c, request=None):
                return SimpleNamespace(tools_called=True, content="", tool_calls=[call])

            def extract_tool_calls_streaming(self, *a, **kw):
                raise RuntimeError("simulated parser bug")

        s = self._new_servicer()
        s.llm = SimpleNamespace(generate=self._make_generate([
            '<tool_call>\n{"name": "calc"',
            '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
        ]))
        s.tool_parser_cls = _BrokenStreamer
        req = backend_pb2.PredictOptions(Prompt="x", Tools=self.TOOLS_JSON)
        replies = self._collect(s, req)

        contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
        self.assertFalse(
            any("<tool_call" in c for c in contents),
            f"markup leaked after parser exception: {contents!r}",
        )
        names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
        self.assertIn("calc", names, "tool_call missing from final chunk after fallback")

    # ── Case 6: no tool parser → unchanged per-delta content stream ──
    def test_no_tool_parser_unchanged_per_delta_stream(self):
        from types import SimpleNamespace

        s = self._new_servicer()  # tool_parser_cls already None
        s.llm = SimpleNamespace(generate=self._make_generate([
            "Hello ", "Hello world", "Hello world!",
        ]))
        req = backend_pb2.PredictOptions(Prompt="x", Tools="")
        replies = self._collect(s, req)

        intermediate = [
            cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
        ]
        self.assertEqual(
            intermediate, ["Hello ", "world", "!"],
            f"plain streaming changed; got {intermediate!r}",
        )
|
||||
|
||||
@@ -341,11 +341,9 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
|
||||
}
|
||||
appCfg := a.ApplicationConfig()
|
||||
|
||||
if cfg.PII.Enabled != nil {
|
||||
enabled = *cfg.PII.Enabled
|
||||
} else {
|
||||
enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
|
||||
}
|
||||
// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
|
||||
// default (cloud-proxy)" — the single source of that rule.
|
||||
enabled = cfg.PIIIsEnabled()
|
||||
if !enabled {
|
||||
return false, nil
|
||||
}
|
||||
@@ -354,7 +352,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
|
||||
if len(detectors) == 0 {
|
||||
detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
|
||||
}
|
||||
return enabled, detectors
|
||||
return true, detectors // enabled is necessarily true past the !enabled guard
|
||||
}
|
||||
|
||||
// PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
|
||||
|
||||
@@ -215,6 +215,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
|
||||
envBackendGalleries := slices.Equal(appConfig.BackendGalleries, startupAppConfig.BackendGalleries)
|
||||
envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
|
||||
envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
|
||||
envPIIDefaultDetectors := slices.Equal(appConfig.PIIDefaultDetectors, startupAppConfig.PIIDefaultDetectors)
|
||||
envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
|
||||
envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
|
||||
envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
|
||||
@@ -335,6 +336,15 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
|
||||
if settings.AutoloadBackendGalleries != nil && !envAutoloadBackendGalleries {
|
||||
appConfig.AutoloadBackendGalleries = *settings.AutoloadBackendGalleries
|
||||
}
|
||||
if settings.PIIDefaultDetectors != nil && !envPIIDefaultDetectors {
|
||||
// Request-side default redaction reads this live via
|
||||
// ResolvePIIPolicy, so a file edit takes effect on the next chat
|
||||
// request. The MITM listener resolves its per-host detector map
|
||||
// once at start, so a raw file edit reaches cloud-proxy traffic
|
||||
// only after a restart or a POST /api/settings (which rebuilds
|
||||
// the listener) — the admin UI uses the latter.
|
||||
appConfig.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
|
||||
}
|
||||
if settings.AutoUpgradeBackends != nil {
|
||||
appConfig.AutoUpgradeBackends = *settings.AutoUpgradeBackends
|
||||
}
|
||||
|
||||
@@ -357,6 +357,15 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
|
||||
Pressure: pressure,
|
||||
})
|
||||
|
||||
// Wire staging-progress broadcasting so file-staging shows up on every
|
||||
// replica, not just the one performing the transfer. Without this, a
|
||||
// /api/operations poll that round-robins onto a peer sees no staging row and
|
||||
// the progress flickers. The origin publishes; peers mirror via the wildcard.
|
||||
router.StagingTracker().SetPublisher(natsClient)
|
||||
if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
|
||||
xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
|
||||
}
|
||||
|
||||
// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
|
||||
// RegistrationToken feed the state-reconciliation passes: pending op
|
||||
// drain uses the adapter, and model health probes use the token to auth
|
||||
|
||||
@@ -109,6 +109,52 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
|
||||
})
|
||||
})
|
||||
|
||||
// Instance-wide default PII detectors. The file is the source whenever
|
||||
// LOCALAI_PII_DEFAULT_DETECTORS is unset, and the loader runs immediately before startMITMIfConfigured,
|
||||
// so a regression here means the cloud-proxy MITM listener resolves an
|
||||
// empty detector set at boot and forwards intercepted traffic unredacted —
|
||||
// even though pii_default_detectors is on disk and the MITM model has PII
|
||||
// enabled. It also breaks request-side default redaction the same way.
|
||||
Describe("PII default detectors", func() {
|
||||
It("loads pii_default_detectors from the file", func() {
|
||||
cfg := &config.ApplicationConfig{DynamicConfigsDir: seedSettings(`{"pii_default_detectors": ["privacy-filter-nemotron", "secret-filter"]}`)}
|
||||
loadRuntimeSettingsFromFile(cfg)
|
||||
Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"privacy-filter-nemotron", "secret-filter"}))
|
||||
})
|
||||
|
||||
It("does not override an env/CLI-set value (LOCALAI_PII_DEFAULT_DETECTORS)", func() {
|
||||
cfg := &config.ApplicationConfig{
|
||||
DynamicConfigsDir: seedSettings(`{"pii_default_detectors": ["from-file"]}`),
|
||||
PIIDefaultDetectors: []string{"from-env"}, // simulate WithPIIDefaultDetectors(env)
|
||||
}
|
||||
loadRuntimeSettingsFromFile(cfg)
|
||||
Expect(cfg.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env var must win over the persisted file value")
|
||||
})
|
||||
})
|
||||
|
||||
// The live file watcher applies pii_default_detectors on a runtime change
|
||||
// the same way it handles galleries/threads/etc.: env-set values (current
|
||||
// == startup snapshot) are left alone, otherwise the file value is applied
|
||||
// to the live config so request-side default redaction picks it up without
|
||||
// a restart.
|
||||
Describe("file watcher: pii_default_detectors", func() {
|
||||
It("applies a changed file value to the live config", func() {
|
||||
startup := config.ApplicationConfig{} // no env baseline
|
||||
live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"old"}}
|
||||
handler := readRuntimeSettingsJson(startup)
|
||||
Expect(handler([]byte(`{"pii_default_detectors":["new-a","new-b"]}`), live)).To(Succeed())
|
||||
Expect(live.PIIDefaultDetectors).To(Equal([]string{"new-a", "new-b"}))
|
||||
})
|
||||
|
||||
It("leaves an env-controlled value untouched", func() {
|
||||
startup := config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
|
||||
live := &config.ApplicationConfig{PIIDefaultDetectors: []string{"from-env"}}
|
||||
handler := readRuntimeSettingsJson(startup)
|
||||
Expect(handler([]byte(`{"pii_default_detectors":["from-file"]}`), live)).To(Succeed())
|
||||
Expect(live.PIIDefaultDetectors).To(Equal([]string{"from-env"}), "env-controlled detectors must not be overwritten by the file")
|
||||
})
|
||||
})
|
||||
|
||||
// The Agent Pool block has a mix of zero and non-zero defaults
|
||||
// (Enabled=true, EmbeddingModel="granite-...", MaxChunkingSize=400,
|
||||
// VectorEngine="chromem", AgentHubURL="https://agenthub.localai.io").
|
||||
|
||||
@@ -25,6 +25,7 @@ import (
|
||||
"github.com/mudler/LocalAI/core/services/storage"
|
||||
coreStartup "github.com/mudler/LocalAI/core/startup"
|
||||
"github.com/mudler/LocalAI/internal"
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
"github.com/mudler/LocalAI/pkg/signals"
|
||||
"github.com/mudler/LocalAI/pkg/vram"
|
||||
|
||||
@@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to create ModelPath: %q", err)
|
||||
}
|
||||
|
||||
// Reap *.partial downloads abandoned by a previous run (killed mid-transfer
|
||||
// by an OOM/restart, or stalled before cleanup could run). The 24h window
|
||||
// is well beyond any legitimate in-flight download, so this never trims an
|
||||
// active transfer; it just stops dead partials accumulating on the volume.
|
||||
if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil {
|
||||
xlog.Warn("Failed to reap stale partial downloads", "error", cErr)
|
||||
} else if removed > 0 {
|
||||
xlog.Info("Reaped stale partial downloads", "count", removed)
|
||||
}
|
||||
if options.GeneratedContentDir != "" {
|
||||
err := os.MkdirAll(options.GeneratedContentDir, 0o750)
|
||||
if err != nil {
|
||||
@@ -633,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
|
||||
options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
||||
}
|
||||
}
|
||||
if settings.SizeAwareEviction != nil {
|
||||
// Only apply if current value is default (false), suggesting it wasn't set from env var
|
||||
if !options.SizeAwareEviction {
|
||||
options.SizeAwareEviction = *settings.SizeAwareEviction
|
||||
}
|
||||
}
|
||||
if settings.LRUEvictionMaxRetries != nil {
|
||||
// Only apply if current value is default (30), suggesting it wasn't set from env var
|
||||
if options.LRUEvictionMaxRetries == 0 {
|
||||
@@ -733,6 +750,20 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
|
||||
options.MITMListen = *settings.MITMListen
|
||||
}
|
||||
|
||||
// Instance-wide default PII detectors. LOCALAI_PII_DEFAULT_DETECTORS (via
|
||||
// WithPIIDefaultDetectors) wins when set; otherwise the file is the source
|
||||
// — apply it only when the env/CLI left the value empty, mirroring the
|
||||
// "env > file" precedence used for the other fields. This must land before
|
||||
// startMITMIfConfigured (called right after this loader): the cloud-proxy
|
||||
// listener resolves each intercept host's detectors once at start via
|
||||
// ResolvePIIPolicy, and a MITM model that names no detectors of its own
|
||||
// falls back to these defaults. Without it the listener (and request-side
|
||||
// default redaction) starts with an empty detector set and forwards
|
||||
// traffic unredacted even though pii_default_detectors is on disk.
|
||||
if settings.PIIDefaultDetectors != nil && len(options.PIIDefaultDetectors) == 0 {
|
||||
options.PIIDefaultDetectors = append([]string(nil), (*settings.PIIDefaultDetectors)...)
|
||||
}
|
||||
|
||||
// Backend upgrade flags
|
||||
if settings.AutoUpgradeBackends != nil {
|
||||
if !options.AutoUpgradeBackends {
|
||||
@@ -836,6 +867,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
|
||||
model.WithLRULimit(lruLimit),
|
||||
model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
|
||||
model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
|
||||
model.WithSizeAwareEviction(options.SizeAwareEviction),
|
||||
)
|
||||
application.ModelLoader().SetWatchDog(wd)
|
||||
|
||||
|
||||
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
|
||||
model.WithLRULimit(lruLimit),
|
||||
model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
|
||||
model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
|
||||
model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
|
||||
)
|
||||
|
||||
// Create new stop channel BEFORE setting up any goroutines
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math/rand/v2"
|
||||
@@ -12,7 +13,9 @@ import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/trace"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/vram"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
|
||||
})
|
||||
}
|
||||
|
||||
// estimateModelSizeBytes uses the unified EstimateModel entry point to compute
|
||||
// the total weight-file size for a model config. It collects all weight files
|
||||
// from DownloadFiles, Model, and MMProj, and also extracts the HuggingFace
|
||||
// repo ID so EstimateModel can fall back to the HF API when local file
|
||||
// metadata is unavailable (e.g. not-yet-downloaded models).
|
||||
func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
|
||||
seen := make(map[string]bool)
|
||||
input := vram.ModelEstimateInput{}
|
||||
|
||||
addFile := func(uri string) {
|
||||
if !vram.IsWeightFile(uri) {
|
||||
return
|
||||
}
|
||||
resolved := uri
|
||||
if !strings.Contains(uri, "://") {
|
||||
resolved = "file://" + filepath.Join(modelsPath, uri)
|
||||
}
|
||||
if seen[resolved] {
|
||||
return
|
||||
}
|
||||
seen[resolved] = true
|
||||
input.Files = append(input.Files, vram.FileInput{URI: resolved})
|
||||
}
|
||||
|
||||
// tryHFRepo resolves any huggingface:// or hf:// URI to an HTTPS URL and
|
||||
// then extracts the org/model repo ID for use as the HF fallback path.
|
||||
tryHFRepo := func(uri string) {
|
||||
if input.HFRepo != "" {
|
||||
return
|
||||
}
|
||||
resolved := downloader.URI(uri).ResolveURL()
|
||||
if repoID, ok := vram.ExtractHFRepoID(resolved); ok {
|
||||
input.HFRepo = repoID
|
||||
}
|
||||
}
|
||||
|
||||
for _, f := range c.DownloadFiles {
|
||||
uriStr := string(f.URI)
|
||||
addFile(uriStr)
|
||||
tryHFRepo(uriStr)
|
||||
}
|
||||
addFile(c.Model)
|
||||
tryHFRepo(c.Model)
|
||||
if c.MMProj != "" {
|
||||
addFile(c.MMProj)
|
||||
}
|
||||
|
||||
if len(input.Files) == 0 && input.HFRepo == "" {
|
||||
return 0
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
result, err := vram.EstimateModelMultiContext(ctx, input, nil)
|
||||
if err != nil || result.SizeBytes == 0 {
|
||||
return 0
|
||||
}
|
||||
return int64(result.SizeBytes)
|
||||
}
|
||||
|
||||
func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
|
||||
defOpts := []model.Option{
|
||||
model.WithBackendString(c.Backend),
|
||||
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
|
||||
defOpts = append(defOpts, model.WithExternalBackend(k, v))
|
||||
}
|
||||
|
||||
if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
|
||||
defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
|
||||
}
|
||||
|
||||
return append(defOpts, opts...)
|
||||
}
|
||||
|
||||
@@ -90,10 +158,11 @@ func getSeed(c config.ModelConfig) int32 {
|
||||
// DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
|
||||
// model config leaves them unset. Exported so callers that must respect the
|
||||
// effective decode window — notably the router's prompt trimmer — resolve the
|
||||
// same numbers grpcModelOpts does instead of guessing.
|
||||
// same numbers grpcModelOpts does instead of guessing. The values are owned by
|
||||
// core/config (single source of truth shared with the config default tiers).
|
||||
const (
|
||||
DefaultContextSize = 4096
|
||||
DefaultBatchSize = 512
|
||||
DefaultContextSize = config.DefaultContextSize
|
||||
DefaultBatchSize = config.DefaultPhysicalBatch
|
||||
)
|
||||
|
||||
// EffectiveContextSize is the context window the backend will run with: the
|
||||
@@ -129,7 +198,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
|
||||
ctxSize := EffectiveContextSize(c)
|
||||
b := EffectiveBatchSize(c)
|
||||
|
||||
flashAttention := "auto"
|
||||
flashAttention := config.DefaultFlashAttention
|
||||
|
||||
if c.FlashAttention != nil {
|
||||
flashAttention = *c.FlashAttention
|
||||
@@ -175,7 +244,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
|
||||
mmlock = *c.MMlock
|
||||
}
|
||||
|
||||
nGPULayers := 9999999
|
||||
nGPULayers := config.DefaultNGPULayers
|
||||
if c.NGPULayers != nil {
|
||||
nGPULayers = *c.NGPULayers
|
||||
}
|
||||
|
||||
88
core/backend/sound_classification.go
Normal file
88
core/backend/sound_classification.go
Normal file
@@ -0,0 +1,88 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
// SoundDetectionRequest holds the parameters the HTTP layer gathers for an
// audio-tagging / sound-event-classification call.
type SoundDetectionRequest struct {
	// Audio is the on-disk path of the uploaded clip.
	Audio string
	// TopK is optional; 0 means the backend default.
	TopK int32
	// Threshold is optional; 0 means the backend default.
	Threshold float32
}
|
||||
|
||||
func (r *SoundDetectionRequest) toProto() *proto.SoundDetectionRequest {
|
||||
return &proto.SoundDetectionRequest{
|
||||
Src: r.Audio,
|
||||
TopK: r.TopK,
|
||||
Threshold: r.Threshold,
|
||||
}
|
||||
}
|
||||
|
||||
func loadSoundDetectionModel(ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (grpcPkg.Backend, error) {
|
||||
if modelConfig.Backend == "" {
|
||||
return nil, fmt.Errorf("sound classification: model %q has no backend set; supported backends include ced", modelConfig.Name)
|
||||
}
|
||||
opts := ModelOptions(modelConfig, appConfig)
|
||||
m, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
|
||||
return nil, err
|
||||
}
|
||||
if m == nil {
|
||||
return nil, fmt.Errorf("could not load sound classification model")
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// ModelSoundDetection runs the SoundDetection RPC against the configured
|
||||
// backend and returns a normalized schema.SoundClassificationResult.
|
||||
func ModelSoundDetection(ctx context.Context, req SoundDetectionRequest, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.SoundClassificationResult, error) {
|
||||
m, err := loadSoundDetectionModel(ml, modelConfig, appConfig)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r, err := m.SoundDetection(ctx, req.toProto())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return soundClassificationResultFromProto(modelConfig.Name, r), nil
|
||||
}
|
||||
|
||||
// soundClassificationResultFromProto maps the backend detections to the
|
||||
// HTTP-facing schema, keeping the backend's score-descending order.
|
||||
func soundClassificationResultFromProto(modelName string, r *proto.SoundDetectionResponse) *schema.SoundClassificationResult {
|
||||
out := &schema.SoundClassificationResult{
|
||||
Model: modelName,
|
||||
Detections: []schema.SoundClassification{},
|
||||
}
|
||||
if r == nil {
|
||||
return out
|
||||
}
|
||||
for _, d := range r.Detections {
|
||||
if d == nil {
|
||||
continue
|
||||
}
|
||||
out.Detections = append(out.Detections, schema.SoundClassification{
|
||||
Index: int(d.Index),
|
||||
Label: d.Label,
|
||||
Score: d.Score,
|
||||
})
|
||||
}
|
||||
sort.SliceStable(out.Detections, func(i, j int) bool {
|
||||
return out.Detections[i].Score > out.Detections[j].Score
|
||||
})
|
||||
return out
|
||||
}
|
||||
@@ -93,6 +93,7 @@ type RunCMD struct {
|
||||
EnableMemoryReclaimer bool `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
|
||||
MemoryReclaimerThreshold float64 `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
|
||||
ForceEvictionWhenBusy bool `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
|
||||
SizeAwareEviction bool `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
|
||||
LRUEvictionMaxRetries int `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
|
||||
LRUEvictionRetryInterval string `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
|
||||
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
|
||||
@@ -180,6 +181,8 @@ type RunCMD struct {
|
||||
// Cloud-proxy MITM listener (off by default).
|
||||
MITMListen string `env:"LOCALAI_MITM_LISTEN" help:"Address (host:port) for the cloudproxy MITM listener. Empty = disabled. Clients set HTTPS_PROXY=http://<this>:<port>. Intercept hosts are declared per-model via the model YAML mitm.hosts: block; create one from the Add Model UI." group:"middleware"`
|
||||
MITMCADir string `env:"LOCALAI_MITM_CA_DIR" type:"path" help:"Directory holding the MITM proxy CA cert + key. Defaults to <data-path>/mitm-ca." group:"middleware"`
|
||||
|
||||
PIIDefaultDetectors []string `env:"LOCALAI_PII_DEFAULT_DETECTORS" help:"Instance-wide default PII/secret detector model names applied to any PII-enabled model (chiefly cloud-proxy / MITM models) that names no pii.detectors of its own. Comma-separated, e.g. privacy-filter-nemotron,secret-filter. Takes precedence over the value persisted via the Middleware UI." group:"middleware"`
|
||||
}
|
||||
|
||||
func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
@@ -242,6 +245,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
config.WithAPIAddress(r.Address),
|
||||
config.WithMITMListen(r.MITMListen),
|
||||
config.WithMITMCADir(r.MITMCADir),
|
||||
config.WithPIIDefaultDetectors(r.PIIDefaultDetectors),
|
||||
config.WithAgentJobRetentionDays(r.AgentJobRetentionDays),
|
||||
config.WithLlamaCPPTunnelCallback(func(tunnels []string) {
|
||||
tunnelEnvVar := strings.Join(tunnels, ",")
|
||||
@@ -564,6 +568,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
if r.ForceEvictionWhenBusy {
|
||||
opts = append(opts, config.WithForceEvictionWhenBusy(true))
|
||||
}
|
||||
if r.SizeAwareEviction {
|
||||
opts = append(opts, config.WithSizeAwareEviction(true))
|
||||
}
|
||||
if r.LRUEvictionMaxRetries > 0 {
|
||||
opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
|
||||
}
|
||||
|
||||
@@ -119,6 +119,7 @@ type ApplicationConfig struct {
|
||||
|
||||
// Eviction settings
|
||||
ForceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
||||
SizeAwareEviction bool // Evict largest models first rather than least-recently-used (default: false)
|
||||
LRUEvictionMaxRetries int // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
||||
LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
|
||||
|
||||
@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
|
||||
}
|
||||
}
|
||||
|
||||
// WithSizeAwareEviction enables size-aware eviction ordering.
|
||||
// When true, the watchdog evicts the largest loaded model first rather than the
|
||||
// least-recently-used one, keeping small utility models resident and maximizing
|
||||
// memory freed per eviction.
|
||||
func WithSizeAwareEviction(enabled bool) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.SizeAwareEviction = enabled
|
||||
}
|
||||
}
|
||||
|
||||
// WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
|
||||
func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
@@ -701,6 +712,18 @@ func WithMITMCADir(dir string) AppOption {
|
||||
}
|
||||
}
|
||||
|
||||
// WithPIIDefaultDetectors sets the instance-wide default PII/secret detector
|
||||
// model names applied to any PII-enabled model (chiefly cloud-proxy / MITM
|
||||
// models) that names no pii.detectors of its own. CLI/env:
|
||||
// LOCALAI_PII_DEFAULT_DETECTORS. Empty leaves the value to
|
||||
// runtime_settings.json / the Middleware UI; a non-empty value takes
|
||||
// precedence over the file (env > file).
|
||||
func WithPIIDefaultDetectors(detectors []string) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.PIIDefaultDetectors = detectors
|
||||
}
|
||||
}
|
||||
|
||||
func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.DynamicConfigsDir = dynamicConfigsDir
|
||||
@@ -1028,6 +1051,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
|
||||
memoryReclaimerEnabled := o.MemoryReclaimerEnabled
|
||||
memoryReclaimerThreshold := o.MemoryReclaimerThreshold
|
||||
forceEvictionWhenBusy := o.ForceEvictionWhenBusy
|
||||
sizeAwareEviction := o.SizeAwareEviction
|
||||
lruEvictionMaxRetries := o.LRUEvictionMaxRetries
|
||||
threads := o.Threads
|
||||
contextSize := o.ContextSize
|
||||
@@ -1120,6 +1144,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
|
||||
MemoryReclaimerEnabled: &memoryReclaimerEnabled,
|
||||
MemoryReclaimerThreshold: &memoryReclaimerThreshold,
|
||||
ForceEvictionWhenBusy: &forceEvictionWhenBusy,
|
||||
SizeAwareEviction: &sizeAwareEviction,
|
||||
LRUEvictionMaxRetries: &lruEvictionMaxRetries,
|
||||
LRUEvictionRetryInterval: &lruEvictionRetryInterval,
|
||||
Threads: &threads,
|
||||
@@ -1244,6 +1269,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
|
||||
o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
}
|
||||
if settings.SizeAwareEviction != nil {
|
||||
o.SizeAwareEviction = *settings.SizeAwareEviction
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
}
|
||||
if settings.LRUEvictionMaxRetries != nil {
|
||||
o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
|
||||
@@ -8,27 +8,28 @@ import (
|
||||
// Usecase name constants — the canonical string values used in gallery entries,
|
||||
// model configs (known_usecases), and UsecaseInfoMap keys.
|
||||
const (
|
||||
UsecaseChat = "chat"
|
||||
UsecaseCompletion = "completion"
|
||||
UsecaseEdit = "edit"
|
||||
UsecaseVision = "vision"
|
||||
UsecaseEmbeddings = "embeddings"
|
||||
UsecaseTokenize = "tokenize"
|
||||
UsecaseImage = "image"
|
||||
UsecaseVideo = "video"
|
||||
UsecaseTranscript = "transcript"
|
||||
UsecaseTTS = "tts"
|
||||
UsecaseSoundGeneration = "sound_generation"
|
||||
UsecaseRerank = "rerank"
|
||||
UsecaseDetection = "detection"
|
||||
UsecaseDepth = "depth"
|
||||
UsecaseVAD = "vad"
|
||||
UsecaseAudioTransform = "audio_transform"
|
||||
UsecaseDiarization = "diarization"
|
||||
UsecaseRealtimeAudio = "realtime_audio"
|
||||
UsecaseFaceRecognition = "face_recognition"
|
||||
UsecaseSpeakerRecognition = "speaker_recognition"
|
||||
UsecaseTokenClassify = "token_classify"
|
||||
UsecaseChat = "chat"
|
||||
UsecaseCompletion = "completion"
|
||||
UsecaseEdit = "edit"
|
||||
UsecaseVision = "vision"
|
||||
UsecaseEmbeddings = "embeddings"
|
||||
UsecaseTokenize = "tokenize"
|
||||
UsecaseImage = "image"
|
||||
UsecaseVideo = "video"
|
||||
UsecaseTranscript = "transcript"
|
||||
UsecaseTTS = "tts"
|
||||
UsecaseSoundGeneration = "sound_generation"
|
||||
UsecaseRerank = "rerank"
|
||||
UsecaseDetection = "detection"
|
||||
UsecaseDepth = "depth"
|
||||
UsecaseVAD = "vad"
|
||||
UsecaseAudioTransform = "audio_transform"
|
||||
UsecaseDiarization = "diarization"
|
||||
UsecaseSoundClassification = "sound_classification"
|
||||
UsecaseRealtimeAudio = "realtime_audio"
|
||||
UsecaseFaceRecognition = "face_recognition"
|
||||
UsecaseSpeakerRecognition = "speaker_recognition"
|
||||
UsecaseTokenClassify = "token_classify"
|
||||
)
|
||||
|
||||
// GRPCMethod identifies a Backend service RPC from backend.proto.
|
||||
@@ -51,6 +52,7 @@ const (
|
||||
MethodVAD GRPCMethod = "VAD"
|
||||
MethodAudioTransform GRPCMethod = "AudioTransform"
|
||||
MethodDiarize GRPCMethod = "Diarize"
|
||||
MethodSoundDetection GRPCMethod = "SoundDetection"
|
||||
MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
|
||||
MethodFaceVerify GRPCMethod = "FaceVerify"
|
||||
MethodFaceAnalyze GRPCMethod = "FaceAnalyze"
|
||||
@@ -165,6 +167,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
|
||||
GRPCMethod: MethodDiarize,
|
||||
Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
|
||||
},
|
||||
UsecaseSoundClassification: {
|
||||
Flag: FLAG_SOUND_CLASSIFICATION,
|
||||
GRPCMethod: MethodSoundDetection,
|
||||
Description: "Sound-event classification / audio tagging (scored AudioSet labels like baby cry, glass breaking, alarms) via the SoundDetection RPC.",
|
||||
},
|
||||
UsecaseRealtimeAudio: {
|
||||
Flag: FLAG_REALTIME_AUDIO,
|
||||
GRPCMethod: MethodAudioToAudioStream,
|
||||
|
||||
30
core/config/defaults.go
Normal file
30
core/config/defaults.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package config
|
||||
|
||||
// Canonical default values.
//
// They live here so the two layers that need them share one source of truth:
// the config tiers (ApplyInference/Hardware/Serving/Generic), which *decide*
// defaults, and core/backend/options.go, which *translates* a ModelConfig to
// the backend wire format and supplies the same fallbacks defensively. The
// values used to be duplicated as literals in both packages and had drifted
// (e.g. n_gpu_layers 9999999 vs 99999999, two batch constants of 512).
// core/backend imports core/config, so backend references these; config never
// imports backend.
const (
	// DefaultContextSize is the context window used when none is configured
	// and none can be estimated from the model.
	DefaultContextSize = 4096

	// GGUFFallbackContextSize is the context window given to a GGUF model
	// whose metadata yields no usable estimate (see guessGGUFFromFile). It is
	// deliberately smaller than DefaultContextSize to stay conservative on
	// memory in that case.
	GGUFFallbackContextSize = 1024

	// DefaultNGPULayers stands for "offload all layers"; the backend
	// (fit_params) clamps it to what actually fits in device memory.
	DefaultNGPULayers = 99999999

	// DefaultFlashAttention is the default flash-attention mode; "auto" lets
	// the backend enable it when the model + backend support it.
	DefaultFlashAttention = "auto"
)
|
||||
115
core/config/generic_defaults.go
Normal file
115
core/config/generic_defaults.go
Normal file
@@ -0,0 +1,115 @@
|
||||
package config
|
||||
|
||||
import "os"
|
||||
|
||||
// ApplyGenericDefaults fills the generic fallback values applied after the
|
||||
// higher-priority tiers (ApplyInferenceDefaults for the model family,
|
||||
// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
|
||||
// policy): sampling parameters and a few runtime flags. Like the other tiers it
|
||||
// only fills values still left unset, so model-family / explicit config wins.
|
||||
func ApplyGenericDefaults(cfg *ModelConfig) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
||||
defaultTopP := 0.95
|
||||
defaultTopK := 40
|
||||
defaultMinP := 0.0
|
||||
defaultTemp := 0.9
|
||||
// https://github.com/mudler/LocalAI/issues/2780
|
||||
defaultMirostat := 0
|
||||
defaultMirostatTAU := 5.0
|
||||
defaultMirostatETA := 0.1
|
||||
defaultTypicalP := 1.0
|
||||
defaultTFZ := 1.0
|
||||
defaultZero := 0
|
||||
|
||||
trueV := true
|
||||
falseV := false
|
||||
|
||||
if cfg.Seed == nil {
|
||||
// random number generator seed
|
||||
defaultSeed := RAND_SEED
|
||||
cfg.Seed = &defaultSeed
|
||||
}
|
||||
|
||||
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
|
||||
// native default differs (issue #6632). Only inject it for the llama.cpp
|
||||
// family and the empty/auto backend; leave TopK nil for known non-llama
|
||||
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
|
||||
// is 0 rather than a silently-changed 40.
|
||||
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
|
||||
cfg.TopK = &defaultTopK
|
||||
}
|
||||
|
||||
if cfg.MinP == nil {
|
||||
cfg.MinP = &defaultMinP
|
||||
}
|
||||
|
||||
if cfg.TypicalP == nil {
|
||||
cfg.TypicalP = &defaultTypicalP
|
||||
}
|
||||
|
||||
if cfg.TFZ == nil {
|
||||
cfg.TFZ = &defaultTFZ
|
||||
}
|
||||
|
||||
if cfg.MMap == nil {
|
||||
// MMap is enabled by default
|
||||
|
||||
// Only exception is for Intel GPUs
|
||||
if os.Getenv("XPU") != "" {
|
||||
cfg.MMap = &falseV
|
||||
} else {
|
||||
cfg.MMap = &trueV
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.MMlock == nil {
|
||||
// MMlock is disabled by default
|
||||
cfg.MMlock = &falseV
|
||||
}
|
||||
|
||||
if cfg.TopP == nil {
|
||||
cfg.TopP = &defaultTopP
|
||||
}
|
||||
if cfg.Temperature == nil {
|
||||
cfg.Temperature = &defaultTemp
|
||||
}
|
||||
|
||||
if cfg.Maxtokens == nil {
|
||||
cfg.Maxtokens = &defaultZero
|
||||
}
|
||||
|
||||
if cfg.Mirostat == nil {
|
||||
cfg.Mirostat = &defaultMirostat
|
||||
}
|
||||
|
||||
if cfg.MirostatETA == nil {
|
||||
cfg.MirostatETA = &defaultMirostatETA
|
||||
}
|
||||
|
||||
if cfg.MirostatTAU == nil {
|
||||
cfg.MirostatTAU = &defaultMirostatTAU
|
||||
}
|
||||
|
||||
if cfg.LowVRAM == nil {
|
||||
cfg.LowVRAM = &falseV
|
||||
}
|
||||
|
||||
if cfg.Embeddings == nil {
|
||||
cfg.Embeddings = &falseV
|
||||
}
|
||||
|
||||
if cfg.Reranking == nil {
|
||||
cfg.Reranking = &falseV
|
||||
}
|
||||
|
||||
if cfg.PromptCacheAll == nil {
|
||||
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
||||
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
||||
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
||||
cfg.PromptCacheAll = &trueV
|
||||
}
|
||||
}
|
||||
36
core/config/generic_defaults_test.go
Normal file
36
core/config/generic_defaults_test.go
Normal file
@@ -0,0 +1,36 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
|
||||
It("fills sampling + runtime fallbacks when unset", func() {
|
||||
cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
|
||||
ApplyGenericDefaults(cfg)
|
||||
Expect(cfg.TopP).ToNot(BeNil())
|
||||
Expect(*cfg.TopP).To(Equal(0.95))
|
||||
Expect(*cfg.TopK).To(Equal(40))
|
||||
Expect(*cfg.Temperature).To(Equal(0.9))
|
||||
Expect(*cfg.MMap).To(BeTrue())
|
||||
Expect(*cfg.MMlock).To(BeFalse())
|
||||
Expect(*cfg.PromptCacheAll).To(BeTrue())
|
||||
})
|
||||
|
||||
It("never overrides explicit values", func() {
|
||||
tk := 7
|
||||
tp := 0.5
|
||||
cfg := &ModelConfig{}
|
||||
cfg.TopK = &tk
|
||||
cfg.TopP = &tp
|
||||
ApplyGenericDefaults(cfg)
|
||||
Expect(*cfg.TopK).To(Equal(7))
|
||||
Expect(*cfg.TopP).To(Equal(0.5))
|
||||
})
|
||||
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
@@ -14,11 +14,6 @@ import (
|
||||
"github.com/gpustack/gguf-parser-go/util/ptr"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultContextSize = 1024
|
||||
defaultNGPULayers = 99999999
|
||||
)
|
||||
|
||||
// reservedNonChatModel reports whether the operator reserved this model for an
|
||||
// internal primitive — the router score classifier or the PII NER
|
||||
// token_classify tier. Such a model has no chat template and must not be
|
||||
@@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
|
||||
cSize := int(ctxSize)
|
||||
cfg.ContextSize = &cSize
|
||||
} else {
|
||||
defaultCtx = defaultContextSize
|
||||
defaultCtx = GGUFFallbackContextSize
|
||||
cfg.ContextSize = &defaultCtx
|
||||
}
|
||||
}
|
||||
@@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
|
||||
|
||||
if cfg.NGPULayers == nil {
|
||||
// we assume we want to offload all layers
|
||||
defaultHigh := defaultNGPULayers
|
||||
defaultHigh := DefaultNGPULayers
|
||||
cfg.NGPULayers = &defaultHigh
|
||||
}
|
||||
|
||||
|
||||
180
core/config/hardware_defaults.go
Normal file
180
core/config/hardware_defaults.go
Normal file
@@ -0,0 +1,180 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// Hardware-driven model-config defaults.
|
||||
//
|
||||
// This sits alongside the other config overriders (ApplyInferenceDefaults for
|
||||
// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
|
||||
// heuristically fill ModelConfig values the user left unset. Hardware tuning is
|
||||
// the same domain — "adjust the config from the device that will run it" — so
|
||||
// it lives here rather than scattered into the backend or a separate package.
|
||||
//
|
||||
// The heuristics are parameterized on a GPU descriptor (not on direct
|
||||
// detection) so they apply in both deployment shapes: SetDefaults passes the
|
||||
// LocalGPU on a single host, and the distributed router passes the *selected
|
||||
// node's* reported GPU before loading there (the frontend that loaded the
|
||||
// config may have no GPU at all).
|
||||
|
||||
// GPU is the descriptor of the device a model will run on. The hardware
// heuristics in this file take it as input instead of probing the machine
// themselves.
type GPU struct {
	// Vendor names the GPU vendor: "nvidia", "amd", … (matches the xsysinfo
	// vendor constants).
	Vendor string
	// ComputeCapability holds the NVIDIA compute capability in "major.minor"
	// form (e.g. "12.1" for GB10 / DGX Spark). It is empty for non-NVIDIA or
	// unknown devices.
	ComputeCapability string
	// VRAM is the total device memory in bytes; 0 means unknown.
	VRAM uint64
}
|
||||
|
||||
// Defaults for the physical batch (n_batch / n_ubatch).
const (
	// DefaultPhysicalBatch is the conservative value used when no
	// hardware-specific tuning applies. core/backend.DefaultBatchSize
	// references this (single source).
	DefaultPhysicalBatch = 512
	// BlackwellPhysicalBatch is the value used on NVIDIA Blackwell consumer
	// GPUs (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger
	// physical batch materially lifts MoE prefill there (per-expert GEMM tiles
	// fill better); measured on a GB10 with Qwen3-30B-A3B to saturate around
	// 2048.
	BlackwellPhysicalBatch = 2048
)
|
||||
|
||||
// IsNVIDIABlackwell reports whether the GPU is in the NVIDIA Blackwell consumer
|
||||
// family (sm_12x). Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0)
|
||||
// reports a different compute capability and is intentionally not matched.
|
||||
func (g GPU) IsNVIDIABlackwell() bool {
|
||||
maj, _ := parseComputeCapability(g.ComputeCapability)
|
||||
return maj >= 12
|
||||
}
|
||||
|
||||
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
||||
// given hardware, used when the model config leaves batch unset.
|
||||
func PhysicalBatch(g GPU) int {
|
||||
if g.IsNVIDIABlackwell() {
|
||||
return BlackwellPhysicalBatch
|
||||
}
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
|
||||
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
||||
// Callers that re-tune a value chosen by an upstream host (the distributed
|
||||
// router correcting the frontend's guess) use this to avoid clobbering an
|
||||
// explicit user batch such as 1024.
|
||||
func IsManagedPhysicalBatch(n int) bool {
|
||||
return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
|
||||
}
|
||||
|
||||
// VRAM tiers for the parallel-slot (n_parallel) default. llama.cpp serializes
// requests at n_parallel=1 (the backend default) and only auto-enables
// continuous batching when n_parallel > 1, so a single-slot default makes
// concurrent requests queue. The slot count is therefore defaulted by GPU size
// so multi-user serving works out of the box. With the backend's unified KV
// cache the slots SHARE the context budget, so more slots add concurrency
// without multiplying KV memory.
const (
	parallelSlotsVRAMHigh uint64 = 32 << 30 // >=32 GiB -> 8 slots
	parallelSlotsVRAMMid  uint64 = 8 << 30  // >=8 GiB -> 4 slots
	parallelSlotsVRAMLow  uint64 = 4 << 30  // >=4 GiB -> 2 slots
)
|
||||
|
||||
// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns
|
||||
// 1 (no concurrency) when VRAM is unknown or too small, so we never change
|
||||
// behavior on CPU-only / tiny devices.
|
||||
func DefaultParallelSlots(g GPU) int {
|
||||
switch {
|
||||
case g.VRAM >= parallelSlotsVRAMHigh:
|
||||
return 8
|
||||
case g.VRAM >= parallelSlotsVRAMMid:
|
||||
return 4
|
||||
case g.VRAM >= parallelSlotsVRAMLow:
|
||||
return 2
|
||||
default:
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
|
||||
// model doesn't already set one (and the GPU warrants concurrency). Returns the
|
||||
// possibly-extended options. Shared by the single-host config path
|
||||
// (ApplyHardwareDefaults) and the distributed router (per selected node).
|
||||
func EnsureParallelOption(opts []string, gpu GPU) []string {
|
||||
if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
|
||||
return append(opts, fmt.Sprintf("parallel:%d", slots))
|
||||
}
|
||||
return opts
|
||||
}
|
||||
|
||||
// hasParallelOption reports whether the model already sets parallel/n_parallel
|
||||
// so we never override an explicit value (helper shared with serving_defaults.go).
|
||||
func hasParallelOption(opts []string) bool {
|
||||
return backendOptionSet(opts, "parallel", "n_parallel")
|
||||
}
|
||||
|
||||
// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
|
||||
// a single host (the distributed router builds it from the selected node's
|
||||
// reported info instead). It is a package var so tests can inject a
|
||||
// deterministic device — detection does a live nvidia-smi call.
|
||||
var localGPU = func() GPU {
|
||||
vendor, _ := xsysinfo.DetectGPUVendor()
|
||||
vram, _ := xsysinfo.TotalAvailableVRAM()
|
||||
return GPU{
|
||||
Vendor: vendor,
|
||||
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
||||
VRAM: vram,
|
||||
}
|
||||
}
|
||||
|
||||
// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
|
||||
// and were left unset by the user. Currently: a larger physical batch on
|
||||
// Blackwell. Explicit config always wins (we only touch zero values).
|
||||
func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
|
||||
cfg.Batch = BlackwellPhysicalBatch
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
|
||||
}
|
||||
|
||||
// Enable concurrent serving by default on a capable GPU: without this the
|
||||
// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
|
||||
// (continuous batching stays off). Unified KV means the slots share the
|
||||
// context budget, so this is concurrency without extra KV memory. Explicit
|
||||
// parallel/n_parallel in the model options always wins.
|
||||
if before := len(cfg.Options); true {
|
||||
cfg.Options = EnsureParallelOption(cfg.Options, gpu)
|
||||
if len(cfg.Options) > before {
|
||||
xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
|
||||
"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseComputeCapability splits a "major.minor" compute-capability string into
// its integer parts. Surrounding whitespace is ignored.
//
// It returns (-1, -1) when cc is empty or the major part is not an integer.
// The minor part is parsed leniently: when it is missing ("12") or not an
// integer ("12.x", "12.1.3") it is reported as 0 while the major is kept.
func parseComputeCapability(cc string) (int, int) {
	cc = strings.TrimSpace(cc)
	if cc == "" {
		return -1, -1
	}

	// Without a dot the whole string is the major part and minorStr is "",
	// which falls into the lenient minor handling below.
	majorStr, minorStr, _ := strings.Cut(cc, ".")

	major, err := strconv.Atoi(strings.TrimSpace(majorStr))
	if err != nil {
		return -1, -1
	}

	minor, err := strconv.Atoi(strings.TrimSpace(minorStr))
	if err != nil {
		minor = 0
	}
	return major, minor
}
|
||||
37
core/config/hardware_defaults_internal_test.go
Normal file
37
core/config/hardware_defaults_internal_test.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// Single-instance path: SetDefaults applies hardware defaults from the local
|
||||
// GPU. The detection seam (localGPU) is injected so the path is deterministic
|
||||
// without a real GPU.
|
||||
var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
|
||||
var orig func() GPU
|
||||
BeforeEach(func() { orig = localGPU })
|
||||
AfterEach(func() { localGPU = orig })
|
||||
|
||||
It("sets the physical batch on a local Blackwell GPU", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
|
||||
It("leaves batch unset on a non-Blackwell local GPU", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
|
||||
It("never overrides an explicit batch", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(1024))
|
||||
})
|
||||
})
|
||||
97
core/config/hardware_defaults_test.go
Normal file
97
core/config/hardware_defaults_test.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("Hardware-driven config defaults", func() {
|
||||
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
||||
func(cc string, want bool) {
|
||||
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
||||
},
|
||||
Entry("GB10 12.1", "12.1", true),
|
||||
Entry("RTX 50 12.0", "12.0", true),
|
||||
Entry("future 13.0", "13.0", true),
|
||||
Entry("Hopper 9.0", "9.0", false),
|
||||
Entry("Ada 8.9", "8.9", false),
|
||||
Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
|
||||
Entry("unknown", "", false),
|
||||
)
|
||||
|
||||
Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
|
||||
It("returns the Blackwell batch on Blackwell", func() {
|
||||
Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("returns the default batch otherwise", func() {
|
||||
Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
|
||||
Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
It("recognizes managed defaults but not explicit values", func() {
|
||||
Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
|
||||
Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
|
||||
Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ApplyHardwareDefaults", func() {
|
||||
It("raises an unset batch to 2048 on Blackwell", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("leaves batch unset on non-Blackwell", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
It("never overrides an explicit batch", func() {
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
Expect(cfg.Batch).To(Equal(1024))
|
||||
})
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
DescribeTable("DefaultParallelSlots (by VRAM)",
|
||||
func(vramGiB uint64, want int) {
|
||||
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
||||
},
|
||||
Entry("GB10 119 GiB", uint64(119), 8),
|
||||
Entry("48 GiB", uint64(48), 8),
|
||||
Entry("24 GiB", uint64(24), 4),
|
||||
Entry("8 GiB", uint64(8), 4),
|
||||
Entry("6 GiB", uint64(6), 2),
|
||||
Entry("2 GiB", uint64(2), 1),
|
||||
Entry("unknown 0", uint64(0), 1),
|
||||
)
|
||||
|
||||
Describe("ApplyHardwareDefaults parallel slots", func() {
|
||||
It("adds a VRAM-scaled parallel option on a capable GPU", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Options).To(ContainElement("parallel:8"))
|
||||
})
|
||||
It("scales the slot count down with VRAM", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
|
||||
Expect(cfg.Options).To(ContainElement("parallel:4"))
|
||||
})
|
||||
It("adds no parallel option on small/unknown VRAM", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
|
||||
Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
|
||||
})
|
||||
It("never overrides an explicit parallel option", func() {
|
||||
cfg := &ModelConfig{Options: []string{"parallel:2"}}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
|
||||
Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
|
||||
// Default context size if not set, regardless of whether GGUF parsing succeeds
|
||||
defer func() {
|
||||
if cfg.ContextSize == nil {
|
||||
ctx := defaultContextSize
|
||||
ctx := GGUFFallbackContextSize
|
||||
cfg.ContextSize = &ctx
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -68,6 +68,7 @@ var UsecaseOptions = []FieldOption{
|
||||
{Value: "face_recognition", Label: "Face Recognition"},
|
||||
{Value: "transcript", Label: "Transcript"},
|
||||
{Value: "diarization", Label: "Diarization"},
|
||||
{Value: "sound_classification", Label: "Sound Classification"},
|
||||
{Value: "speaker_recognition", Label: "Speaker Recognition"},
|
||||
{Value: "tts", Label: "TTS"},
|
||||
{Value: "sound_generation", Label: "Sound Generation"},
|
||||
|
||||
@@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
Order: 45,
|
||||
},
|
||||
|
||||
// --- Alias ---
|
||||
"alias": {
|
||||
Section: "alias",
|
||||
Label: "Alias target",
|
||||
Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.",
|
||||
Component: "model-select",
|
||||
Order: 0,
|
||||
},
|
||||
|
||||
// --- Pipeline ---
|
||||
"pipeline.llm": {
|
||||
Section: "pipeline",
|
||||
@@ -319,6 +328,30 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
AutocompleteProvider: ProviderModelsVAD,
|
||||
Order: 63,
|
||||
},
|
||||
"pipeline.sound_detection": {
|
||||
Section: "pipeline",
|
||||
Label: "Sound Detection Model",
|
||||
Description: "Model to use for sound-event classification (audio tagging, e.g. ced) in the pipeline. When set, committed realtime audio is also classified and the scored AudioSet tags are emitted as a conversation.item.sound_detection event.",
|
||||
Component: "model-select",
|
||||
AutocompleteProvider: ProviderModels,
|
||||
Order: 64,
|
||||
},
|
||||
"pipeline.sound_detection_window_ms": {
|
||||
Section: "pipeline",
|
||||
Label: "Sound Detection Window (ms)",
|
||||
Description: "Server-side windowing for a sound-only realtime session: length in ms of the audio window classified each hop. 0 = client-driven (the client commits windows).",
|
||||
Component: "number",
|
||||
Min: f64(0),
|
||||
Order: 65,
|
||||
},
|
||||
"pipeline.sound_detection_hop_ms": {
|
||||
Section: "pipeline",
|
||||
Label: "Sound Detection Hop (ms)",
|
||||
Description: "Server-side windowing hop in ms: how often the server classifies the last window. 0 = client-driven.",
|
||||
Component: "number",
|
||||
Min: f64(0),
|
||||
Order: 66,
|
||||
},
|
||||
"pipeline.reasoning_effort": {
|
||||
Section: "pipeline",
|
||||
Label: "Reasoning Effort",
|
||||
@@ -448,6 +481,55 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
Component: "json-editor",
|
||||
Order: 78,
|
||||
},
|
||||
"pipeline.voice_recognition.enforce": {
|
||||
Section: "pipeline",
|
||||
Label: "Voice Gate Enforce",
|
||||
Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.",
|
||||
Component: "toggle",
|
||||
Order: 80,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.announce": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Announce",
|
||||
Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.",
|
||||
Component: "toggle",
|
||||
Order: 81,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.announce_unknown": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Announce Unknown",
|
||||
Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.",
|
||||
Component: "toggle",
|
||||
Order: 82,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.personalize": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Personalize",
|
||||
Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.",
|
||||
Component: "toggle",
|
||||
Order: 83,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.inject_name": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Inject Name",
|
||||
Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.",
|
||||
Component: "toggle",
|
||||
Order: 84,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.inject_system_note": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Inject System Note",
|
||||
Description: "Personalization: append a 'The current speaker is <name>.' note to the system message reflecting the latest speaker.",
|
||||
Component: "toggle",
|
||||
Order: 85,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.note_unknown": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Note Unknown",
|
||||
Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.",
|
||||
Component: "toggle",
|
||||
Order: 86,
|
||||
},
|
||||
"pipeline.max_history_items": {
|
||||
Section: "pipeline",
|
||||
Label: "Max History Items",
|
||||
@@ -455,6 +537,36 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
Component: "number",
|
||||
Order: 79,
|
||||
},
|
||||
"pipeline.compaction.enabled": {
|
||||
Section: "pipeline",
|
||||
Label: "Compaction Enabled",
|
||||
Description: "Fold conversation items that age out of the live window (Max History Items) into a rolling summary instead of dropping them, so long realtime sessions stay cheap without losing earlier context. Off by default.",
|
||||
Component: "toggle",
|
||||
Order: 80,
|
||||
},
|
||||
"pipeline.compaction.trigger_items": {
|
||||
Section: "pipeline",
|
||||
Label: "Compaction Trigger Items",
|
||||
Description: "High-water mark: once the live conversation exceeds this many items, the overflow above Max History Items is summarized and evicted. Must be greater than Max History Items; defaults to twice it. The gap controls how often summarization runs.",
|
||||
Component: "number",
|
||||
Order: 81,
|
||||
},
|
||||
"pipeline.compaction.summary_model": {
|
||||
Section: "pipeline",
|
||||
Label: "Compaction Summary Model",
|
||||
Description: "Optional smaller/cheaper model used to produce the rolling summary. Empty reuses the pipeline's own LLM. On CPU, a tiny model here keeps compaction from competing with the conversation LLM.",
|
||||
Component: "input",
|
||||
Advanced: true,
|
||||
Order: 82,
|
||||
},
|
||||
"pipeline.compaction.max_summary_tokens": {
|
||||
Section: "pipeline",
|
||||
Label: "Compaction Max Summary Tokens",
|
||||
Description: "Advisory cap on the rolling summary length (fed to the summarizer prompt). Defaults to 512.",
|
||||
Component: "number",
|
||||
Advanced: true,
|
||||
Order: 83,
|
||||
},
|
||||
|
||||
// --- Functions ---
|
||||
"function.grammar.parallel_calls": {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user