mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 16:19:07 -04:00
Compare commits
62 Commits
worktree-f
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd8c8778e2 | ||
|
|
06a7b6cadb | ||
|
|
67c8889866 | ||
|
|
1d49041c85 | ||
|
|
2edc4e25b3 | ||
|
|
7888067914 | ||
|
|
9eedbf537a | ||
|
|
69c16481c8 | ||
|
|
56f8a6623f | ||
|
|
4755d676a3 | ||
|
|
10184b5e28 | ||
|
|
fdf475ec5f | ||
|
|
9d54a599b0 | ||
|
|
63bcbf6c12 | ||
|
|
95b058e1c5 | ||
|
|
f2abcc7503 | ||
|
|
62c99c10b3 | ||
|
|
7226bb9f30 | ||
|
|
569d9bbd9e | ||
|
|
682fb2718c | ||
|
|
20c643e1f6 | ||
|
|
64a4351f3a | ||
|
|
b7d67f5779 | ||
|
|
600dafd20b | ||
|
|
ce8a3e9266 | ||
|
|
a88d9d2de3 | ||
|
|
1cf1bf32e1 | ||
|
|
f45c6acc54 | ||
|
|
1a1bd57469 | ||
|
|
1f29e96030 | ||
|
|
64560a974b | ||
|
|
32c47706ae | ||
|
|
e58870a573 | ||
|
|
8fab1d2e45 | ||
|
|
7b462a0d51 | ||
|
|
aed181e6c1 | ||
|
|
a556cd9afc | ||
|
|
b50b1fe418 | ||
|
|
b4c0dc67fe | ||
|
|
01fa12e0de | ||
|
|
cf7f9573a2 | ||
|
|
c6303104c7 | ||
|
|
3e96d811b7 | ||
|
|
23f225260c | ||
|
|
aef10723c9 | ||
|
|
9565db5f94 | ||
|
|
e19c43cf04 | ||
|
|
b081247d95 | ||
|
|
1be959ce30 | ||
|
|
518381278e | ||
|
|
93706fec57 | ||
|
|
11aee03a80 | ||
|
|
8915f2ab91 | ||
|
|
f143d7f688 | ||
|
|
dd928f0bdd | ||
|
|
c43a752afc | ||
|
|
079ac0e15a | ||
|
|
2e734bf560 | ||
|
|
72d46c1115 | ||
|
|
606128e4e9 | ||
|
|
59c7ad5153 | ||
|
|
78d682224a |
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
|
|||||||
- If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
|
- If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
|
||||||
- Check similar backends to determine the correct context
|
- Check similar backends to determine the correct context
|
||||||
|
|
||||||
|
## Documenting the backend (README + docs)
|
||||||
|
|
||||||
|
A backend is not "added" until it is discoverable. Update the user-facing docs:
|
||||||
|
|
||||||
|
- **`docs/content/features/backends.md`** - add the backend to the right
|
||||||
|
category in the "LocalAI supports various types of backends" list (and add a
|
||||||
|
new category if it introduces a new modality, e.g. sound classification).
|
||||||
|
- If the backend introduces a **new API surface** (a new endpoint or a realtime
|
||||||
|
capability), document it under `docs/content/` where its area lives (audio,
|
||||||
|
vision, etc.) and follow the api-endpoints checklist in
|
||||||
|
[api-endpoints-and-auth.md](api-endpoints-and-auth.md).
|
||||||
|
|
||||||
|
**If the backend is a native C/C++/GGML engine created and maintained by the
|
||||||
|
LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
|
||||||
|
`vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
|
||||||
|
ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
|
||||||
|
engines ... developed and maintained by the LocalAI project itself". Add a row
|
||||||
|
linking the upstream engine repo with a one-line description. This is the
|
||||||
|
project's showcase of its own engines; a new in-house backend that is missing
|
||||||
|
from it is a documentation bug.
|
||||||
|
|
||||||
## 5. Verification Checklist
|
## 5. Verification Checklist
|
||||||
|
|
||||||
After adding a new backend, verify:
|
After adding a new backend, verify:
|
||||||
@@ -211,6 +232,8 @@ After adding a new backend, verify:
|
|||||||
- [ ] No YAML syntax errors (check with linter)
|
- [ ] No YAML syntax errors (check with linter)
|
||||||
- [ ] No Makefile syntax errors (check with linter)
|
- [ ] No Makefile syntax errors (check with linter)
|
||||||
- [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
|
- [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
|
||||||
|
- [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
|
||||||
|
- [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`
|
||||||
|
|
||||||
## Bundling runtime shared libraries (`package.sh`)
|
## Bundling runtime shared libraries (`package.sh`)
|
||||||
|
|
||||||
|
|||||||
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
|
|||||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||||
|
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
|
||||||
|
# manifests. The LunarG SDK below only provides the loader and shader
|
||||||
|
# tooling, not hardware drivers — without Mesa the packaged Vulkan backend
|
||||||
|
# would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
|
||||||
|
# .so files plus their deps into the backend so it stays self-contained.
|
||||||
|
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||||
if [ "amd64" = "${TARGETARCH:-}" ]; then
|
if [ "amd64" = "${TARGETARCH:-}" ]; then
|
||||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
|
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
|
||||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
|
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
|
||||||
|
|||||||
152
.github/backend-matrix.yml
vendored
152
.github/backend-matrix.yml
vendored
@@ -3575,6 +3575,154 @@ include:
|
|||||||
dockerfile: "./backend/Dockerfile.golang"
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
# ced
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "8"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-12-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-13-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
platform-tag: 'amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
platform-tag: 'arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-ced'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f32'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f32-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f16'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f16-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
platform-tag: 'amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
platform-tag: 'arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-ced'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-arm64-ced'
|
||||||
|
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2204'
|
||||||
|
- build-type: 'hipblas'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-rocm-hipblas-ced'
|
||||||
|
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
# acestep-cpp
|
# acestep-cpp
|
||||||
- build-type: ''
|
- build-type: ''
|
||||||
cuda-major-version: ""
|
cuda-major-version: ""
|
||||||
@@ -4754,6 +4902,10 @@ includeDarwin:
|
|||||||
tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
|
tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
|
||||||
build-type: "metal"
|
build-type: "metal"
|
||||||
lang: "go"
|
lang: "go"
|
||||||
|
- backend: "ced"
|
||||||
|
tag-suffix: "-metal-darwin-arm64-ced"
|
||||||
|
build-type: "metal"
|
||||||
|
lang: "go"
|
||||||
- backend: "acestep-cpp"
|
- backend: "acestep-cpp"
|
||||||
tag-suffix: "-metal-darwin-arm64-acestep-cpp"
|
tag-suffix: "-metal-darwin-arm64-acestep-cpp"
|
||||||
build-type: "metal"
|
build-type: "metal"
|
||||||
|
|||||||
2
.github/workflows/backend.yml
vendored
2
.github/workflows/backend.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
|||||||
has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
|
has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
|
|
||||||
- name: Setup Bun
|
- name: Setup Bun
|
||||||
uses: oven-sh/setup-bun@v2
|
uses: oven-sh/setup-bun@v2
|
||||||
|
|||||||
2
.github/workflows/backend_build.yml
vendored
2
.github/workflows/backend_build.yml
vendored
@@ -101,7 +101,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
|
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
|
||||||
|
|||||||
2
.github/workflows/backend_build_darwin.yml
vendored
2
.github/workflows/backend_build_darwin.yml
vendored
@@ -57,7 +57,7 @@ jobs:
|
|||||||
HOMEBREW_NO_ANALYTICS: '1'
|
HOMEBREW_NO_ANALYTICS: '1'
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
|
||||||
|
|||||||
2
.github/workflows/backend_merge.yml
vendored
2
.github/workflows/backend_merge.yml
vendored
@@ -49,7 +49,7 @@ jobs:
|
|||||||
# Sparse checkout: the merge job needs `.github/scripts/` (for the
|
# Sparse checkout: the merge job needs `.github/scripts/` (for the
|
||||||
# keepalive cleanup script) but none of the source tree.
|
# keepalive cleanup script) but none of the source tree.
|
||||||
- name: Checkout (.github/scripts only)
|
- name: Checkout (.github/scripts only)
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
sparse-checkout: |
|
sparse-checkout: |
|
||||||
.github/scripts
|
.github/scripts
|
||||||
|
|||||||
2
.github/workflows/backend_pr.yml
vendored
2
.github/workflows/backend_pr.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
|||||||
has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
|
has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
|
|
||||||
- name: Setup Bun
|
- name: Setup Bun
|
||||||
uses: oven-sh/setup-bun@v2
|
uses: oven-sh/setup-bun@v2
|
||||||
|
|||||||
2
.github/workflows/base-images.yml
vendored
2
.github/workflows/base-images.yml
vendored
@@ -127,7 +127,7 @@ jobs:
|
|||||||
# the original l4t matrix entry which set skip-drivers: 'true'.
|
# the original l4t matrix entry which set skip-drivers: 'true'.
|
||||||
skip-drivers: 'true'
|
skip-drivers: 'true'
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: false
|
submodules: false
|
||||||
- name: Free disk space
|
- name: Free disk space
|
||||||
|
|||||||
6
.github/workflows/build-test.yaml
vendored
6
.github/workflows/build-test.yaml
vendored
@@ -11,7 +11,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
- name: Set up Go
|
- name: Set up Go
|
||||||
@@ -25,7 +25,7 @@ jobs:
|
|||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
- name: Set up Go
|
- name: Set up Go
|
||||||
@@ -47,7 +47,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
- name: Configure apt mirror on runner
|
- name: Configure apt mirror on runner
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ jobs:
|
|||||||
bump:
|
bump:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v7
|
||||||
|
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
|
|||||||
8
.github/workflows/bump_deps.yaml
vendored
8
.github/workflows/bump_deps.yaml
vendored
@@ -42,6 +42,10 @@ jobs:
|
|||||||
variable: "PARAKEET_VERSION"
|
variable: "PARAKEET_VERSION"
|
||||||
branch: "master"
|
branch: "master"
|
||||||
file: "backend/go/parakeet-cpp/Makefile"
|
file: "backend/go/parakeet-cpp/Makefile"
|
||||||
|
- repository: "mudler/ced.cpp"
|
||||||
|
variable: "CED_VERSION"
|
||||||
|
branch: "master"
|
||||||
|
file: "backend/go/ced/Makefile"
|
||||||
- repository: "mudler/depth-anything.cpp"
|
- repository: "mudler/depth-anything.cpp"
|
||||||
variable: "DEPTHANYTHING_VERSION"
|
variable: "DEPTHANYTHING_VERSION"
|
||||||
branch: "master"
|
branch: "master"
|
||||||
@@ -88,7 +92,7 @@ jobs:
|
|||||||
file: "backend/go/vibevoice-cpp/Makefile"
|
file: "backend/go/vibevoice-cpp/Makefile"
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v7
|
||||||
- name: Bump dependencies 🔧
|
- name: Bump dependencies 🔧
|
||||||
id: bump
|
id: bump
|
||||||
run: |
|
run: |
|
||||||
@@ -124,7 +128,7 @@ jobs:
|
|||||||
if: github.repository == 'mudler/LocalAI'
|
if: github.repository == 'mudler/LocalAI'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v7
|
||||||
- name: Bump vLLM cu130 wheel pin 🔧
|
- name: Bump vLLM cu130 wheel pin 🔧
|
||||||
id: bump
|
id: bump
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
2
.github/workflows/bump_docs.yaml
vendored
2
.github/workflows/bump_docs.yaml
vendored
@@ -13,7 +13,7 @@ jobs:
|
|||||||
- repository: "mudler/LocalAI"
|
- repository: "mudler/LocalAI"
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v7
|
||||||
- name: Bump dependencies 🔧
|
- name: Bump dependencies 🔧
|
||||||
run: |
|
run: |
|
||||||
bash .github/bump_docs.sh ${{ matrix.repository }}
|
bash .github/bump_docs.sh ${{ matrix.repository }}
|
||||||
|
|||||||
2
.github/workflows/checksum_checker.yaml
vendored
2
.github/workflows/checksum_checker.yaml
vendored
@@ -8,7 +8,7 @@ jobs:
|
|||||||
if: github.repository == 'mudler/LocalAI'
|
if: github.repository == 'mudler/LocalAI'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v7
|
||||||
- name: Configure apt mirror on runner
|
- name: Configure apt mirror on runner
|
||||||
uses: ./.github/actions/configure-apt-mirror
|
uses: ./.github/actions/configure-apt-mirror
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
|
|||||||
2
.github/workflows/deploy-explorer.yaml
vendored
2
.github/workflows/deploy-explorer.yaml
vendored
@@ -16,7 +16,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
|
|||||||
2
.github/workflows/gallery-agent.yaml
vendored
2
.github/workflows/gallery-agent.yaml
vendored
@@ -31,7 +31,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
token: ${{ secrets.GITHUB_TOKEN }}
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
|
|||||||
2
.github/workflows/generate_intel_image.yaml
vendored
2
.github/workflows/generate_intel_image.yaml
vendored
@@ -44,7 +44,7 @@ jobs:
|
|||||||
uses: docker/setup-buildx-action@master
|
uses: docker/setup-buildx-action@master
|
||||||
|
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
|
|
||||||
- name: Cache Intel images
|
- name: Cache Intel images
|
||||||
uses: docker/build-push-action@v7
|
uses: docker/build-push-action@v7
|
||||||
|
|||||||
2
.github/workflows/gh-pages.yml
vendored
2
.github/workflows/gh-pages.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
|||||||
HUGO_VERSION: "0.146.3"
|
HUGO_VERSION: "0.146.3"
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0 # needed for enableGitInfo
|
fetch-depth: 0 # needed for enableGitInfo
|
||||||
submodules: true
|
submodules: true
|
||||||
|
|||||||
2
.github/workflows/image_build.yml
vendored
2
.github/workflows/image_build.yml
vendored
@@ -80,7 +80,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
|
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
|
|
||||||
- name: Configure apt mirror on runner
|
- name: Configure apt mirror on runner
|
||||||
id: apt_mirror
|
id: apt_mirror
|
||||||
|
|||||||
2
.github/workflows/image_merge.yml
vendored
2
.github/workflows/image_merge.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
|||||||
# Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
|
# Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
|
||||||
# script). Skips the rest of the source tree.
|
# script). Skips the rest of the source tree.
|
||||||
- name: Checkout (.github/scripts only)
|
- name: Checkout (.github/scripts only)
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
sparse-checkout: |
|
sparse-checkout: |
|
||||||
.github/scripts
|
.github/scripts
|
||||||
|
|||||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -20,7 +20,7 @@ jobs:
|
|||||||
golangci-lint:
|
golangci-lint:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
# Full history so golangci-lint's new-from-merge-base can reach
|
# Full history so golangci-lint's new-from-merge-base can reach
|
||||||
# origin/master and compute the diff against it.
|
# origin/master and compute the diff against it.
|
||||||
|
|||||||
6
.github/workflows/release.yaml
vendored
6
.github/workflows/release.yaml
vendored
@@ -10,7 +10,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
- name: Set up Go
|
- name: Set up Go
|
||||||
@@ -28,7 +28,7 @@ jobs:
|
|||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
- name: Set up Go
|
- name: Set up Go
|
||||||
@@ -46,7 +46,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
- name: Configure apt mirror on runner
|
- name: Configure apt mirror on runner
|
||||||
|
|||||||
2
.github/workflows/secscan.yaml
vendored
2
.github/workflows/secscan.yaml
vendored
@@ -14,7 +14,7 @@ jobs:
|
|||||||
GO111MODULE: on
|
GO111MODULE: on
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout Source
|
- name: Checkout Source
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||||
- name: Run Gosec Security Scanner
|
- name: Run Gosec Security Scanner
|
||||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||||
|
|||||||
86
.github/workflows/test-extra.yml
vendored
86
.github/workflows/test-extra.yml
vendored
@@ -50,7 +50,7 @@ jobs:
|
|||||||
parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
|
parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
- name: Setup Bun
|
- name: Setup Bun
|
||||||
uses: oven-sh/setup-bun@v2
|
uses: oven-sh/setup-bun@v2
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
@@ -67,7 +67,7 @@ jobs:
|
|||||||
# runs-on: ubuntu-latest
|
# runs-on: ubuntu-latest
|
||||||
# steps:
|
# steps:
|
||||||
# - name: Clone
|
# - name: Clone
|
||||||
# uses: actions/checkout@v6
|
# uses: actions/checkout@v7
|
||||||
# with:
|
# with:
|
||||||
# submodules: true
|
# submodules: true
|
||||||
# - name: Dependencies
|
# - name: Dependencies
|
||||||
@@ -90,7 +90,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -113,7 +113,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -137,7 +137,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -158,7 +158,7 @@ jobs:
|
|||||||
# runs-on: ubuntu-latest
|
# runs-on: ubuntu-latest
|
||||||
# steps:
|
# steps:
|
||||||
# - name: Clone
|
# - name: Clone
|
||||||
# uses: actions/checkout@v6
|
# uses: actions/checkout@v7
|
||||||
# with:
|
# with:
|
||||||
# submodules: true
|
# submodules: true
|
||||||
# - name: Dependencies
|
# - name: Dependencies
|
||||||
@@ -178,7 +178,7 @@ jobs:
|
|||||||
# runs-on: ubuntu-latest
|
# runs-on: ubuntu-latest
|
||||||
# steps:
|
# steps:
|
||||||
# - name: Clone
|
# - name: Clone
|
||||||
# uses: actions/checkout@v6
|
# uses: actions/checkout@v7
|
||||||
# with:
|
# with:
|
||||||
# submodules: true
|
# submodules: true
|
||||||
# - name: Dependencies
|
# - name: Dependencies
|
||||||
@@ -240,7 +240,7 @@ jobs:
|
|||||||
# sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
|
# sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
|
||||||
# df -h
|
# df -h
|
||||||
# - name: Clone
|
# - name: Clone
|
||||||
# uses: actions/checkout@v6
|
# uses: actions/checkout@v7
|
||||||
# with:
|
# with:
|
||||||
# submodules: true
|
# submodules: true
|
||||||
# - name: Dependencies
|
# - name: Dependencies
|
||||||
@@ -265,7 +265,7 @@ jobs:
|
|||||||
# runs-on: ubuntu-latest
|
# runs-on: ubuntu-latest
|
||||||
# steps:
|
# steps:
|
||||||
# - name: Clone
|
# - name: Clone
|
||||||
# uses: actions/checkout@v6
|
# uses: actions/checkout@v7
|
||||||
# with:
|
# with:
|
||||||
# submodules: true
|
# submodules: true
|
||||||
# - name: Dependencies
|
# - name: Dependencies
|
||||||
@@ -288,7 +288,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -309,7 +309,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -330,7 +330,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -351,7 +351,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -373,7 +373,7 @@ jobs:
|
|||||||
# timeout-minutes: 45
|
# timeout-minutes: 45
|
||||||
# steps:
|
# steps:
|
||||||
# - name: Clone
|
# - name: Clone
|
||||||
# uses: actions/checkout@v6
|
# uses: actions/checkout@v7
|
||||||
# with:
|
# with:
|
||||||
# submodules: true
|
# submodules: true
|
||||||
# - name: Dependencies
|
# - name: Dependencies
|
||||||
@@ -394,7 +394,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -415,7 +415,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -436,7 +436,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -462,7 +462,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -484,7 +484,7 @@ jobs:
|
|||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -513,7 +513,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -530,7 +530,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -552,7 +552,7 @@ jobs:
|
|||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -579,7 +579,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -604,7 +604,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -625,7 +625,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -645,7 +645,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -664,7 +664,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -681,7 +681,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -698,7 +698,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -741,7 +741,7 @@ jobs:
|
|||||||
# timeout-minutes: 90
|
# timeout-minutes: 90
|
||||||
# steps:
|
# steps:
|
||||||
# - name: Clone
|
# - name: Clone
|
||||||
# uses: actions/checkout@v6
|
# uses: actions/checkout@v7
|
||||||
# with:
|
# with:
|
||||||
# submodules: true
|
# submodules: true
|
||||||
# - name: Dependencies
|
# - name: Dependencies
|
||||||
@@ -783,7 +783,7 @@ jobs:
|
|||||||
# timeout-minutes: 90
|
# timeout-minutes: 90
|
||||||
# steps:
|
# steps:
|
||||||
# - name: Clone
|
# - name: Clone
|
||||||
# uses: actions/checkout@v6
|
# uses: actions/checkout@v7
|
||||||
# with:
|
# with:
|
||||||
# submodules: true
|
# submodules: true
|
||||||
# - name: Dependencies
|
# - name: Dependencies
|
||||||
@@ -808,7 +808,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -840,7 +840,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -876,7 +876,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -915,7 +915,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -952,7 +952,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -987,7 +987,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -1013,7 +1013,7 @@ jobs:
|
|||||||
timeout-minutes: 150
|
timeout-minutes: 150
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -1042,7 +1042,7 @@ jobs:
|
|||||||
timeout-minutes: 60
|
timeout-minutes: 60
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go
|
- name: Setup Go
|
||||||
@@ -1058,7 +1058,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -1091,7 +1091,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -1114,7 +1114,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
@@ -1140,7 +1140,7 @@ jobs:
|
|||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
|
|||||||
4
.github/workflows/test.yml
vendored
4
.github/workflows/test.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
|||||||
go-version: ['1.26.x']
|
go-version: ['1.26.x']
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Free disk space
|
- name: Free disk space
|
||||||
@@ -84,7 +84,7 @@ jobs:
|
|||||||
go-version: ['1.26.x']
|
go-version: ['1.26.x']
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Setup Go ${{ matrix.go-version }}
|
- name: Setup Go ${{ matrix.go-version }}
|
||||||
|
|||||||
2
.github/workflows/tests-aio.yml
vendored
2
.github/workflows/tests-aio.yml
vendored
@@ -62,7 +62,7 @@ jobs:
|
|||||||
sudo rm -rfv build || true
|
sudo rm -rfv build || true
|
||||||
df -h
|
df -h
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
|
|||||||
2
.github/workflows/tests-e2e.yml
vendored
2
.github/workflows/tests-e2e.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
|||||||
go-version: ['1.25.x']
|
go-version: ['1.25.x']
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Configure apt mirror on runner
|
- name: Configure apt mirror on runner
|
||||||
|
|||||||
97
.github/workflows/tests-pii-ner-e2e.yml
vendored
Normal file
97
.github/workflows/tests-pii-ner-e2e.yml
vendored
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
---
|
||||||
|
name: 'PII NER tier E2E (live GGUF, CPU)'
|
||||||
|
|
||||||
|
# Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
|
||||||
|
# hermetic tests/e2e suite cannot cover (it only exercises the in-process
|
||||||
|
# pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
|
||||||
|
# GGUF), so it is path-filtered on PRs and otherwise runs nightly / on demand.
|
||||||
|
#
|
||||||
|
# This drives the container-level harness (tests/e2e-backends) via
|
||||||
|
# `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
|
||||||
|
# downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
|
||||||
|
# TokenClassify spans. The complementary HTTP-path specs in tests/e2e
|
||||||
|
# (e2e_pii_ner_test.go) Skip unless PII_NER_MODEL_GGUF is wired.
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 3 * * *'
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths:
|
||||||
|
- 'backend/cpp/privacy-filter/**'
|
||||||
|
- 'backend/Dockerfile.privacy-filter'
|
||||||
|
- 'core/services/routing/pii/**'
|
||||||
|
- 'core/services/routing/piidetector/**'
|
||||||
|
- 'core/backend/token_classify.go'
|
||||||
|
- 'core/http/endpoints/localai/pii.go'
|
||||||
|
- 'core/schema/pii.go'
|
||||||
|
- 'tests/e2e-backends/**'
|
||||||
|
- 'tests/e2e/e2e_pii_ner_test.go'
|
||||||
|
- 'tests/e2e/e2e_suite_test.go'
|
||||||
|
- '.github/workflows/tests-pii-ner-e2e.yml'
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- 'backend/cpp/privacy-filter/**'
|
||||||
|
- 'backend/Dockerfile.privacy-filter'
|
||||||
|
- 'core/services/routing/pii/**'
|
||||||
|
- 'core/services/routing/piidetector/**'
|
||||||
|
- 'core/backend/token_classify.go'
|
||||||
|
- 'core/http/endpoints/localai/pii.go'
|
||||||
|
- 'core/schema/pii.go'
|
||||||
|
- 'tests/e2e-backends/**'
|
||||||
|
- 'tests/e2e/e2e_pii_ner_test.go'
|
||||||
|
- 'tests/e2e/e2e_suite_test.go'
|
||||||
|
- '.github/workflows/tests-pii-ner-e2e.yml'
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
|
||||||
|
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
tests-pii-ner-e2e:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
go-version: ['1.25.x']
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
uses: actions/checkout@v7
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
- name: Free disk space
|
||||||
|
run: |
|
||||||
|
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
|
||||||
|
sudo docker image prune --all --force || true
|
||||||
|
df -h
|
||||||
|
- name: Configure apt mirror on runner
|
||||||
|
uses: ./.github/actions/configure-apt-mirror
|
||||||
|
- name: Setup Go ${{ matrix.go-version }}
|
||||||
|
uses: actions/setup-go@v5
|
||||||
|
with:
|
||||||
|
go-version: ${{ matrix.go-version }}
|
||||||
|
cache: false
|
||||||
|
- name: Proto Dependencies
|
||||||
|
run: |
|
||||||
|
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
|
||||||
|
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||||
|
rm protoc.zip
|
||||||
|
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||||
|
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||||
|
PATH="$PATH:$HOME/go/bin" make protogen-go
|
||||||
|
- name: Dependencies
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y build-essential
|
||||||
|
# Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
|
||||||
|
# CPU and runs the token_classify capability spec (byte-offset contract).
|
||||||
|
- name: Run live PII NER backend E2E
|
||||||
|
run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
|
||||||
|
- name: Setup tmate session if tests fail
|
||||||
|
if: ${{ failure() }}
|
||||||
|
uses: mxschmitt/action-tmate@v3.23
|
||||||
|
with:
|
||||||
|
detached: true
|
||||||
|
connect-timeout-seconds: 180
|
||||||
|
limit-access-to-actor: true
|
||||||
2
.github/workflows/tests-ui-e2e.yml
vendored
2
.github/workflows/tests-ui-e2e.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
|||||||
go-version: ['1.26.x']
|
go-version: ['1.26.x']
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
uses: actions/checkout@v6
|
uses: actions/checkout@v7
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
- name: Configure apt mirror on runner
|
- name: Configure apt mirror on runner
|
||||||
|
|||||||
2
.github/workflows/update_swagger.yaml
vendored
2
.github/workflows/update_swagger.yaml
vendored
@@ -10,7 +10,7 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v6
|
- uses: actions/checkout@v7
|
||||||
- name: Configure apt mirror on runner
|
- name: Configure apt mirror on runner
|
||||||
uses: ./.github/actions/configure-apt-mirror
|
uses: ./.github/actions/configure-apt-mirror
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
|
|||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -91,3 +91,6 @@ core/http/react-ui/test-results/
|
|||||||
|
|
||||||
# Local worktrees
|
# Local worktrees
|
||||||
.worktrees/
|
.worktrees/
|
||||||
|
|
||||||
|
# SDD / brainstorm scratch (agent-driven development)
|
||||||
|
.superpowers/
|
||||||
|
|||||||
10
Makefile
10
Makefile
@@ -690,6 +690,16 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
|
|||||||
BACKEND_TEST_CTX_SIZE=2048 \
|
BACKEND_TEST_CTX_SIZE=2048 \
|
||||||
$(MAKE) test-extra-backend
|
$(MAKE) test-extra-backend
|
||||||
|
|
||||||
|
## privacy-filter: the PII/NER token-classification backend. Exercises the
|
||||||
|
## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
|
||||||
|
## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
|
||||||
|
## active params). This is the live-backend coverage for the PII NER tier.
|
||||||
|
test-extra-backend-privacy-filter: docker-build-privacy-filter
|
||||||
|
BACKEND_IMAGE=local-ai-backend:privacy-filter \
|
||||||
|
BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
|
||||||
|
BACKEND_TEST_CAPS=health,load,token_classify \
|
||||||
|
$(MAKE) test-extra-backend
|
||||||
|
|
||||||
## vllm is resolved from a HuggingFace model id (no file download) and
|
## vllm is resolved from a HuggingFace model id (no file download) and
|
||||||
## exercises Predict + streaming + tool-call extraction via the hermes parser.
|
## exercises Predict + streaming + tool-call extraction via the hermes parser.
|
||||||
## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
|
## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
|
||||||
|
|||||||
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
|
|||||||
| Backend | What it does |
|
| Backend | What it does |
|
||||||
|---------|-------------|
|
|---------|-------------|
|
||||||
| [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
|
| [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
|
||||||
|
| [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
|
||||||
| [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
|
| [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
|
||||||
| [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
|
| [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
|
||||||
| [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
|
| [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
|
||||||
@@ -240,6 +241,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
|
|||||||
| [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
|
| [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
|
||||||
| [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
|
| [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
|
||||||
|
|
||||||
|
We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
|
||||||
|
|
||||||
## Resources
|
## Resources
|
||||||
|
|
||||||
- [Documentation](https://localai.io/)
|
- [Documentation](https://localai.io/)
|
||||||
|
|||||||
@@ -65,7 +65,12 @@ RUN <<EOT bash
|
|||||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
|
||||||
|
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||||
|
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
|
||||||
|
# LunarG SDK below only provides the loader and shader tooling, not
|
||||||
|
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
|
||||||
|
# bundle and the packaged backend finds no GPU at runtime.
|
||||||
if [ "amd64" = "$TARGETARCH" ]; then
|
if [ "amd64" = "$TARGETARCH" ]; then
|
||||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||||
|
|||||||
@@ -66,7 +66,12 @@ RUN <<EOT bash
|
|||||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
|
||||||
|
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||||
|
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
|
||||||
|
# LunarG SDK below only provides the loader and shader tooling, not
|
||||||
|
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
|
||||||
|
# bundle and the packaged backend finds no GPU at runtime.
|
||||||
if [ "amd64" = "$TARGETARCH" ]; then
|
if [ "amd64" = "$TARGETARCH" ]; then
|
||||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||||
|
|||||||
@@ -24,6 +24,9 @@ service Backend {
|
|||||||
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
|
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
|
||||||
rpc Status(HealthMessage) returns (StatusResponse) {}
|
rpc Status(HealthMessage) returns (StatusResponse) {}
|
||||||
rpc Detect(DetectOptions) returns (DetectResponse) {}
|
rpc Detect(DetectOptions) returns (DetectResponse) {}
|
||||||
|
// SoundDetection runs an audio-tagging / sound-event-classification model
|
||||||
|
// (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
|
||||||
|
rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
|
||||||
rpc Depth(DepthRequest) returns (DepthResponse) {}
|
rpc Depth(DepthRequest) returns (DepthResponse) {}
|
||||||
rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
|
rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
|
||||||
rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
|
rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
|
||||||
@@ -671,6 +674,24 @@ message DetectResponse {
|
|||||||
repeated Detection Detections = 1;
|
repeated Detection Detections = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Sound-event classification / audio tagging messages (CED) ---
|
||||||
|
|
||||||
|
message SoundDetectionRequest {
|
||||||
|
string src = 1; // audio file path (LocalAI writes the upload to disk)
|
||||||
|
int32 top_k = 2; // number of top tags to return (0 = all classes)
|
||||||
|
float threshold = 3; // optional: drop tags scoring below this
|
||||||
|
}
|
||||||
|
|
||||||
|
message SoundClass {
|
||||||
|
string label = 1; // AudioSet class name, e.g. "Baby cry, infant cry"
|
||||||
|
float score = 2; // per-class probability (multi-label, independent)
|
||||||
|
int32 index = 3; // class index in the model ontology
|
||||||
|
}
|
||||||
|
|
||||||
|
message SoundDetectionResponse {
|
||||||
|
repeated SoundClass detections = 1; // score-descending
|
||||||
|
}
|
||||||
|
|
||||||
// --- Depth estimation messages (Depth Anything 3) ---
|
// --- Depth estimation messages (Depth Anything 3) ---
|
||||||
|
|
||||||
message DepthRequest {
|
message DepthRequest {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
|
IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
|
||||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||||
|
|
||||||
CMAKE_ARGS?=
|
CMAKE_ARGS?=
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
|
LLAMA_VERSION?=73618f27a801c0b8614ceaf3547d3c2a99baae14
|
||||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||||
|
|
||||||
CMAKE_ARGS?=
|
CMAKE_ARGS?=
|
||||||
|
|||||||
@@ -18,6 +18,18 @@
|
|||||||
#if __has_include("server-chat.cpp")
|
#if __has_include("server-chat.cpp")
|
||||||
#include "server-chat.cpp"
|
#include "server-chat.cpp"
|
||||||
#endif
|
#endif
|
||||||
|
// server-schema.cpp exists only in llama.cpp after the upstream refactor that
|
||||||
|
// extracted the JSON request-schema evaluation (previously the static
|
||||||
|
// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
|
||||||
|
// server-context.cpp and grpc-server.cpp both call into it, so its definitions
|
||||||
|
// must be part of this translation unit or the link fails. __has_include keeps
|
||||||
|
// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
|
||||||
|
// predate the split and still expose params_from_json_cmpl (see the guarded
|
||||||
|
// call sites below).
|
||||||
|
#if __has_include("server-schema.cpp")
|
||||||
|
#define LOCALAI_HAS_SERVER_SCHEMA 1
|
||||||
|
#include "server-schema.cpp"
|
||||||
|
#endif
|
||||||
#include "server-context.cpp"
|
#include "server-context.cpp"
|
||||||
|
|
||||||
// LocalAI
|
// LocalAI
|
||||||
@@ -2102,7 +2114,11 @@ public:
|
|||||||
task.index = i;
|
task.index = i;
|
||||||
|
|
||||||
task.tokens = std::move(inputs[i]);
|
task.tokens = std::move(inputs[i]);
|
||||||
|
#ifdef LOCALAI_HAS_SERVER_SCHEMA
|
||||||
|
task.params = server_schema::eval_llama_cmpl_schema(
|
||||||
|
#else
|
||||||
task.params = server_task::params_from_json_cmpl(
|
task.params = server_task::params_from_json_cmpl(
|
||||||
|
#endif
|
||||||
ctx_server.impl->vocab,
|
ctx_server.impl->vocab,
|
||||||
params_base,
|
params_base,
|
||||||
ctx_server.get_meta().slot_n_ctx,
|
ctx_server.get_meta().slot_n_ctx,
|
||||||
@@ -2116,7 +2132,7 @@ public:
|
|||||||
// cannot detect tool calls or separate reasoning from content.
|
// cannot detect tool calls or separate reasoning from content.
|
||||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||||
task.params.oaicompat_cmpl_id = completion_id;
|
task.params.oaicompat_cmpl_id = completion_id;
|
||||||
// oaicompat_model is already populated by params_from_json_cmpl
|
// oaicompat_model is already populated by eval_llama_cmpl_schema
|
||||||
|
|
||||||
tasks.push_back(std::move(task));
|
tasks.push_back(std::move(task));
|
||||||
}
|
}
|
||||||
@@ -2940,7 +2956,11 @@ public:
|
|||||||
task.index = i;
|
task.index = i;
|
||||||
|
|
||||||
task.tokens = std::move(inputs[i]);
|
task.tokens = std::move(inputs[i]);
|
||||||
|
#ifdef LOCALAI_HAS_SERVER_SCHEMA
|
||||||
|
task.params = server_schema::eval_llama_cmpl_schema(
|
||||||
|
#else
|
||||||
task.params = server_task::params_from_json_cmpl(
|
task.params = server_task::params_from_json_cmpl(
|
||||||
|
#endif
|
||||||
ctx_server.impl->vocab,
|
ctx_server.impl->vocab,
|
||||||
params_base,
|
params_base,
|
||||||
ctx_server.get_meta().slot_n_ctx,
|
ctx_server.get_meta().slot_n_ctx,
|
||||||
@@ -2952,7 +2972,7 @@ public:
|
|||||||
// reasoning, tool calls, and content are classified into ChatDeltas.
|
// reasoning, tool calls, and content are classified into ChatDeltas.
|
||||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||||
task.params.oaicompat_cmpl_id = completion_id;
|
task.params.oaicompat_cmpl_id = completion_id;
|
||||||
// oaicompat_model is already populated by params_from_json_cmpl
|
// oaicompat_model is already populated by eval_llama_cmpl_schema
|
||||||
|
|
||||||
tasks.push_back(std::move(task));
|
tasks.push_back(std::move(task));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
# Local development: point at a working checkout instead of cloning, e.g.
|
# Local development: point at a working checkout instead of cloning, e.g.
|
||||||
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
||||||
|
|
||||||
PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
|
PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
|
||||||
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
||||||
PRIVACY_FILTER_SRC?=
|
PRIVACY_FILTER_SRC?=
|
||||||
|
|
||||||
|
|||||||
11
backend/go/ced/.gitignore
vendored
Normal file
11
backend/go/ced/.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
.cache/
|
||||||
|
sources/
|
||||||
|
build/
|
||||||
|
package/
|
||||||
|
ced-grpc
|
||||||
|
# build artifacts staged in-tree by the Makefile (cp from sources/) or
|
||||||
|
# symlinked for local dev; the real sources live in ced.cpp upstream.
|
||||||
|
*.so
|
||||||
|
*.so.*
|
||||||
|
ced_capi.h
|
||||||
|
compile_commands.json
|
||||||
77
backend/go/ced/Makefile
Normal file
77
backend/go/ced/Makefile
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# Build rules for the ced sound-classification backend.
#
# The upstream commit is pinned as CED_VERSION?=<sha> further down; that exact
# spelling is what .github/bump_deps.sh searches for when bumping the pin
# (same convention as parakeet-cpp / whisper.cpp).
#
# Local development shortcut - skip the clone and cmake steps by symlinking an
# out-of-tree ced.cpp shared build and its header, then building the Go binary:
#   ln -sf /path/to/ced.cpp/build-shared/libced.so .
#   ln -sf /path/to/ced.cpp/include/ced_capi.h .
#   go build -o ced-grpc .

CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
CED_REPO?=https://github.com/mudler/ced.cpp

GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)

BUILD_TYPE?=
NATIVE?=false

# ggml is linked statically (built as PIC) into libced.so, so the shared
# library is self-contained: no libggml*.so has to sit next to it at dlopen
# time, only the system libraries the runtime image already ships.
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON

ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif

# ced.cpp hides its ggml backends behind CED_GGML_* options
# (set(... CACHE BOOL "" FORCE)), so those are forwarded rather than a bare
# -DGGML_CUDA=ON.
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DCED_GGML_HIP=ON
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
endif

.PHONY: ced-grpc package build clean purge test all

all: ced-grpc

# Shallow-fetch exactly the pinned commit (plus submodules) into sources/.
sources/ced.cpp:
	mkdir -p sources/ced.cpp
	cd sources/ced.cpp && \
	git init -q && \
	git remote add origin $(CED_REPO) && \
	git fetch --depth 1 origin $(CED_VERSION) && \
	git checkout FETCH_HEAD && \
	git submodule update --init --recursive --depth 1 --single-branch

# Configure and build the shared library, then stage it and the C header
# next to the Go sources.
libced.so: sources/ced.cpp
	cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
	cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
	cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
	cp -fv sources/ced.cpp/include/ced_capi.h ./

# The Go binary is built without cgo; libced.so is loaded at runtime.
ced-grpc: libced.so main.go goced.go
	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .

package: ced-grpc
	bash package.sh

build: package

test:
	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1

clean: purge
	rm -rf libced.so* ced_capi.h package ced-grpc

purge:
	rm -rf sources/ced.cpp
|
||||||
130
backend/go/ced/goced.go
Normal file
130
backend/go/ced/goced.go
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
// Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
|
||||||
|
// SoundDetection implementation.
|
||||||
|
//
|
||||||
|
// SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
|
||||||
|
// `make protogen-go`). The C side is single-threaded per ctx, so we guard the
|
||||||
|
// engine with engineMu; LocalAI also serializes via base.SingleThread.
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
"sync"
|
||||||
|
"unsafe"
|
||||||
|
|
||||||
|
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||||
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Function variables that purego binds to the flat C API exported by
// libced.so. Each Go name corresponds to one symbol declared in ced_capi.h.
// Functions whose C return type is char* are typed uintptr so the raw pointer
// can be handed back to CppFreeString after its contents are copied (see cstr).
var (
	// CppAbiVersion is bound to ced_capi_abi_version.
	CppAbiVersion func() int32

	// CppLoad is bound to ced_capi_load; a zero result is treated as failure
	// by Load.
	CppLoad func(ggufPath string) uintptr

	// CppFree is bound to ced_capi_free.
	CppFree func(ctx uintptr)

	// CppLastError is bound to ced_capi_last_error.
	CppLastError func(ctx uintptr) string

	// CppNumClasses is bound to ced_capi_num_classes.
	CppNumClasses func(ctx uintptr) int32

	// CppSampleRate is bound to ced_capi_sample_rate.
	CppSampleRate func(ctx uintptr) int32

	// CppClassifyPathJSON is bound to ced_capi_classify_path_json; the
	// returned pointer is converted and released with cstr.
	CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr

	// CppClassifyPcmJSON is bound to ced_capi_classify_pcm_json.
	CppClassifyPcmJSON func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr

	// CppFreeString is bound to ced_capi_free_string.
	CppFreeString func(s uintptr)
)
|
||||||
|
|
||||||
|
// cstr copies a malloc'd C string (returned as uintptr) into a Go string and
|
||||||
|
// frees the original via ced_capi_free_string. Empty/0 -> "".
|
||||||
|
func cstr(p uintptr) string {
|
||||||
|
if p == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
defer CppFreeString(p)
|
||||||
|
var b []byte
|
||||||
|
for i := 0; ; i++ {
|
||||||
|
ch := *(*byte)(unsafe.Pointer(p + uintptr(i))) //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
|
||||||
|
if ch == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
b = append(b, ch)
|
||||||
|
}
|
||||||
|
return string(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ced is the gRPC backend. One loaded CED model per instance.
|
||||||
|
type Ced struct {
|
||||||
|
base.Base
|
||||||
|
ctxPtr uintptr
|
||||||
|
engineMu sync.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load resolves the GGUF and opens the C-API context.
|
||||||
|
func (c *Ced) Load(opts *pb.ModelOptions) error {
|
||||||
|
if opts.ModelFile == "" {
|
||||||
|
return errors.New("ced: ModelFile is required")
|
||||||
|
}
|
||||||
|
ctx := CppLoad(opts.ModelFile)
|
||||||
|
if ctx == 0 {
|
||||||
|
return fmt.Errorf("ced: ced_capi_load failed for %q: %s", opts.ModelFile, CppLastError(0))
|
||||||
|
}
|
||||||
|
c.ctxPtr = ctx
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// jsonTag is the Go shape of one element of the JSON array produced by the
// ced_capi classify functions.
type jsonTag struct {
	// Index is decoded from the "index" key.
	Index int `json:"index"`
	// Score is decoded from the "score" key.
	Score float32 `json:"score"`
	// Label is decoded from the "label" key.
	Label string `json:"label"`
}
|
||||||
|
|
||||||
|
// SoundDetection classifies the clip at req.Src and returns scored AudioSet tags.
|
||||||
|
func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
|
||||||
|
if c.ctxPtr == 0 {
|
||||||
|
return nil, errors.New("ced: model not loaded")
|
||||||
|
}
|
||||||
|
if req.GetSrc() == "" {
|
||||||
|
return nil, errors.New("ced: SoundDetectionRequest.src (audio path) is required")
|
||||||
|
}
|
||||||
|
topK := req.GetTopK()
|
||||||
|
if topK <= 0 {
|
||||||
|
topK = 10 // sensible default for a tagging response
|
||||||
|
}
|
||||||
|
|
||||||
|
c.engineMu.Lock()
|
||||||
|
out := cstr(CppClassifyPathJSON(c.ctxPtr, req.GetSrc(), topK))
|
||||||
|
lastErr := CppLastError(c.ctxPtr)
|
||||||
|
c.engineMu.Unlock()
|
||||||
|
|
||||||
|
if out == "" {
|
||||||
|
return nil, fmt.Errorf("ced: classification failed: %s", lastErr)
|
||||||
|
}
|
||||||
|
var tags []jsonTag
|
||||||
|
if err := json.Unmarshal([]byte(out), &tags); err != nil {
|
||||||
|
return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
thr := req.GetThreshold()
|
||||||
|
resp := &pb.SoundDetectionResponse{}
|
||||||
|
for _, t := range tags {
|
||||||
|
if t.Score < thr {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
resp.Detections = append(resp.Detections, &pb.SoundClass{
|
||||||
|
Label: t.Label, Score: t.Score, Index: int32(t.Index),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(resp.Detections, func(i, j int) bool {
|
||||||
|
return resp.Detections[i].Score > resp.Detections[j].Score
|
||||||
|
})
|
||||||
|
return resp, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Ced) Free() error {
|
||||||
|
c.engineMu.Lock()
|
||||||
|
defer c.engineMu.Unlock()
|
||||||
|
if c.ctxPtr != 0 {
|
||||||
|
CppFree(c.ctxPtr)
|
||||||
|
c.ctxPtr = 0
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
59
backend/go/ced/main.go
Normal file
59
backend/go/ced/main.go
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
// ced sound-classification backend. Started internally by LocalAI: one gRPC
|
||||||
|
// server per loaded model. Loads libced.so via purego and registers the flat
|
||||||
|
// C-API declared in ced_capi.h. The library name can be overridden with
|
||||||
|
// CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default is the
|
||||||
|
// bare name libced.so, which dlopen resolves via LD_LIBRARY_PATH (set by run.sh).
|
||||||
|
//
|
||||||
|
// SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
|
||||||
|
// addition, and a built libced.so (see Makefile). See DESIGN.md.
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/ebitengine/purego"
|
||||||
|
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||||
|
)
|
||||||
|
|
||||||
|
// addr is the -addr command-line flag; main passes its value to
// grpc.StartServer.
var addr = flag.String("addr", "localhost:50051", "the address to connect to")

// libFunc pairs a Go function variable with the C symbol it is bound to.
type libFunc struct {
	// ptr is a pointer to the Go function variable to fill in.
	ptr any
	// name is the exported symbol name looked up in the shared library.
	name string
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
libName := os.Getenv("CED_LIBRARY")
|
||||||
|
if libName == "" {
|
||||||
|
libName = "libced.so"
|
||||||
|
}
|
||||||
|
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bound 1:1 to ced_capi.h. char*-returning functions are declared uintptr
|
||||||
|
// so we can free the same pointer with ced_capi_free_string after copying
|
||||||
|
// (purego's string return would copy and leak the original).
|
||||||
|
for _, lf := range []libFunc{
|
||||||
|
{&CppAbiVersion, "ced_capi_abi_version"},
|
||||||
|
{&CppLoad, "ced_capi_load"},
|
||||||
|
{&CppFree, "ced_capi_free"},
|
||||||
|
{&CppLastError, "ced_capi_last_error"},
|
||||||
|
{&CppNumClasses, "ced_capi_num_classes"},
|
||||||
|
{&CppSampleRate, "ced_capi_sample_rate"},
|
||||||
|
{&CppClassifyPathJSON, "ced_capi_classify_path_json"},
|
||||||
|
{&CppClassifyPcmJSON, "ced_capi_classify_pcm_json"},
|
||||||
|
{&CppFreeString, "ced_capi_free_string"},
|
||||||
|
} {
|
||||||
|
purego.RegisterLibFunc(lf.ptr, lib, lf.name)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
|
||||||
|
flag.Parse()
|
||||||
|
if err := grpc.StartServer(*addr, &Ced{}); err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
60
backend/go/ced/package.sh
Executable file
60
backend/go/ced/package.sh
Executable file
@@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/bash
#
# Assemble a self-contained package/ directory for the ced backend: the
# ced-grpc binary, run.sh, libced.so, the core runtime libraries
# (libc/libstdc++/libgomp and friends plus the dynamic loader as lib/ld.so)
# and the GPU runtime for the active BUILD_TYPE. Mirrors
# backend/go/parakeet-cpp/package.sh; run.sh starts the (CGO_ENABLED=0)
# binary through lib/ld.so so the packaged libc is used.

set -e

CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

# copy_runtime_libs LOADER LIBDIR
# Copies the dynamic loader LOADER to lib/ld.so and the core runtime
# libraries from LIBDIR into lib/, dereferencing symlinks.
copy_runtime_libs() {
    local loader="$1"
    local libdir="$2"
    local lib

    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 \
               libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
        cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
    done
}

mkdir -p "$CURDIR/package/lib"

cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

# libced.so plus any versioned siblings; a missing library aborts packaging.
cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || {
    echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
    exit 1
}

# The architecture is inferred from which dynamic loader exists on the host.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_runtime_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_runtime_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
elif [ "$(uname -s)" = "Darwin" ]; then
    # Nothing is bundled on macOS.
    echo "Detected Darwin"
else
    echo "Error: Could not detect architecture"
    exit 1
fi

# Optional: bundle GPU runtime libraries when the shared helper is present.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
||||||
15
backend/go/ced/run.sh
Executable file
15
backend/go/ced/run.sh
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
#
# Launcher for the ced backend: puts the packaged library directories on the
# loader search path and execs ced-grpc, forwarding all arguments.
set -e

CURDIR=$(dirname "$(realpath "$0")")

# Prepend the packaged library directories. The inherited value is appended
# only when it is non-empty: a trailing ':' would add an empty entry, which
# the dynamic loader interprets as the current working directory.
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# If a self-contained ld.so was packaged, route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the sibling backends).
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    exec "$CURDIR/lib/ld.so" "$CURDIR/ced-grpc" "$@"
fi

exec "$CURDIR/ced-grpc" "$@"
|
||||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# CrispASR version (release tag)
|
# CrispASR version (release tag)
|
||||||
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
|
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
|
||||||
CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
|
CRISPASR_VERSION?=63b57289255267edf66e43e33bc3911e04a2e92d
|
||||||
SO_TARGET?=libgocrispasr.so
|
SO_TARGET?=libgocrispasr.so
|
||||||
|
|
||||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||||
@@ -67,7 +67,7 @@ sources/CrispASR:
|
|||||||
# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
|
# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
|
||||||
# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
|
# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
|
||||||
# which is correct both standalone and as a subproject. Idempotent.
|
# which is correct both standalone and as a subproject. Idempotent.
|
||||||
sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
|
sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak
|
||||||
|
|
||||||
# Detect OS
|
# Detect OS
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
|
|||||||
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
|
|||||||
g_abort.store(v, std::memory_order_relaxed);
|
g_abort.store(v, std::memory_order_relaxed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- word-level timestamp accessors ---
|
||||||
|
extern "C" {
|
||||||
|
int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
|
||||||
|
const char *crispasr_session_result_word_text(crispasr_session_result *r,
|
||||||
|
int seg_i, int word_i);
|
||||||
|
int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
|
||||||
|
int word_i);
|
||||||
|
int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
|
||||||
|
int word_i);
|
||||||
|
|
||||||
|
// Parakeet-specific word accessors
|
||||||
|
int crispasr_parakeet_result_n_words(void *r);
|
||||||
|
const char *crispasr_parakeet_result_word_text(void *r, int word_i);
|
||||||
|
int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
|
||||||
|
int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
void *get_result(void) { return g_result; }
|
||||||
|
|
||||||
|
int get_word_count(int seg_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_session_result_n_words(g_result, seg_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *get_word_text(int seg_i, int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return "";
|
||||||
|
return crispasr_session_result_word_text(g_result, seg_i, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_word_t0(int seg_i, int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_session_result_word_t0(g_result, seg_i, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_word_t1(int seg_i, int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_session_result_word_t1(g_result, seg_i, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parakeet-specific word accessors
|
||||||
|
int get_parakeet_word_count(void) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_parakeet_result_n_words(g_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *get_parakeet_word_text(int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return "";
|
||||||
|
return crispasr_parakeet_result_word_text(g_result, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_parakeet_word_t0(int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_parakeet_result_word_t0(g_result, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_parakeet_word_t1(int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_parakeet_result_word_t1(g_result, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
||||||
void *data) {
|
void *data) {
|
||||||
const char *level_str;
|
const char *level_str;
|
||||||
|
|||||||
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
|
|||||||
void tts_free(float *pcm);
|
void tts_free(float *pcm);
|
||||||
int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
|
int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
|
||||||
int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
|
int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
|
||||||
|
|
||||||
|
// --- word-level timestamp accessors ---
|
||||||
|
// Session-based (works for whisper-like backends)
|
||||||
|
void *get_result(void);
|
||||||
|
int get_word_count(int seg_i);
|
||||||
|
const char *get_word_text(int seg_i, int word_i);
|
||||||
|
int64_t get_word_t0(int seg_i, int word_i);
|
||||||
|
int64_t get_word_t1(int seg_i, int word_i);
|
||||||
|
|
||||||
|
// Parakeet-specific (global word list, no segment index)
|
||||||
|
int get_parakeet_word_count(void);
|
||||||
|
const char *get_parakeet_word_text(int word_i);
|
||||||
|
int64_t get_parakeet_word_t0(int word_i);
|
||||||
|
int64_t get_parakeet_word_t1(int word_i);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,6 +34,18 @@ var (
|
|||||||
CppTTSFree func(ptr uintptr)
|
CppTTSFree func(ptr uintptr)
|
||||||
CppTTSSetVoice func(name string) int
|
CppTTSSetVoice func(name string) int
|
||||||
CppTTSSetVoiceFile func(path string, refText string) int
|
CppTTSSetVoiceFile func(path string, refText string) int
|
||||||
|
|
||||||
|
// Word-level timestamp accessors (session-based, per-segment)
|
||||||
|
CppGetWordCount func(segI int) int
|
||||||
|
CppGetWordText func(segI int, wordI int) string
|
||||||
|
CppGetWordT0 func(segI int, wordI int) int64
|
||||||
|
CppGetWordT1 func(segI int, wordI int) int64
|
||||||
|
|
||||||
|
// Parakeet-specific word accessors (global, no segment index)
|
||||||
|
CppGetParakeetWordCount func() int
|
||||||
|
CppGetParakeetWordText func(wordI int) string
|
||||||
|
CppGetParakeetWordT0 func(wordI int) int64
|
||||||
|
CppGetParakeetWordT1 func(wordI int) int64
|
||||||
)
|
)
|
||||||
|
|
||||||
type CrispASR struct {
|
type CrispASR struct {
|
||||||
@@ -212,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
|
|||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isValidWord reports whether a TranscriptWord contains recognisable speech
|
||||||
|
// content. The parakeet-specific word accessors can return stale initialisation
|
||||||
|
// data (model name, binary blobs) when a segment has no real speech. A word is
|
||||||
|
// considered valid only when:
|
||||||
|
// - the text is non-empty after trimming,
|
||||||
|
// - it contains no U+FFFD replacement characters (from binary data scrubbing),
|
||||||
|
// - both timestamps are non-negative,
|
||||||
|
// - the word has positive duration (end > start).
|
||||||
|
func isValidWord(w *pb.TranscriptWord) bool {
|
||||||
|
txt := strings.TrimSpace(w.Text)
|
||||||
|
if txt == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if strings.ContainsRune(txt, '\uFFFD') {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if w.Start < 0 || w.End < 0 || w.End <= w.Start {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
|
func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
|
return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
|
||||||
@@ -290,15 +324,54 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
|
|||||||
// IDs, so Tokens is left empty.
|
// IDs, so Tokens is left empty.
|
||||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
||||||
|
|
||||||
|
// Populate word-level timestamps. Try session-based functions first
|
||||||
|
// (per-segment); fall back to parakeet-specific functions (global word
|
||||||
|
// list with no segment index — only populated on the first segment to
|
||||||
|
// avoid duplication).
|
||||||
|
words := []*pb.TranscriptWord{}
|
||||||
|
wordCount := CppGetWordCount(i)
|
||||||
|
if wordCount == 0 && i == 0 {
|
||||||
|
wordCount = CppGetParakeetWordCount()
|
||||||
|
for j := 0; j < wordCount; j++ {
|
||||||
|
w := &pb.TranscriptWord{
|
||||||
|
Start: CppGetParakeetWordT0(j) * (10000000),
|
||||||
|
End: CppGetParakeetWordT1(j) * (10000000),
|
||||||
|
Text: strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "<22>"),
|
||||||
|
}
|
||||||
|
if isValidWord(w) {
|
||||||
|
words = append(words, w)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for j := 0; j < wordCount; j++ {
|
||||||
|
w := &pb.TranscriptWord{
|
||||||
|
Start: CppGetWordT0(i, j) * (10000000),
|
||||||
|
End: CppGetWordT1(i, j) * (10000000),
|
||||||
|
Text: strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "<22>"),
|
||||||
|
}
|
||||||
|
if isValidWord(w) {
|
||||||
|
words = append(words, w)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip empty segments with no recognisable content (e.g. trailing
|
||||||
|
// silence segments that parakeet emits with stale init data).
|
||||||
|
trimmed := strings.TrimSpace(txt)
|
||||||
|
if trimmed == "" && len(words) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
segment := &pb.TranscriptSegment{
|
segment := &pb.TranscriptSegment{
|
||||||
Id: int32(i),
|
Id: int32(i),
|
||||||
Text: txt,
|
Text: txt,
|
||||||
Start: s, End: t,
|
Start: s, End: t,
|
||||||
|
Words: words,
|
||||||
}
|
}
|
||||||
|
|
||||||
segments = append(segments, segment)
|
segments = append(segments, segment)
|
||||||
|
|
||||||
text += " " + strings.TrimSpace(txt)
|
text += " " + trimmed
|
||||||
}
|
}
|
||||||
|
|
||||||
return pb.TranscriptResult{
|
return pb.TranscriptResult{
|
||||||
@@ -390,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
|
|||||||
s := CppGetSegmentStart(i) * 10000000
|
s := CppGetSegmentStart(i) * 10000000
|
||||||
t := CppGetSegmentEnd(i) * 10000000
|
t := CppGetSegmentEnd(i) * 10000000
|
||||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
||||||
|
|
||||||
|
// Skip empty segments (e.g. trailing silence that parakeet emits
|
||||||
|
// with stale init data).
|
||||||
|
trimmed := strings.TrimSpace(txt)
|
||||||
|
if trimmed == "" && s == t {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
segments = append(segments, &pb.TranscriptSegment{
|
segments = append(segments, &pb.TranscriptSegment{
|
||||||
Id: int32(i),
|
Id: int32(i),
|
||||||
Text: txt,
|
Text: txt,
|
||||||
Start: s, End: t,
|
Start: s, End: t,
|
||||||
})
|
})
|
||||||
|
|
||||||
trimmed := strings.TrimSpace(txt)
|
|
||||||
if trimmed == "" {
|
if trimmed == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -44,6 +44,14 @@ func main() {
|
|||||||
{&CppTTSFree, "tts_free"},
|
{&CppTTSFree, "tts_free"},
|
||||||
{&CppTTSSetVoice, "tts_set_voice"},
|
{&CppTTSSetVoice, "tts_set_voice"},
|
||||||
{&CppTTSSetVoiceFile, "tts_set_voice_file"},
|
{&CppTTSSetVoiceFile, "tts_set_voice_file"},
|
||||||
|
{&CppGetWordCount, "get_word_count"},
|
||||||
|
{&CppGetWordText, "get_word_text"},
|
||||||
|
{&CppGetWordT0, "get_word_t0"},
|
||||||
|
{&CppGetWordT1, "get_word_t1"},
|
||||||
|
{&CppGetParakeetWordCount, "get_parakeet_word_count"},
|
||||||
|
{&CppGetParakeetWordText, "get_parakeet_word_text"},
|
||||||
|
{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
|
||||||
|
{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, lf := range libFuncs {
|
for _, lf := range libFuncs {
|
||||||
|
|||||||
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# depth-anything.cpp. Pin to a specific commit for a stable build; a squash
|
# depth-anything.cpp. Pin to a specific commit for a stable build; a squash
|
||||||
# merge upstream can orphan a branch, so the native version is pinned by SHA.
|
# merge upstream can orphan a branch, so the native version is pinned by SHA.
|
||||||
# This SHA adds the nested two-file metric C-API (abi_version 4,
|
# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
|
||||||
# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
|
# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
|
||||||
# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
|
# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
|
||||||
|
# It is kept alive by the upstream tag da2-support (survives a squash-merge);
|
||||||
|
# repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
|
||||||
DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
|
DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
|
||||||
DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
|
DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118
|
||||||
|
|
||||||
ifeq ($(NATIVE),false)
|
ifeq ($(NATIVE),false)
|
||||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# omnivoice.cpp version
|
# omnivoice.cpp version
|
||||||
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
||||||
OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
|
OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
|
||||||
SO_TARGET?=libgomnivoicecpp.so
|
SO_TARGET?=libgomnivoicecpp.so
|
||||||
|
|
||||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# parakeet-cpp backend Makefile.
|
# parakeet-cpp backend Makefile.
|
||||||
#
|
#
|
||||||
# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
|
# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
|
||||||
# (.github/bump_deps.sh) can find and update it - matches the
|
# (.github/bump_deps.sh) can find and update it - matches the
|
||||||
# whisper.cpp / ds4 / vibevoice-cpp convention.
|
# whisper.cpp / ds4 / vibevoice-cpp convention.
|
||||||
#
|
#
|
||||||
@@ -15,7 +15,7 @@
|
|||||||
# That's what the L0 smoke test uses. The default target below does the
|
# That's what the L0 smoke test uses. The default target below does the
|
||||||
# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
|
# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
|
||||||
|
|
||||||
PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
|
PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
|
||||||
PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
|
PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
|
||||||
|
|
||||||
GOCMD?=go
|
GOCMD?=go
|
||||||
|
|||||||
@@ -1,23 +1,68 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
|
# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
|
||||||
# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
|
# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
|
||||||
# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
|
# BUILD_TYPE so the package is self-contained. Mirrors
|
||||||
|
# backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
|
||||||
|
# through lib/ld.so so the packaged libc is used instead of the host's.
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
CURDIR=$(dirname "$(realpath "$0")")
|
CURDIR=$(dirname "$(realpath "$0")")
|
||||||
|
REPO_ROOT="${CURDIR}/../../.."
|
||||||
|
|
||||||
mkdir -p "$CURDIR/package/lib"
|
mkdir -p "$CURDIR/package/lib"
|
||||||
|
|
||||||
cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
|
cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
|
||||||
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
|
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
|
||||||
|
|
||||||
# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
|
# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
|
||||||
|
# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
|
||||||
cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
|
cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
|
||||||
echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
|
echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "L0 package layout (full ldd walk lands in L3):"
|
# Detect architecture and copy the core runtime libs libparakeet.so links
|
||||||
|
# against, plus the matching dynamic loader as lib/ld.so.
|
||||||
|
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||||
|
echo "Detected x86_64 architecture, copying x86_64 libraries..."
|
||||||
|
cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
|
||||||
|
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
|
||||||
|
echo "Detected ARM64 architecture, copying ARM64 libraries..."
|
||||||
|
cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
|
||||||
|
elif [ "$(uname -s)" = "Darwin" ]; then
|
||||||
|
echo "Detected Darwin"
|
||||||
|
else
|
||||||
|
echo "Error: Could not detect architecture"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
|
||||||
|
# based on BUILD_TYPE so the backend can reach the GPU without the runtime
|
||||||
|
# base image shipping those drivers.
|
||||||
|
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
|
||||||
|
if [ -f "$GPU_LIB_SCRIPT" ]; then
|
||||||
|
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
|
||||||
|
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
|
||||||
|
package_gpu_libs
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Packaging completed successfully"
|
||||||
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# qwentts.cpp version
|
# qwentts.cpp version
|
||||||
QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
|
QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
|
||||||
QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
|
QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
|
||||||
SO_TARGET?=libgoqwen3ttscpp.so
|
SO_TARGET?=libgoqwen3ttscpp.so
|
||||||
|
|
||||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# stablediffusion.cpp (ggml)
|
# stablediffusion.cpp (ggml)
|
||||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||||
STABLEDIFFUSION_GGML_VERSION?=7f0e728b7d42f2490dfa5dd9539082d904f2f6b2
|
STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f
|
||||||
|
|
||||||
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# whisper.cpp version
|
# whisper.cpp version
|
||||||
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
|
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
|
||||||
WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84
|
WHISPER_CPP_VERSION?=bae6bc02b1940bbfb87b6a0299c565e563b916d1
|
||||||
SO_TARGET?=libgowhisper.so
|
SO_TARGET?=libgowhisper.so
|
||||||
|
|
||||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||||
|
|||||||
@@ -178,6 +178,37 @@
|
|||||||
nvidia-cuda-12: "cuda12-parakeet-cpp"
|
nvidia-cuda-12: "cuda12-parakeet-cpp"
|
||||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
|
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
|
||||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
|
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
|
||||||
|
- &ced
|
||||||
|
name: "ced"
|
||||||
|
alias: "ced"
|
||||||
|
license: mit
|
||||||
|
icon: https://avatars.githubusercontent.com/u/95302084
|
||||||
|
description: |
|
||||||
|
CED sound-event classification / audio tagging (527-class AudioSet).
|
||||||
|
ced.cpp is a C++/ggml port that performs audio tagging over the AudioSet
|
||||||
|
taxonomy, exposed through the SoundDetection gRPC rpc and the
|
||||||
|
/v1/audio/classification REST endpoint. It runs on CPU, NVIDIA CUDA,
|
||||||
|
AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
|
||||||
|
urls:
|
||||||
|
- https://github.com/mudler/ced.cpp
|
||||||
|
tags:
|
||||||
|
- audio-classification
|
||||||
|
- CPU
|
||||||
|
- GPU
|
||||||
|
- CUDA
|
||||||
|
- HIP
|
||||||
|
capabilities:
|
||||||
|
default: "cpu-ced"
|
||||||
|
nvidia: "cuda12-ced"
|
||||||
|
intel: "intel-sycl-f16-ced"
|
||||||
|
metal: "metal-ced"
|
||||||
|
amd: "rocm-ced"
|
||||||
|
vulkan: "vulkan-ced"
|
||||||
|
nvidia-l4t: "nvidia-l4t-arm64-ced"
|
||||||
|
nvidia-cuda-13: "cuda13-ced"
|
||||||
|
nvidia-cuda-12: "cuda12-ced"
|
||||||
|
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
|
||||||
|
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
|
||||||
- &voxtral
|
- &voxtral
|
||||||
name: "voxtral"
|
name: "voxtral"
|
||||||
alias: "voxtral"
|
alias: "voxtral"
|
||||||
@@ -2650,6 +2681,121 @@
|
|||||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
|
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
|
||||||
mirrors:
|
mirrors:
|
||||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
|
- localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
|
||||||
|
## ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "ced-development"
|
||||||
|
capabilities:
|
||||||
|
default: "cpu-ced-development"
|
||||||
|
nvidia: "cuda12-ced-development"
|
||||||
|
intel: "intel-sycl-f16-ced-development"
|
||||||
|
metal: "metal-ced-development"
|
||||||
|
amd: "rocm-ced-development"
|
||||||
|
vulkan: "vulkan-ced-development"
|
||||||
|
nvidia-l4t: "nvidia-l4t-arm64-ced-development"
|
||||||
|
nvidia-cuda-13: "cuda13-ced-development"
|
||||||
|
nvidia-cuda-12: "cuda12-ced-development"
|
||||||
|
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced-development"
|
||||||
|
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced-development"
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "nvidia-l4t-arm64-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-nvidia-l4t-arm64-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "nvidia-l4t-arm64-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-nvidia-l4t-arm64-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "cuda13-nvidia-l4t-arm64-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "cuda13-nvidia-l4t-arm64-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "cpu-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-cpu-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "cpu-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-cpu-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "metal-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-metal-darwin-arm64-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "metal-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-metal-darwin-arm64-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "cuda12-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-gpu-nvidia-cuda-12-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "cuda12-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-gpu-nvidia-cuda-12-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "rocm-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-gpu-rocm-hipblas-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "rocm-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-gpu-rocm-hipblas-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "intel-sycl-f32-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-gpu-intel-sycl-f32-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "intel-sycl-f32-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-gpu-intel-sycl-f32-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "intel-sycl-f16-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-gpu-intel-sycl-f16-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "intel-sycl-f16-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-gpu-intel-sycl-f16-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "vulkan-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-gpu-vulkan-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "vulkan-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-gpu-vulkan-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "cuda13-ced"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:latest-gpu-nvidia-cuda-13-ced
|
||||||
|
- !!merge <<: *ced
|
||||||
|
name: "cuda13-ced-development"
|
||||||
|
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
|
||||||
|
mirrors:
|
||||||
|
- localai/localai-backends:master-gpu-nvidia-cuda-13-ced
|
||||||
## stablediffusion-ggml
|
## stablediffusion-ggml
|
||||||
- !!merge <<: *stablediffusionggml
|
- !!merge <<: *stablediffusionggml
|
||||||
name: "cpu-stablediffusion-ggml"
|
name: "cpu-stablediffusion-ggml"
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||||
git+https://github.com/huggingface/diffusers
|
diffusers==0.38.0
|
||||||
opencv-python
|
opencv-python
|
||||||
transformers
|
transformers==4.57.6
|
||||||
torchvision==0.22.1
|
torchvision==0.22.1
|
||||||
accelerate
|
accelerate
|
||||||
git+https://github.com/xhinker/sd_embed
|
git+https://github.com/xhinker/sd_embed
|
||||||
@@ -10,9 +10,15 @@ sentencepiece
|
|||||||
torch==2.7.1
|
torch==2.7.1
|
||||||
optimum-quanto
|
optimum-quanto
|
||||||
ftfy
|
ftfy
|
||||||
# TODO: re-add compel once it supports transformers >= 5.
|
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||||
# https://github.com/damian0815/compel/issues/128
|
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
# last known-good released pair so builds are reproducible and can't drift into
|
||||||
|
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||||
|
#
|
||||||
|
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||||
|
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||||
|
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||||
|
# the import succeeding, so dropping it here is safe.
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
--extra-index-url https://download.pytorch.org/whl/cu121
|
--extra-index-url https://download.pytorch.org/whl/cu121
|
||||||
git+https://github.com/huggingface/diffusers
|
diffusers==0.38.0
|
||||||
opencv-python
|
opencv-python
|
||||||
transformers
|
transformers==4.57.6
|
||||||
torchvision
|
torchvision
|
||||||
accelerate
|
accelerate
|
||||||
git+https://github.com/xhinker/sd_embed
|
git+https://github.com/xhinker/sd_embed
|
||||||
@@ -10,9 +10,15 @@ sentencepiece
|
|||||||
torch
|
torch
|
||||||
ftfy
|
ftfy
|
||||||
optimum-quanto
|
optimum-quanto
|
||||||
# TODO: re-add compel once it supports transformers >= 5.
|
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||||
# https://github.com/damian0815/compel/issues/128
|
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
# last known-good released pair so builds are reproducible and can't drift into
|
||||||
|
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||||
|
#
|
||||||
|
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||||
|
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||||
|
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||||
|
# the import succeeding, so dropping it here is safe.
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
--extra-index-url https://download.pytorch.org/whl/cu130
|
--extra-index-url https://download.pytorch.org/whl/cu130
|
||||||
git+https://github.com/huggingface/diffusers
|
diffusers==0.38.0
|
||||||
opencv-python
|
opencv-python
|
||||||
transformers
|
transformers==4.57.6
|
||||||
torchvision
|
torchvision
|
||||||
accelerate
|
accelerate
|
||||||
git+https://github.com/xhinker/sd_embed
|
git+https://github.com/xhinker/sd_embed
|
||||||
@@ -10,9 +10,15 @@ sentencepiece
|
|||||||
torch
|
torch
|
||||||
ftfy
|
ftfy
|
||||||
optimum-quanto
|
optimum-quanto
|
||||||
# TODO: re-add compel once it supports transformers >= 5.
|
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||||
# https://github.com/damian0815/compel/issues/128
|
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
# last known-good released pair so builds are reproducible and can't drift into
|
||||||
|
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||||
|
#
|
||||||
|
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||||
|
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||||
|
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||||
|
# the import succeeding, so dropping it here is safe.
|
||||||
|
|||||||
@@ -1,17 +1,23 @@
|
|||||||
--extra-index-url https://download.pytorch.org/whl/rocm7.0
|
--extra-index-url https://download.pytorch.org/whl/rocm7.0
|
||||||
torch==2.10.0+rocm7.0
|
torch==2.10.0+rocm7.0
|
||||||
torchvision==0.25.0+rocm7.0
|
torchvision==0.25.0+rocm7.0
|
||||||
git+https://github.com/huggingface/diffusers
|
diffusers==0.38.0
|
||||||
opencv-python
|
opencv-python
|
||||||
transformers
|
transformers==4.57.6
|
||||||
accelerate
|
accelerate
|
||||||
peft
|
peft
|
||||||
sentencepiece
|
sentencepiece
|
||||||
optimum-quanto
|
optimum-quanto
|
||||||
ftfy
|
ftfy
|
||||||
# TODO: re-add compel once it supports transformers >= 5.
|
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||||
# https://github.com/damian0815/compel/issues/128
|
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
# last known-good released pair so builds are reproducible and can't drift into
|
||||||
|
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||||
|
#
|
||||||
|
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||||
|
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||||
|
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||||
|
# the import succeeding, so dropping it here is safe.
|
||||||
@@ -3,18 +3,24 @@ torch
|
|||||||
torchvision
|
torchvision
|
||||||
optimum[openvino]
|
optimum[openvino]
|
||||||
setuptools
|
setuptools
|
||||||
git+https://github.com/huggingface/diffusers
|
diffusers==0.38.0
|
||||||
opencv-python
|
opencv-python
|
||||||
transformers
|
transformers==4.57.6
|
||||||
accelerate
|
accelerate
|
||||||
git+https://github.com/xhinker/sd_embed
|
git+https://github.com/xhinker/sd_embed
|
||||||
peft
|
peft
|
||||||
sentencepiece
|
sentencepiece
|
||||||
optimum-quanto
|
optimum-quanto
|
||||||
ftfy
|
ftfy
|
||||||
# TODO: re-add compel once it supports transformers >= 5.
|
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||||
# https://github.com/damian0815/compel/issues/128
|
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
# last known-good released pair so builds are reproducible and can't drift into
|
||||||
|
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||||
|
#
|
||||||
|
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||||
|
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||||
|
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||||
|
# the import succeeding, so dropping it here is safe.
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
|
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
|
||||||
torch
|
torch
|
||||||
git+https://github.com/huggingface/diffusers
|
diffusers==0.38.0
|
||||||
transformers
|
transformers==4.57.6
|
||||||
accelerate
|
accelerate
|
||||||
peft
|
peft
|
||||||
optimum-quanto
|
optimum-quanto
|
||||||
@@ -9,9 +9,15 @@ numpy<2
|
|||||||
sentencepiece
|
sentencepiece
|
||||||
torchvision
|
torchvision
|
||||||
ftfy
|
ftfy
|
||||||
# TODO: re-add compel once it supports transformers >= 5.
|
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||||
# https://github.com/damian0815/compel/issues/128
|
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
# last known-good released pair so builds are reproducible and can't drift into
|
||||||
|
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||||
|
#
|
||||||
|
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||||
|
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||||
|
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||||
|
# the import succeeding, so dropping it here is safe.
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
--extra-index-url https://download.pytorch.org/whl/cu130
|
--extra-index-url https://download.pytorch.org/whl/cu130
|
||||||
torch
|
torch
|
||||||
git+https://github.com/huggingface/diffusers
|
diffusers==0.38.0
|
||||||
transformers
|
transformers==4.57.6
|
||||||
accelerate
|
accelerate
|
||||||
peft
|
peft
|
||||||
optimum-quanto
|
optimum-quanto
|
||||||
@@ -10,9 +10,15 @@ sentencepiece
|
|||||||
torchvision
|
torchvision
|
||||||
ftfy
|
ftfy
|
||||||
chardet
|
chardet
|
||||||
# TODO: re-add compel once it supports transformers >= 5.
|
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||||
# https://github.com/damian0815/compel/issues/128
|
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
# last known-good released pair so builds are reproducible and can't drift into
|
||||||
|
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||||
|
#
|
||||||
|
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||||
|
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||||
|
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||||
|
# the import succeeding, so dropping it here is safe.
|
||||||
|
|||||||
@@ -1,16 +1,22 @@
|
|||||||
torch==2.7.1
|
torch==2.7.1
|
||||||
torchvision==0.22.1
|
torchvision==0.22.1
|
||||||
git+https://github.com/huggingface/diffusers
|
diffusers==0.38.0
|
||||||
opencv-python
|
opencv-python
|
||||||
transformers
|
transformers==4.57.6
|
||||||
accelerate
|
accelerate
|
||||||
peft
|
peft
|
||||||
sentencepiece
|
sentencepiece
|
||||||
optimum-quanto
|
optimum-quanto
|
||||||
ftfy
|
ftfy
|
||||||
# TODO: re-add compel once it supports transformers >= 5.
|
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||||
# https://github.com/damian0815/compel/issues/128
|
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
# last known-good released pair so builds are reproducible and can't drift into
|
||||||
|
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||||
|
#
|
||||||
|
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||||
|
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||||
|
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||||
|
# the import succeeding, so dropping it here is safe.
|
||||||
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
|
|
||||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||||
|
|
||||||
|
def _get_stride_seconds(self):
|
||||||
|
"""Compute the seconds-per-frame stride for the loaded model.
|
||||||
|
|
||||||
|
stride = preprocessor_window_stride * encoder_subsampling_factor
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
preprocessor = self.model.preprocessor
|
||||||
|
window_stride = preprocessor._cfg.get('window_stride', 0.01)
|
||||||
|
subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
|
||||||
|
return window_stride * subsampling_factor
|
||||||
|
except (AttributeError, KeyError, TypeError) as err:
|
||||||
|
print(
|
||||||
|
f"Warning: could not compute stride from model config ({err}), "
|
||||||
|
f"falling back to 0.08s/frame",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
return 0.08
|
||||||
|
|
||||||
|
def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
|
||||||
|
"""Build TranscriptSegment list from a NeMo Hypothesis with timestamps.
|
||||||
|
|
||||||
|
Supports two granularity modes:
|
||||||
|
- "word": one TranscriptSegment per word, each with a single TranscriptWord entry
|
||||||
|
- "segment" (default): merge consecutive words into sentence-level segments,
|
||||||
|
splitting at word-level time gaps that exceed a dynamic threshold.
|
||||||
|
"""
|
||||||
|
if not hypothesis or not isinstance(hypothesis.timestamp, dict):
|
||||||
|
return []
|
||||||
|
|
||||||
|
word_offsets = hypothesis.timestamp.get('word', [])
|
||||||
|
if not word_offsets:
|
||||||
|
return []
|
||||||
|
|
||||||
|
granularities = list(timestamp_granularities) if timestamp_granularities else []
|
||||||
|
granularity = "word" if "word" in granularities else "segment"
|
||||||
|
|
||||||
|
# Build a flat list of (text, start_ns, end_ns) from NeMo word offsets
|
||||||
|
transcript_words = []
|
||||||
|
for wo in word_offsets:
|
||||||
|
word_text = wo.get('word', '')
|
||||||
|
if not word_text:
|
||||||
|
continue
|
||||||
|
start_offset = wo.get('start_offset', 0)
|
||||||
|
end_offset = wo.get('end_offset', start_offset)
|
||||||
|
start_ns = int(start_offset * stride * 1_000_000_000)
|
||||||
|
end_ns = int(end_offset * stride * 1_000_000_000)
|
||||||
|
transcript_words.append({
|
||||||
|
'text': word_text,
|
||||||
|
'start': start_ns,
|
||||||
|
'end': end_ns,
|
||||||
|
})
|
||||||
|
|
||||||
|
if not transcript_words:
|
||||||
|
return []
|
||||||
|
|
||||||
|
if granularity == "word":
|
||||||
|
# One segment per word
|
||||||
|
result = []
|
||||||
|
for idx, tw in enumerate(transcript_words):
|
||||||
|
word = backend_pb2.TranscriptWord(
|
||||||
|
start=tw['start'], end=tw['end'], text=tw['text']
|
||||||
|
)
|
||||||
|
result.append(backend_pb2.TranscriptSegment(
|
||||||
|
id=idx,
|
||||||
|
start=tw['start'],
|
||||||
|
end=tw['end'],
|
||||||
|
text=tw['text'],
|
||||||
|
words=[word],
|
||||||
|
))
|
||||||
|
return result
|
||||||
|
|
||||||
|
# segment mode — merge at word-level time-gap boundaries
|
||||||
|
# Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s
|
||||||
|
gaps = []
|
||||||
|
for i in range(1, len(transcript_words)):
|
||||||
|
gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
|
||||||
|
if gap > 0:
|
||||||
|
gaps.append(gap)
|
||||||
|
if gaps:
|
||||||
|
gaps.sort()
|
||||||
|
median_gap = gaps[len(gaps) // 2]
|
||||||
|
threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
|
||||||
|
else:
|
||||||
|
threshold_ns = int(0.5 * 1_000_000_000)
|
||||||
|
|
||||||
|
result = []
|
||||||
|
buf_words = [] # list of TranscriptWord protobuf
|
||||||
|
buf_start = None
|
||||||
|
buf_end = 0
|
||||||
|
buf_text = []
|
||||||
|
prev_end = None
|
||||||
|
|
||||||
|
for tw in transcript_words:
|
||||||
|
# Detect word-level time gap
|
||||||
|
if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
|
||||||
|
seg_text = ' '.join(buf_text)
|
||||||
|
result.append(backend_pb2.TranscriptSegment(
|
||||||
|
id=len(result),
|
||||||
|
start=buf_start,
|
||||||
|
end=buf_end,
|
||||||
|
text=seg_text,
|
||||||
|
words=list(buf_words),
|
||||||
|
))
|
||||||
|
buf_words = []
|
||||||
|
buf_text = []
|
||||||
|
buf_start = None
|
||||||
|
|
||||||
|
if buf_start is None:
|
||||||
|
buf_start = tw['start']
|
||||||
|
buf_end = tw['end']
|
||||||
|
buf_text.append(tw['text'])
|
||||||
|
buf_words.append(backend_pb2.TranscriptWord(
|
||||||
|
start=tw['start'], end=tw['end'], text=tw['text']
|
||||||
|
))
|
||||||
|
prev_end = tw['end']
|
||||||
|
|
||||||
|
# flush remaining
|
||||||
|
if buf_text and buf_start is not None:
|
||||||
|
seg_text = ' '.join(buf_text)
|
||||||
|
result.append(backend_pb2.TranscriptSegment(
|
||||||
|
id=len(result),
|
||||||
|
start=buf_start,
|
||||||
|
end=buf_end,
|
||||||
|
text=seg_text,
|
||||||
|
words=list(buf_words),
|
||||||
|
))
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
def AudioTranscription(self, request, context):
|
def AudioTranscription(self, request, context):
|
||||||
result_segments = []
|
result_segments = []
|
||||||
text = ""
|
text = ""
|
||||||
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
|
print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
|
||||||
return backend_pb2.TranscriptResult(segments=[], text="")
|
return backend_pb2.TranscriptResult(segments=[], text="")
|
||||||
|
|
||||||
# NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
|
# Determine requested timestamp granularity
|
||||||
results = self.model.transcribe([audio_path])
|
timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
|
||||||
|
want_timestamps = bool(timestamp_granularities)
|
||||||
|
|
||||||
if not results or len(results) == 0:
|
if want_timestamps:
|
||||||
return backend_pb2.TranscriptResult(segments=[], text="")
|
# Request timestamps from NeMo.
|
||||||
|
# timestamps=True forces NeMo to return Hypothesis objects with
|
||||||
|
# the timestamp dict populated, so we omit return_hypotheses to
|
||||||
|
# let NeMo choose the correct return type.
|
||||||
|
results = self.model.transcribe([audio_path], timestamps=True)
|
||||||
|
|
||||||
# Get the transcript text from the first result.
|
if results and len(results) > 0:
|
||||||
# CTC models return List[str], TDT/RNNT models return List[Hypothesis]
|
hypotheses = results[0] if isinstance(results[0], list) else results
|
||||||
# where the actual text lives in Hypothesis.text.
|
if hypotheses and len(hypotheses) > 0:
|
||||||
result = results[0]
|
hypothesis = hypotheses[0]
|
||||||
if isinstance(result, str):
|
|
||||||
text = result
|
# Hypothesis object should have .timestamp populated
|
||||||
|
if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
|
||||||
|
print(
|
||||||
|
"Warning: timestamps were requested but NeMo did not return "
|
||||||
|
"Hypothesis objects; falling back to untimestamped output",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract text
|
||||||
|
if hasattr(hypothesis, 'text'):
|
||||||
|
text = hypothesis.text or ""
|
||||||
|
elif isinstance(hypothesis, str):
|
||||||
|
text = hypothesis
|
||||||
|
|
||||||
|
# Build segments with word-level timestamps
|
||||||
|
stride = self._get_stride_seconds()
|
||||||
|
result_segments = self._build_segments_with_words(
|
||||||
|
hypothesis, stride, timestamp_granularities
|
||||||
|
)
|
||||||
|
|
||||||
|
# If no word offsets but we have text, fall back to single segment
|
||||||
|
if not result_segments and text:
|
||||||
|
result_segments.append(backend_pb2.TranscriptSegment(
|
||||||
|
id=0, start=0, end=0, text=text
|
||||||
|
))
|
||||||
else:
|
else:
|
||||||
text = getattr(result, 'text', None) or ""
|
# Simple transcription without timestamps
|
||||||
|
# NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
|
||||||
|
results = self.model.transcribe([audio_path])
|
||||||
|
|
||||||
if text:
|
if results and len(results) > 0:
|
||||||
# Create a single segment with the full transcription
|
# Get the transcript text from the first result.
|
||||||
result_segments.append(backend_pb2.TranscriptSegment(
|
# CTC models return List[str], TDT/RNNT models return List[Hypothesis]
|
||||||
id=0, start=0, end=0, text=text
|
# where the actual text lives in Hypothesis.text.
|
||||||
))
|
result = results[0]
|
||||||
|
if isinstance(result, str):
|
||||||
|
text = result
|
||||||
|
else:
|
||||||
|
text = getattr(result, 'text', None) or ""
|
||||||
|
|
||||||
|
if text:
|
||||||
|
# Create a single segment with the full transcription
|
||||||
|
result_segments.append(backend_pb2.TranscriptSegment(
|
||||||
|
id=0, start=0, end=0, text=text
|
||||||
|
))
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
print(f"Error in AudioTranscription: {err}", file=sys.stderr)
|
print(f"Error in AudioTranscription: {err}", file=sys.stderr)
|
||||||
|
|||||||
@@ -309,6 +309,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
|
|
||||||
dataset_split = request.dataset_split or "train"
|
dataset_split = request.dataset_split or "train"
|
||||||
if os.path.exists(request.dataset_source):
|
if os.path.exists(request.dataset_source):
|
||||||
|
_allowed_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_DATASET_DIR", os.getcwd())))
|
||||||
|
_real_path = os.path.realpath(os.path.abspath(request.dataset_source))
|
||||||
|
if not (_real_path == _allowed_dir or _real_path.startswith(_allowed_dir + os.sep)):
|
||||||
|
raise ValueError("Dataset source path is outside the allowed directory")
|
||||||
if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
|
if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
|
||||||
dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
|
dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
|
||||||
elif request.dataset_source.endswith('.csv'):
|
elif request.dataset_source.endswith('.csv'):
|
||||||
@@ -687,6 +691,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
def ExportModel(self, request, context):
|
def ExportModel(self, request, context):
|
||||||
export_format = request.export_format or "lora"
|
export_format = request.export_format or "lora"
|
||||||
output_path = request.output_path
|
output_path = request.output_path
|
||||||
|
_allowed_output_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_OUTPUT_DIR", os.getcwd())))
|
||||||
|
_real_output_path = os.path.realpath(os.path.abspath(output_path))
|
||||||
|
if not (_real_output_path == _allowed_output_dir or _real_output_path.startswith(_allowed_output_dir + os.sep)):
|
||||||
|
raise ValueError("Output path is outside the allowed directory")
|
||||||
|
output_path = _real_output_path
|
||||||
checkpoint_path = request.checkpoint_path
|
checkpoint_path = request.checkpoint_path
|
||||||
|
|
||||||
# Extract HF token for gated model access
|
# Extract HF token for gated model access
|
||||||
@@ -807,7 +816,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
env["NO_LOCAL_GGUF"] = "1"
|
env["NO_LOCAL_GGUF"] = "1"
|
||||||
cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
|
cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
|
||||||
conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env)
|
conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env, shell=False) # nosemgrep: python.django.security.injection.command.subprocess-injection.subprocess-injection
|
||||||
if conv_result.returncode != 0:
|
if conv_result.returncode != 0:
|
||||||
diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
|
diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
|
||||||
return backend_pb2.Result(success=False,
|
return backend_pb2.Result(success=False,
|
||||||
|
|||||||
@@ -48,8 +48,10 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
HAS_REASONING_PARSERS = False
|
HAS_REASONING_PARSERS = False
|
||||||
|
|
||||||
|
# vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the
|
||||||
|
# SamplingParams field guided_decoding -> structured_outputs.
|
||||||
try:
|
try:
|
||||||
from vllm.sampling_params import GuidedDecodingParams
|
from vllm.sampling_params import StructuredOutputsParams
|
||||||
HAS_GUIDED_DECODING = True
|
HAS_GUIDED_DECODING = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
HAS_GUIDED_DECODING = False
|
HAS_GUIDED_DECODING = False
|
||||||
@@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
if value not in (None, 0, [], False):
|
if value not in (None, 0, [], False):
|
||||||
setattr(sampling_params, param_field, value)
|
setattr(sampling_params, param_field, value)
|
||||||
|
|
||||||
# Guided decoding: use Grammar field to pass JSON schema or BNF
|
# Structured-output decoding: use Grammar field to pass JSON schema or BNF
|
||||||
if HAS_GUIDED_DECODING and request.Grammar:
|
if HAS_GUIDED_DECODING and request.Grammar:
|
||||||
try:
|
try:
|
||||||
json.loads(request.Grammar) # valid JSON = JSON schema
|
json.loads(request.Grammar) # valid JSON = JSON schema
|
||||||
sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
|
sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
|
sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar)
|
||||||
|
|
||||||
# Extract image paths and process images
|
# Extract image paths and process images
|
||||||
prompt = request.Prompt
|
prompt = request.Prompt
|
||||||
@@ -596,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
|
|
||||||
# Stream the results
|
# Stream the results
|
||||||
generated_text = ""
|
generated_text = ""
|
||||||
|
generated_token_ids: list[int] = []
|
||||||
last_output = None
|
last_output = None
|
||||||
|
|
||||||
|
# Tool-parsing strategy decision (made once, before the loop):
|
||||||
|
#
|
||||||
|
# When a tool parser is active, the model's raw tool-call markup
|
||||||
|
# (e.g. <tool_call>...) must not be streamed verbatim as delta.content
|
||||||
|
# — clients would see the unparsed syntax. Two paths:
|
||||||
|
#
|
||||||
|
# (A) native streaming via parser.extract_tool_calls_streaming. All
|
||||||
|
# concrete tool parsers shipped with vLLM 0.23+ implement this
|
||||||
|
# (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes,
|
||||||
|
# llama3_json, mistral, …). The parser decides per-delta whether
|
||||||
|
# to emit content or suppress tool-call markup, and emits a
|
||||||
|
# structured DeltaMessage(tool_calls=[...]) when a call is ready.
|
||||||
|
# (B) buffer fallback — used only when the parser surprisingly lacks
|
||||||
|
# the streaming method or it raises mid-stream. The post-loop
|
||||||
|
# extract_tool_calls assembles the final chat_delta. Same correctness
|
||||||
|
# guarantee as a non-streaming response, at the cost of a delayed
|
||||||
|
# final chunk.
|
||||||
|
has_tool_parser = bool(self.tool_parser_cls and request.Tools)
|
||||||
|
tp_instance = None
|
||||||
|
tp_request = None
|
||||||
|
native_streaming = False
|
||||||
|
native_streaming_error = False
|
||||||
|
if has_tool_parser:
|
||||||
|
try:
|
||||||
|
tools_for_parser = json.loads(request.Tools)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
tools_for_parser = []
|
||||||
|
try:
|
||||||
|
tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser)
|
||||||
|
except TypeError:
|
||||||
|
tp_instance = self.tool_parser_cls(self.tokenizer)
|
||||||
|
# Build a minimal ChatCompletionRequest so the streaming method
|
||||||
|
# sees the tools list. We do not need any other request fields —
|
||||||
|
# parsers only read .tools (and sometimes .tool_choice, which we
|
||||||
|
# leave at default).
|
||||||
|
try:
|
||||||
|
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||||
|
ChatCompletionRequest as _CCR,
|
||||||
|
)
|
||||||
|
tp_request = _CCR(
|
||||||
|
model="local",
|
||||||
|
messages=[{"role": "user", "content": ""}],
|
||||||
|
tools=tools_for_parser or None,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Could not build ChatCompletionRequest for streaming parser: {e}",
|
||||||
|
file=sys.stderr)
|
||||||
|
tp_request = None
|
||||||
|
native_streaming = (
|
||||||
|
tp_request is not None
|
||||||
|
and hasattr(tp_instance, "extract_tool_calls_streaming")
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async for request_output in outputs:
|
async for request_output in outputs:
|
||||||
iteration_text = request_output.outputs[0].text
|
iteration_text = request_output.outputs[0].text
|
||||||
last_output = request_output
|
last_output = request_output
|
||||||
|
|
||||||
if streaming:
|
if streaming:
|
||||||
# Remove text already sent as vllm concatenates the text from previous yields
|
|
||||||
delta_iteration_text = iteration_text.removeprefix(generated_text)
|
delta_iteration_text = iteration_text.removeprefix(generated_text)
|
||||||
# Send the partial result
|
new_token_ids = list(request_output.outputs[0].token_ids)
|
||||||
yield backend_pb2.Reply(
|
delta_token_ids = new_token_ids[len(generated_token_ids):]
|
||||||
message=bytes(delta_iteration_text, encoding='utf-8'),
|
|
||||||
chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Keep track of text generated
|
if not has_tool_parser:
|
||||||
|
# Plain streaming — unchanged from pre-tool-parser path.
|
||||||
|
yield backend_pb2.Reply(
|
||||||
|
message=bytes(delta_iteration_text, encoding='utf-8'),
|
||||||
|
chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
|
||||||
|
)
|
||||||
|
elif native_streaming and not native_streaming_error:
|
||||||
|
# (A) Native vLLM extract_tool_calls_streaming.
|
||||||
|
try:
|
||||||
|
msg = tp_instance.extract_tool_calls_streaming(
|
||||||
|
previous_text=generated_text,
|
||||||
|
current_text=iteration_text,
|
||||||
|
delta_text=delta_iteration_text,
|
||||||
|
previous_token_ids=generated_token_ids,
|
||||||
|
current_token_ids=new_token_ids,
|
||||||
|
delta_token_ids=delta_token_ids,
|
||||||
|
request=tp_request,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Streaming tool parser error (falling back to "
|
||||||
|
f"buffer for the rest of the stream): {e}",
|
||||||
|
file=sys.stderr)
|
||||||
|
native_streaming_error = True
|
||||||
|
msg = None
|
||||||
|
if msg is not None:
|
||||||
|
tc_protos = []
|
||||||
|
for tc in (msg.tool_calls or []):
|
||||||
|
fn = tc.function or None
|
||||||
|
tc_protos.append(backend_pb2.ToolCallDelta(
|
||||||
|
index=tc.index,
|
||||||
|
id=tc.id or "",
|
||||||
|
name=(fn.name if fn and fn.name else "") or "",
|
||||||
|
arguments=(fn.arguments if fn and fn.arguments else "") or "",
|
||||||
|
))
|
||||||
|
cd_kwargs = {}
|
||||||
|
if msg.content:
|
||||||
|
cd_kwargs["content"] = msg.content
|
||||||
|
if msg.reasoning:
|
||||||
|
cd_kwargs["reasoning_content"] = msg.reasoning
|
||||||
|
if tc_protos:
|
||||||
|
cd_kwargs["tool_calls"] = tc_protos
|
||||||
|
if cd_kwargs:
|
||||||
|
yield backend_pb2.Reply(
|
||||||
|
message=bytes(msg.content or "", encoding='utf-8'),
|
||||||
|
chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)],
|
||||||
|
)
|
||||||
|
# (B) buffer fallback — emit nothing during the stream.
|
||||||
|
# The post-loop extract_tool_calls block builds the final chunk.
|
||||||
|
|
||||||
|
# Keep track of text + token_ids generated
|
||||||
generated_text = iteration_text
|
generated_text = iteration_text
|
||||||
|
generated_token_ids = list(request_output.outputs[0].token_ids)
|
||||||
finally:
|
finally:
|
||||||
await outputs.aclose()
|
await outputs.aclose()
|
||||||
|
|
||||||
@@ -637,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Reasoning parser error: {e}", file=sys.stderr)
|
print(f"Reasoning parser error: {e}", file=sys.stderr)
|
||||||
|
|
||||||
if self.tool_parser_cls and request.Tools:
|
# When (A) native streaming ran cleanly, per-delta yields above already
|
||||||
|
# delivered everything — do NOT extract again on the full text or we'd
|
||||||
|
# duplicate content/tool_calls into the final chunk.
|
||||||
|
if has_tool_parser and not (native_streaming and not native_streaming_error):
|
||||||
try:
|
try:
|
||||||
tools = json.loads(request.Tools)
|
tp = tp_instance
|
||||||
# Some concrete parsers only accept the tokenizer; only the
|
if tp is None:
|
||||||
# abstract base declares the tools kwarg. Try with tools first,
|
# Defensive: tp_instance build failed earlier; reconstruct.
|
||||||
# fall back to tokenizer-only.
|
tools = json.loads(request.Tools)
|
||||||
try:
|
try:
|
||||||
tp = self.tool_parser_cls(self.tokenizer, tools=tools)
|
tp = self.tool_parser_cls(self.tokenizer, tools=tools)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
tp = self.tool_parser_cls(self.tokenizer)
|
tp = self.tool_parser_cls(self.tokenizer)
|
||||||
info = tp.extract_tool_calls(content, request=None)
|
info = tp.extract_tool_calls(content, request=None)
|
||||||
if info.tools_called:
|
if info.tools_called:
|
||||||
content = info.content or ""
|
content = info.content or ""
|
||||||
@@ -659,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
))
|
))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Tool parser error: {e}", file=sys.stderr)
|
print(f"Tool parser error: {e}", file=sys.stderr)
|
||||||
|
elif native_streaming and not native_streaming_error:
|
||||||
|
# Per-delta path already emitted content + tool_calls; the final
|
||||||
|
# chat_delta should carry only metadata (token counts, logprobs).
|
||||||
|
content = ""
|
||||||
|
|
||||||
# Extract token counts
|
# Extract token counts
|
||||||
prompt_tokens = 0
|
prompt_tokens = 0
|
||||||
@@ -698,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if streaming:
|
if streaming:
|
||||||
# Final chunk with structured data
|
# Final chunk with structured data.
|
||||||
|
#
|
||||||
|
# If we used the buffer fallback (has_tool_parser=True AND native
|
||||||
|
# streaming did NOT run cleanly) and the parser found no tool call,
|
||||||
|
# flush the buffered content as ONE content delta — and clear the
|
||||||
|
# final chat_delta's content so the metadata chunk does not repeat
|
||||||
|
# what we just sent. This is the plain-text-with-tool-parser path.
|
||||||
|
buffered_fallback = (
|
||||||
|
has_tool_parser
|
||||||
|
and not (native_streaming and not native_streaming_error)
|
||||||
|
)
|
||||||
|
if buffered_fallback and not tool_calls_proto and content:
|
||||||
|
yield backend_pb2.Reply(
|
||||||
|
message=bytes(content, encoding='utf-8'),
|
||||||
|
chat_deltas=[backend_pb2.ChatDelta(content=content)],
|
||||||
|
)
|
||||||
|
chat_delta = backend_pb2.ChatDelta(
|
||||||
|
reasoning_content=reasoning_content,
|
||||||
|
tool_calls=tool_calls_proto,
|
||||||
|
)
|
||||||
yield backend_pb2.Reply(
|
yield backend_pb2.Reply(
|
||||||
message=b"",
|
message=b"",
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
|
|||||||
@@ -278,4 +278,261 @@ class TestBackendServicer(unittest.TestCase):
|
|||||||
print(err)
|
print(err)
|
||||||
self.fail("Embedding service failed")
|
self.fail("Embedding service failed")
|
||||||
finally:
|
finally:
|
||||||
self.tearDown()
|
self.tearDown()
|
||||||
|
|
||||||
|
|
||||||
|
class TestStreamingToolParser(unittest.TestCase):
|
||||||
|
"""
|
||||||
|
Server-less unit tests for the streaming + tool-parser machinery in
|
||||||
|
BackendServicer._predict. These tests instantiate BackendServicer
|
||||||
|
directly and mock the vLLM engine + tool parser, so they do not need
|
||||||
|
a GPU, a model, or a running gRPC server. Kept in a separate class to
|
||||||
|
avoid the parent setUp() which spawns a subprocess.
|
||||||
|
|
||||||
|
Covers #582 (follow-up to #10346):
|
||||||
|
1. Markup-leak prevention with a non-streaming parser (buffer fallback)
|
||||||
|
2. No content duplication on the plain-text path with the buffer fallback
|
||||||
|
3. Native streaming progressive plain-text emission
|
||||||
|
4. Native streaming structured tool_call, no markup leak
|
||||||
|
5. Parser exception → graceful fallback to buffer, still no markup
|
||||||
|
6. No-tool-parser regression: unchanged per-delta content stream
|
||||||
|
"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _make_generate(chunks):
|
||||||
|
"""Build a fake vLLM engine.generate that yields cumulative chunks."""
|
||||||
|
from types import SimpleNamespace
|
||||||
|
async def gen(*a, **k):
|
||||||
|
for i, t in enumerate(chunks):
|
||||||
|
yield SimpleNamespace(
|
||||||
|
outputs=[SimpleNamespace(
|
||||||
|
text=t,
|
||||||
|
token_ids=list(range(i + 1)),
|
||||||
|
logprobs=None,
|
||||||
|
)],
|
||||||
|
prompt_token_ids=[0],
|
||||||
|
)
|
||||||
|
return lambda *a, **k: gen()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _collect(servicer, req):
|
||||||
|
import asyncio
|
||||||
|
async def run():
|
||||||
|
return [r async for r in servicer._predict(req, None, streaming=True)]
|
||||||
|
return asyncio.run(run())
|
||||||
|
|
||||||
|
def _new_servicer(self):
    # Build a bare BackendServicer with parsers and tokenizer cleared, so
    # each test opts in to exactly the collaborators it needs.
    import os
    import sys

    # Make the sibling backend.py importable regardless of the current
    # working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, here)
    from backend import BackendServicer

    servicer = BackendServicer()
    for attribute in ("reasoning_parser_cls", "tool_parser_cls", "tokenizer"):
        setattr(servicer, attribute, None)
    return servicer
|
||||||
|
|
||||||
|
# ── Case 1+2: parser without streaming method → buffer fallback ──
|
||||||
|
def test_buffer_path_no_markup_no_duplication(self):
    # Parser exposes only extract_tool_calls (no streaming method), so the
    # servicer must buffer: raw markup may not reach delta.content, and
    # plain text must be delivered exactly once.
    from types import SimpleNamespace

    tools_json = '[{"type":"function","function":{"name":"calc","parameters":{}}}]'

    def parser_cls(called, content_text, calls):
        # Factory: a parser class whose extract_tool_calls returns a canned result.
        canned = dict(tools_called=called, content=content_text, tool_calls=calls)

        class _P:
            def __init__(self, tokenizer, tools=None):
                pass

            # NOTE: NO extract_tool_calls_streaming → takes the buffer path
            def extract_tool_calls(self, c, request=None):
                return SimpleNamespace(**canned)

        return _P

    # Tool-call case: no raw markup in any delta.content
    tool_call = SimpleNamespace(
        id="call_1",
        function=SimpleNamespace(name="calc", arguments='{"x": 1}'),
    )
    servicer = self._new_servicer()
    servicer.llm = SimpleNamespace(generate=self._make_generate([
        '<tool_call>\n{"name": "calc"',
        '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
    ]))
    servicer.tool_parser_cls = parser_cls(True, "", [tool_call])
    request = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
    replies = self._collect(servicer, request)

    contents = []
    names = []
    for reply in replies:
        for delta in reply.chat_deltas:
            if delta.content:
                contents.append(delta.content)
            for tc in delta.tool_calls:
                names.append(tc.name)
    leaked = any("<tool_call" in text for text in contents)
    self.assertFalse(leaked, f"markup leaked: {contents!r}")
    self.assertIn("calc", names, "tool_call missing from final chunk")

    # Plain-text-with-tools case: full content delivered exactly once
    plain_servicer = self._new_servicer()
    plain_servicer.llm = SimpleNamespace(generate=self._make_generate([
        "The capital ",
        "The capital of France is Paris.",
    ]))
    plain_servicer.tool_parser_cls = parser_cls(False, "", [])
    plain_request = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)

    pieces = []
    for reply in self._collect(plain_servicer, plain_request):
        for delta in reply.chat_deltas:
            if delta.content:
                pieces.append(delta.content)
    joined = "".join(pieces)
    occurrences = joined.count("The capital of France is Paris.")
    self.assertEqual(occurrences, 1, f"buffered content duplicated: {joined!r}")
|
||||||
|
|
||||||
|
# ── Case 3: native streaming, progressive plain text ──
|
||||||
|
def test_native_streaming_progressive_plain_text(self):
    # With a parser that implements extract_tool_calls_streaming, plain text
    # must arrive in chunks before the final reply and reassemble exactly.
    from types import SimpleNamespace

    class _DeltaMsg:
        def __init__(self, content=None, reasoning=None, tool_calls=None):
            self.content = content
            self.reasoning = reasoning
            self.tool_calls = tool_calls if tool_calls else []

    class StreamingParser:
        def __init__(self, tokenizer, tools=None):
            pass

        def extract_tool_calls(self, c, request=None):
            # Should NOT be called when native streaming runs successfully.
            raise AssertionError("extract_tool_calls invoked on native-streaming path")

        def extract_tool_calls_streaming(
            self, previous_text, current_text, delta_text,
            previous_token_ids, current_token_ids, delta_token_ids, request,
        ):
            # Echo every non-empty delta back as plain content.
            return _DeltaMsg(content=delta_text) if delta_text else None

    cumulative_outputs = [
        "Paris ",
        "Paris is ",
        "Paris is the capital of France.",
    ]
    servicer = self._new_servicer()
    servicer.llm = SimpleNamespace(generate=self._make_generate(cumulative_outputs))
    servicer.tool_parser_cls = StreamingParser
    request = backend_pb2.PredictOptions(
        Prompt="x",
        Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
    )
    replies = self._collect(servicer, request)

    def content_of(reply_list):
        # Non-empty delta.content values, in arrival order.
        found = []
        for reply in reply_list:
            for delta in reply.chat_deltas:
                if delta.content:
                    found.append(delta.content)
        return found

    # Everything except the last reply counts as "intermediate".
    intermediate_content = content_of(replies[:-1])
    self.assertTrue(
        len(intermediate_content) > 0,
        "Plain-text response not streamed progressively (native streaming inactive?)",
    )

    assembled = "".join(content_of(replies))
    self.assertEqual(
        assembled, "Paris is the capital of France.",
        f"Assembled content wrong: {assembled!r}",
    )
|
||||||
|
|
||||||
|
# ── Case 4: native streaming, structured tool_call, no markup ──
|
||||||
|
def test_native_streaming_tool_call_no_markup_leak(self):
    # A streaming parser that swallows markup and emits one structured
    # tool call: no tag text may surface as content, and the call's name
    # and arguments must be present in the replies.
    from types import SimpleNamespace

    class _DeltaMsg:
        def __init__(self, content=None, reasoning=None, tool_calls=None):
            self.content = content
            self.reasoning = reasoning
            self.tool_calls = tool_calls if tool_calls else []

    class _ToolCallStreamer:
        def __init__(self, tokenizer, tools=None):
            self._emitted = False

        def extract_tool_calls(self, c, request=None):
            raise AssertionError("extract_tool_calls invoked on native-streaming path")

        def extract_tool_calls_streaming(
            self, previous_text, current_text, delta_text,
            previous_token_ids, current_token_ids, delta_token_ids, request,
        ):
            # Stay silent until the closing tag is seen, then emit the call once.
            if self._emitted or "</tool_call>" not in current_text:
                return None
            self._emitted = True
            function = SimpleNamespace(name="calc", arguments='{"x": 1}')
            tool_call = SimpleNamespace(
                id="call_1", type="function", index=0, function=function,
            )
            return _DeltaMsg(tool_calls=[tool_call])

    servicer = self._new_servicer()
    servicer.llm = SimpleNamespace(generate=self._make_generate([
        '<tool_call>\n',
        '<tool_call>\n{"name": "calc"',
        '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
    ]))
    servicer.tool_parser_cls = _ToolCallStreamer
    request = backend_pb2.PredictOptions(
        Prompt="x",
        Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
    )
    replies = self._collect(servicer, request)

    contents = []
    names = []
    args = []
    for reply in replies:
        for delta in reply.chat_deltas:
            if delta.content:
                contents.append(delta.content)
            for tc in delta.tool_calls:
                if tc.name:
                    names.append(tc.name)
                if tc.arguments:
                    args.append(tc.arguments)

    leaked = any("<tool_call" in text or "</tool_call>" in text for text in contents)
    self.assertFalse(leaked, f"markup leaked as content: {contents!r}")
    self.assertIn("calc", names, f"tool_call name missing; got {names!r}")
    self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}")
|
||||||
|
|
||||||
|
# ── Case 5: parser exception → fallback to buffer, no leak ──
|
||||||
|
def test_native_streaming_parser_exception_falls_back_to_buffer(self):
|
||||||
|
from types import SimpleNamespace
|
||||||
|
call = SimpleNamespace(id="call_1",
|
||||||
|
function=SimpleNamespace(name="calc", arguments='{"x": 1}'))
|
||||||
|
|
||||||
|
class _BrokenStreamer:
|
||||||
|
def __init__(self, tokenizer, tools=None):
|
||||||
|
pass
|
||||||
|
def extract_tool_calls(self, c, request=None):
|
||||||
|
return SimpleNamespace(tools_called=True, content="", tool_calls=[call])
|
||||||
|
def extract_tool_calls_streaming(self, *a, **kw):
|
||||||
|
raise RuntimeError("simulated parser bug")
|
||||||
|
|
||||||
|
s = self._new_servicer()
|
||||||
|
s.llm = SimpleNamespace(generate=self._make_generate([
|
||||||
|
'<tool_call>\n{"name": "calc"',
|
||||||
|
'<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
|
||||||
|
]))
|
||||||
|
s.tool_parser_cls = _BrokenStreamer
|
||||||
|
req = backend_pb2.PredictOptions(
|
||||||
|
Prompt="x",
|
||||||
|
Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
|
||||||
|
)
|
||||||
|
replies = self._collect(s, req)
|
||||||
|
|
||||||
|
contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content]
|
||||||
|
self.assertFalse(
|
||||||
|
any("<tool_call" in c for c in contents),
|
||||||
|
f"markup leaked after parser exception: {contents!r}",
|
||||||
|
)
|
||||||
|
names = [tc.name for r in replies for cd in r.chat_deltas for tc in cd.tool_calls]
|
||||||
|
self.assertIn("calc", names, "tool_call missing from final chunk after fallback")
|
||||||
|
|
||||||
|
# ── Case 6: no tool parser → unchanged per-delta content stream ──
|
||||||
|
def test_no_tool_parser_unchanged_per_delta_stream(self):
|
||||||
|
from types import SimpleNamespace
|
||||||
|
s = self._new_servicer() # tool_parser_cls already None
|
||||||
|
s.llm = SimpleNamespace(generate=self._make_generate([
|
||||||
|
"Hello ", "Hello world", "Hello world!",
|
||||||
|
]))
|
||||||
|
req = backend_pb2.PredictOptions(Prompt="x", Tools="")
|
||||||
|
replies = self._collect(s, req)
|
||||||
|
|
||||||
|
intermediate = [
|
||||||
|
cd.content for r in replies[:-1] for cd in r.chat_deltas if cd.content
|
||||||
|
]
|
||||||
|
self.assertEqual(
|
||||||
|
intermediate, ["Hello ", "world", "!"],
|
||||||
|
f"plain streaming changed; got {intermediate!r}",
|
||||||
|
)
|
||||||
|
|||||||
@@ -341,11 +341,9 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
|
|||||||
}
|
}
|
||||||
appCfg := a.ApplicationConfig()
|
appCfg := a.ApplicationConfig()
|
||||||
|
|
||||||
if cfg.PII.Enabled != nil {
|
// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
|
||||||
enabled = *cfg.PII.Enabled
|
// default (cloud-proxy)" — the single source of that rule.
|
||||||
} else {
|
enabled = cfg.PIIIsEnabled()
|
||||||
enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
|
|
||||||
}
|
|
||||||
if !enabled {
|
if !enabled {
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
@@ -354,7 +352,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
|
|||||||
if len(detectors) == 0 {
|
if len(detectors) == 0 {
|
||||||
detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
|
detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
|
||||||
}
|
}
|
||||||
return enabled, detectors
|
return true, detectors // enabled is necessarily true past the !enabled guard
|
||||||
}
|
}
|
||||||
|
|
||||||
// PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
|
// PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
|
||||||
|
|||||||
@@ -357,6 +357,15 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
|
|||||||
Pressure: pressure,
|
Pressure: pressure,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Wire staging-progress broadcasting so file-staging shows up on every
|
||||||
|
// replica, not just the one performing the transfer. Without this, a
|
||||||
|
// /api/operations poll that round-robins onto a peer sees no staging row and
|
||||||
|
// the progress flickers. The origin publishes; peers mirror via the wildcard.
|
||||||
|
router.StagingTracker().SetPublisher(natsClient)
|
||||||
|
if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
|
||||||
|
xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
|
// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
|
||||||
// RegistrationToken feed the state-reconciliation passes: pending op
|
// RegistrationToken feed the state-reconciliation passes: pending op
|
||||||
// drain uses the adapter, and model health probes use the token to auth
|
// drain uses the adapter, and model health probes use the token to auth
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ import (
|
|||||||
"github.com/mudler/LocalAI/core/services/storage"
|
"github.com/mudler/LocalAI/core/services/storage"
|
||||||
coreStartup "github.com/mudler/LocalAI/core/startup"
|
coreStartup "github.com/mudler/LocalAI/core/startup"
|
||||||
"github.com/mudler/LocalAI/internal"
|
"github.com/mudler/LocalAI/internal"
|
||||||
|
"github.com/mudler/LocalAI/pkg/downloader"
|
||||||
"github.com/mudler/LocalAI/pkg/signals"
|
"github.com/mudler/LocalAI/pkg/signals"
|
||||||
"github.com/mudler/LocalAI/pkg/vram"
|
"github.com/mudler/LocalAI/pkg/vram"
|
||||||
|
|
||||||
@@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("unable to create ModelPath: %q", err)
|
return nil, fmt.Errorf("unable to create ModelPath: %q", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reap *.partial downloads abandoned by a previous run (killed mid-transfer
|
||||||
|
// by an OOM/restart, or stalled before cleanup could run). The 24h window
|
||||||
|
// is well beyond any legitimate in-flight download, so this never trims an
|
||||||
|
// active transfer; it just stops dead partials accumulating on the volume.
|
||||||
|
if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil {
|
||||||
|
xlog.Warn("Failed to reap stale partial downloads", "error", cErr)
|
||||||
|
} else if removed > 0 {
|
||||||
|
xlog.Info("Reaped stale partial downloads", "count", removed)
|
||||||
|
}
|
||||||
if options.GeneratedContentDir != "" {
|
if options.GeneratedContentDir != "" {
|
||||||
err := os.MkdirAll(options.GeneratedContentDir, 0o750)
|
err := os.MkdirAll(options.GeneratedContentDir, 0o750)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -633,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
|
|||||||
options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if settings.SizeAwareEviction != nil {
|
||||||
|
// Only apply if current value is default (false), suggesting it wasn't set from env var
|
||||||
|
if !options.SizeAwareEviction {
|
||||||
|
options.SizeAwareEviction = *settings.SizeAwareEviction
|
||||||
|
}
|
||||||
|
}
|
||||||
if settings.LRUEvictionMaxRetries != nil {
|
if settings.LRUEvictionMaxRetries != nil {
|
||||||
// Only apply if current value is default (30), suggesting it wasn't set from env var
|
// Only apply if current value is default (30), suggesting it wasn't set from env var
|
||||||
if options.LRUEvictionMaxRetries == 0 {
|
if options.LRUEvictionMaxRetries == 0 {
|
||||||
@@ -836,6 +853,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
|
|||||||
model.WithLRULimit(lruLimit),
|
model.WithLRULimit(lruLimit),
|
||||||
model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
|
model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
|
||||||
model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
|
model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
|
||||||
|
model.WithSizeAwareEviction(options.SizeAwareEviction),
|
||||||
)
|
)
|
||||||
application.ModelLoader().SetWatchDog(wd)
|
application.ModelLoader().SetWatchDog(wd)
|
||||||
|
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
|
|||||||
model.WithLRULimit(lruLimit),
|
model.WithLRULimit(lruLimit),
|
||||||
model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
|
model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
|
||||||
model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
|
model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
|
||||||
|
model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
|
||||||
)
|
)
|
||||||
|
|
||||||
// Create new stop channel BEFORE setting up any goroutines
|
// Create new stop channel BEFORE setting up any goroutines
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package backend
|
package backend
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math/rand/v2"
|
"math/rand/v2"
|
||||||
@@ -12,7 +13,9 @@ import (
|
|||||||
"github.com/mudler/LocalAI/core/config"
|
"github.com/mudler/LocalAI/core/config"
|
||||||
"github.com/mudler/LocalAI/core/trace"
|
"github.com/mudler/LocalAI/core/trace"
|
||||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
|
"github.com/mudler/LocalAI/pkg/downloader"
|
||||||
"github.com/mudler/LocalAI/pkg/model"
|
"github.com/mudler/LocalAI/pkg/model"
|
||||||
|
"github.com/mudler/LocalAI/pkg/vram"
|
||||||
"github.com/mudler/xlog"
|
"github.com/mudler/xlog"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// estimateModelSizeBytes uses the unified EstimateModel entry point to compute
|
||||||
|
// the total weight-file size for a model config. It collects all weight files
|
||||||
|
// from DownloadFiles, Model, and MMProj, and also extracts the HuggingFace
|
||||||
|
// repo ID so EstimateModel can fall back to the HF API when local file
|
||||||
|
// metadata is unavailable (e.g. not-yet-downloaded models).
|
||||||
|
func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
|
||||||
|
seen := make(map[string]bool)
|
||||||
|
input := vram.ModelEstimateInput{}
|
||||||
|
|
||||||
|
addFile := func(uri string) {
|
||||||
|
if !vram.IsWeightFile(uri) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
resolved := uri
|
||||||
|
if !strings.Contains(uri, "://") {
|
||||||
|
resolved = "file://" + filepath.Join(modelsPath, uri)
|
||||||
|
}
|
||||||
|
if seen[resolved] {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
seen[resolved] = true
|
||||||
|
input.Files = append(input.Files, vram.FileInput{URI: resolved})
|
||||||
|
}
|
||||||
|
|
||||||
|
// tryHFRepo resolves any huggingface:// or hf:// URI to an HTTPS URL and
|
||||||
|
// then extracts the org/model repo ID for use as the HF fallback path.
|
||||||
|
tryHFRepo := func(uri string) {
|
||||||
|
if input.HFRepo != "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
resolved := downloader.URI(uri).ResolveURL()
|
||||||
|
if repoID, ok := vram.ExtractHFRepoID(resolved); ok {
|
||||||
|
input.HFRepo = repoID
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, f := range c.DownloadFiles {
|
||||||
|
uriStr := string(f.URI)
|
||||||
|
addFile(uriStr)
|
||||||
|
tryHFRepo(uriStr)
|
||||||
|
}
|
||||||
|
addFile(c.Model)
|
||||||
|
tryHFRepo(c.Model)
|
||||||
|
if c.MMProj != "" {
|
||||||
|
addFile(c.MMProj)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(input.Files) == 0 && input.HFRepo == "" {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
result, err := vram.EstimateModelMultiContext(ctx, input, nil)
|
||||||
|
if err != nil || result.SizeBytes == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return int64(result.SizeBytes)
|
||||||
|
}
|
||||||
|
|
||||||
func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
|
func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
|
||||||
defOpts := []model.Option{
|
defOpts := []model.Option{
|
||||||
model.WithBackendString(c.Backend),
|
model.WithBackendString(c.Backend),
|
||||||
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
|
|||||||
defOpts = append(defOpts, model.WithExternalBackend(k, v))
|
defOpts = append(defOpts, model.WithExternalBackend(k, v))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
|
||||||
|
defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
|
||||||
|
}
|
||||||
|
|
||||||
return append(defOpts, opts...)
|
return append(defOpts, opts...)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -90,10 +158,11 @@ func getSeed(c config.ModelConfig) int32 {
|
|||||||
// DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
|
// DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
|
||||||
// model config leaves them unset. Exported so callers that must respect the
|
// model config leaves them unset. Exported so callers that must respect the
|
||||||
// effective decode window — notably the router's prompt trimmer — resolve the
|
// effective decode window — notably the router's prompt trimmer — resolve the
|
||||||
// same numbers grpcModelOpts does instead of guessing.
|
// same numbers grpcModelOpts does instead of guessing. The values are owned by
|
||||||
|
// core/config (single source of truth shared with the config default tiers).
|
||||||
const (
|
const (
|
||||||
DefaultContextSize = 4096
|
DefaultContextSize = config.DefaultContextSize
|
||||||
DefaultBatchSize = 512
|
DefaultBatchSize = config.DefaultPhysicalBatch
|
||||||
)
|
)
|
||||||
|
|
||||||
// EffectiveContextSize is the context window the backend will run with: the
|
// EffectiveContextSize is the context window the backend will run with: the
|
||||||
@@ -129,7 +198,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
|
|||||||
ctxSize := EffectiveContextSize(c)
|
ctxSize := EffectiveContextSize(c)
|
||||||
b := EffectiveBatchSize(c)
|
b := EffectiveBatchSize(c)
|
||||||
|
|
||||||
flashAttention := "auto"
|
flashAttention := config.DefaultFlashAttention
|
||||||
|
|
||||||
if c.FlashAttention != nil {
|
if c.FlashAttention != nil {
|
||||||
flashAttention = *c.FlashAttention
|
flashAttention = *c.FlashAttention
|
||||||
@@ -175,7 +244,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
|
|||||||
mmlock = *c.MMlock
|
mmlock = *c.MMlock
|
||||||
}
|
}
|
||||||
|
|
||||||
nGPULayers := 9999999
|
nGPULayers := config.DefaultNGPULayers
|
||||||
if c.NGPULayers != nil {
|
if c.NGPULayers != nil {
|
||||||
nGPULayers = *c.NGPULayers
|
nGPULayers = *c.NGPULayers
|
||||||
}
|
}
|
||||||
|
|||||||
88
core/backend/sound_classification.go
Normal file
88
core/backend/sound_classification.go
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
package backend
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
"github.com/mudler/LocalAI/core/config"
|
||||||
|
"github.com/mudler/LocalAI/core/schema"
|
||||||
|
|
||||||
|
grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
|
||||||
|
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
|
"github.com/mudler/LocalAI/pkg/model"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SoundDetectionRequest carries the knobs the HTTP layer collects for an
|
||||||
|
// audio-tagging / sound-event-classification call. Audio is the path to the
|
||||||
|
// uploaded clip on disk; TopK and Threshold are optional (0 = backend default).
|
||||||
|
type SoundDetectionRequest struct {
|
||||||
|
Audio string
|
||||||
|
TopK int32
|
||||||
|
Threshold float32
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *SoundDetectionRequest) toProto() *proto.SoundDetectionRequest {
|
||||||
|
return &proto.SoundDetectionRequest{
|
||||||
|
Src: r.Audio,
|
||||||
|
TopK: r.TopK,
|
||||||
|
Threshold: r.Threshold,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadSoundDetectionModel(ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (grpcPkg.Backend, error) {
|
||||||
|
if modelConfig.Backend == "" {
|
||||||
|
return nil, fmt.Errorf("sound classification: model %q has no backend set; supported backends include ced", modelConfig.Name)
|
||||||
|
}
|
||||||
|
opts := ModelOptions(modelConfig, appConfig)
|
||||||
|
m, err := ml.Load(opts...)
|
||||||
|
if err != nil {
|
||||||
|
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if m == nil {
|
||||||
|
return nil, fmt.Errorf("could not load sound classification model")
|
||||||
|
}
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ModelSoundDetection runs the SoundDetection RPC against the configured
|
||||||
|
// backend and returns a normalized schema.SoundClassificationResult.
|
||||||
|
func ModelSoundDetection(ctx context.Context, req SoundDetectionRequest, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.SoundClassificationResult, error) {
|
||||||
|
m, err := loadSoundDetectionModel(ml, modelConfig, appConfig)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
r, err := m.SoundDetection(ctx, req.toProto())
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return soundClassificationResultFromProto(modelConfig.Name, r), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// soundClassificationResultFromProto maps the backend detections to the
|
||||||
|
// HTTP-facing schema, stably sorted by descending score (ties keep the backend's order).
|
||||||
|
func soundClassificationResultFromProto(modelName string, r *proto.SoundDetectionResponse) *schema.SoundClassificationResult {
|
||||||
|
out := &schema.SoundClassificationResult{
|
||||||
|
Model: modelName,
|
||||||
|
Detections: []schema.SoundClassification{},
|
||||||
|
}
|
||||||
|
if r == nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for _, d := range r.Detections {
|
||||||
|
if d == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out.Detections = append(out.Detections, schema.SoundClassification{
|
||||||
|
Index: int(d.Index),
|
||||||
|
Label: d.Label,
|
||||||
|
Score: d.Score,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.SliceStable(out.Detections, func(i, j int) bool {
|
||||||
|
return out.Detections[i].Score > out.Detections[j].Score
|
||||||
|
})
|
||||||
|
return out
|
||||||
|
}
|
||||||
@@ -93,6 +93,7 @@ type RunCMD struct {
|
|||||||
EnableMemoryReclaimer bool `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
|
EnableMemoryReclaimer bool `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
|
||||||
MemoryReclaimerThreshold float64 `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
|
MemoryReclaimerThreshold float64 `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
|
||||||
ForceEvictionWhenBusy bool `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
|
ForceEvictionWhenBusy bool `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
|
||||||
|
SizeAwareEviction bool `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
|
||||||
LRUEvictionMaxRetries int `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
|
LRUEvictionMaxRetries int `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
|
||||||
LRUEvictionRetryInterval string `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
|
LRUEvictionRetryInterval string `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
|
||||||
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
|
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
|
||||||
@@ -564,6 +565,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
|||||||
if r.ForceEvictionWhenBusy {
|
if r.ForceEvictionWhenBusy {
|
||||||
opts = append(opts, config.WithForceEvictionWhenBusy(true))
|
opts = append(opts, config.WithForceEvictionWhenBusy(true))
|
||||||
}
|
}
|
||||||
|
if r.SizeAwareEviction {
|
||||||
|
opts = append(opts, config.WithSizeAwareEviction(true))
|
||||||
|
}
|
||||||
if r.LRUEvictionMaxRetries > 0 {
|
if r.LRUEvictionMaxRetries > 0 {
|
||||||
opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
|
opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -119,6 +119,7 @@ type ApplicationConfig struct {
|
|||||||
|
|
||||||
// Eviction settings
|
// Eviction settings
|
||||||
ForceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
ForceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
||||||
|
SizeAwareEviction bool // Evict largest models first rather than least-recently-used (default: false)
|
||||||
LRUEvictionMaxRetries int // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
LRUEvictionMaxRetries int // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
||||||
LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
|
LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
|
||||||
|
|
||||||
@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WithSizeAwareEviction enables size-aware eviction ordering.
|
||||||
|
// When true, the watchdog evicts the largest loaded model first rather than the
|
||||||
|
// least-recently-used one, keeping small utility models resident and maximizing
|
||||||
|
// memory freed per eviction.
|
||||||
|
func WithSizeAwareEviction(enabled bool) AppOption {
|
||||||
|
return func(o *ApplicationConfig) {
|
||||||
|
o.SizeAwareEviction = enabled
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
|
// WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
|
||||||
func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
|
func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
|
||||||
return func(o *ApplicationConfig) {
|
return func(o *ApplicationConfig) {
|
||||||
@@ -1028,6 +1039,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
|
|||||||
memoryReclaimerEnabled := o.MemoryReclaimerEnabled
|
memoryReclaimerEnabled := o.MemoryReclaimerEnabled
|
||||||
memoryReclaimerThreshold := o.MemoryReclaimerThreshold
|
memoryReclaimerThreshold := o.MemoryReclaimerThreshold
|
||||||
forceEvictionWhenBusy := o.ForceEvictionWhenBusy
|
forceEvictionWhenBusy := o.ForceEvictionWhenBusy
|
||||||
|
sizeAwareEviction := o.SizeAwareEviction
|
||||||
lruEvictionMaxRetries := o.LRUEvictionMaxRetries
|
lruEvictionMaxRetries := o.LRUEvictionMaxRetries
|
||||||
threads := o.Threads
|
threads := o.Threads
|
||||||
contextSize := o.ContextSize
|
contextSize := o.ContextSize
|
||||||
@@ -1120,6 +1132,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
|
|||||||
MemoryReclaimerEnabled: &memoryReclaimerEnabled,
|
MemoryReclaimerEnabled: &memoryReclaimerEnabled,
|
||||||
MemoryReclaimerThreshold: &memoryReclaimerThreshold,
|
MemoryReclaimerThreshold: &memoryReclaimerThreshold,
|
||||||
ForceEvictionWhenBusy: &forceEvictionWhenBusy,
|
ForceEvictionWhenBusy: &forceEvictionWhenBusy,
|
||||||
|
SizeAwareEviction: &sizeAwareEviction,
|
||||||
LRUEvictionMaxRetries: &lruEvictionMaxRetries,
|
LRUEvictionMaxRetries: &lruEvictionMaxRetries,
|
||||||
LRUEvictionRetryInterval: &lruEvictionRetryInterval,
|
LRUEvictionRetryInterval: &lruEvictionRetryInterval,
|
||||||
Threads: &threads,
|
Threads: &threads,
|
||||||
@@ -1244,6 +1257,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
|
|||||||
o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
||||||
// This setting doesn't require restart, can be updated dynamically
|
// This setting doesn't require restart, can be updated dynamically
|
||||||
}
|
}
|
||||||
|
if settings.SizeAwareEviction != nil {
|
||||||
|
o.SizeAwareEviction = *settings.SizeAwareEviction
|
||||||
|
// This setting doesn't require restart, can be updated dynamically
|
||||||
|
}
|
||||||
if settings.LRUEvictionMaxRetries != nil {
|
if settings.LRUEvictionMaxRetries != nil {
|
||||||
o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
|
o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
|
||||||
// This setting doesn't require restart, can be updated dynamically
|
// This setting doesn't require restart, can be updated dynamically
|
||||||
|
|||||||
@@ -8,27 +8,28 @@ import (
|
|||||||
// Usecase name constants — the canonical string values used in gallery entries,
|
// Usecase name constants — the canonical string values used in gallery entries,
|
||||||
// model configs (known_usecases), and UsecaseInfoMap keys.
|
// model configs (known_usecases), and UsecaseInfoMap keys.
|
||||||
const (
|
const (
|
||||||
UsecaseChat = "chat"
|
UsecaseChat = "chat"
|
||||||
UsecaseCompletion = "completion"
|
UsecaseCompletion = "completion"
|
||||||
UsecaseEdit = "edit"
|
UsecaseEdit = "edit"
|
||||||
UsecaseVision = "vision"
|
UsecaseVision = "vision"
|
||||||
UsecaseEmbeddings = "embeddings"
|
UsecaseEmbeddings = "embeddings"
|
||||||
UsecaseTokenize = "tokenize"
|
UsecaseTokenize = "tokenize"
|
||||||
UsecaseImage = "image"
|
UsecaseImage = "image"
|
||||||
UsecaseVideo = "video"
|
UsecaseVideo = "video"
|
||||||
UsecaseTranscript = "transcript"
|
UsecaseTranscript = "transcript"
|
||||||
UsecaseTTS = "tts"
|
UsecaseTTS = "tts"
|
||||||
UsecaseSoundGeneration = "sound_generation"
|
UsecaseSoundGeneration = "sound_generation"
|
||||||
UsecaseRerank = "rerank"
|
UsecaseRerank = "rerank"
|
||||||
UsecaseDetection = "detection"
|
UsecaseDetection = "detection"
|
||||||
UsecaseDepth = "depth"
|
UsecaseDepth = "depth"
|
||||||
UsecaseVAD = "vad"
|
UsecaseVAD = "vad"
|
||||||
UsecaseAudioTransform = "audio_transform"
|
UsecaseAudioTransform = "audio_transform"
|
||||||
UsecaseDiarization = "diarization"
|
UsecaseDiarization = "diarization"
|
||||||
UsecaseRealtimeAudio = "realtime_audio"
|
UsecaseSoundClassification = "sound_classification"
|
||||||
UsecaseFaceRecognition = "face_recognition"
|
UsecaseRealtimeAudio = "realtime_audio"
|
||||||
UsecaseSpeakerRecognition = "speaker_recognition"
|
UsecaseFaceRecognition = "face_recognition"
|
||||||
UsecaseTokenClassify = "token_classify"
|
UsecaseSpeakerRecognition = "speaker_recognition"
|
||||||
|
UsecaseTokenClassify = "token_classify"
|
||||||
)
|
)
|
||||||
|
|
||||||
// GRPCMethod identifies a Backend service RPC from backend.proto.
|
// GRPCMethod identifies a Backend service RPC from backend.proto.
|
||||||
@@ -51,6 +52,7 @@ const (
|
|||||||
MethodVAD GRPCMethod = "VAD"
|
MethodVAD GRPCMethod = "VAD"
|
||||||
MethodAudioTransform GRPCMethod = "AudioTransform"
|
MethodAudioTransform GRPCMethod = "AudioTransform"
|
||||||
MethodDiarize GRPCMethod = "Diarize"
|
MethodDiarize GRPCMethod = "Diarize"
|
||||||
|
MethodSoundDetection GRPCMethod = "SoundDetection"
|
||||||
MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
|
MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
|
||||||
MethodFaceVerify GRPCMethod = "FaceVerify"
|
MethodFaceVerify GRPCMethod = "FaceVerify"
|
||||||
MethodFaceAnalyze GRPCMethod = "FaceAnalyze"
|
MethodFaceAnalyze GRPCMethod = "FaceAnalyze"
|
||||||
@@ -165,6 +167,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
|
|||||||
GRPCMethod: MethodDiarize,
|
GRPCMethod: MethodDiarize,
|
||||||
Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
|
Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
|
||||||
},
|
},
|
||||||
|
UsecaseSoundClassification: {
|
||||||
|
Flag: FLAG_SOUND_CLASSIFICATION,
|
||||||
|
GRPCMethod: MethodSoundDetection,
|
||||||
|
Description: "Sound-event classification / audio tagging (scored AudioSet labels like baby cry, glass breaking, alarms) via the SoundDetection RPC.",
|
||||||
|
},
|
||||||
UsecaseRealtimeAudio: {
|
UsecaseRealtimeAudio: {
|
||||||
Flag: FLAG_REALTIME_AUDIO,
|
Flag: FLAG_REALTIME_AUDIO,
|
||||||
GRPCMethod: MethodAudioToAudioStream,
|
GRPCMethod: MethodAudioToAudioStream,
|
||||||
|
|||||||
30
core/config/defaults.go
Normal file
30
core/config/defaults.go
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
package config
|
||||||
|
|
||||||
|
// Canonical default values.
|
||||||
|
//
|
||||||
|
// These are owned here so the two layers that need them share a single source
|
||||||
|
// of truth: the config tiers (ApplyInference/Hardware/Serving/Generic — which
|
||||||
|
// *decide* defaults) and core/backend/options.go (which *translates* a
|
||||||
|
// ModelConfig to the backend wire format and supplies the same fallbacks
|
||||||
|
// defensively). Previously these were duplicated as literals across both
|
||||||
|
// packages and had drifted (e.g. n_gpu_layers 9999999 vs 99999999, two batch
|
||||||
|
// constants of 512). core/backend imports core/config, so backend references
|
||||||
|
// these; config never imports backend.
|
||||||
|
const (
	// DefaultContextSize is the context window used when the configuration
	// sets none and no estimate can be derived from the model.
	DefaultContextSize = 4_096

	// GGUFFallbackContextSize is the context window given to a GGUF model
	// whose metadata yields no usable estimate (see guessGGUFFromFile). It is
	// intentionally smaller than DefaultContextSize to stay conservative on
	// memory in that situation.
	GGUFFallbackContextSize = 1_024

	// DefaultNGPULayers requests offloading of all layers; the backend
	// (fit_params) clamps the value to what actually fits in device memory.
	DefaultNGPULayers = 99_999_999

	// DefaultFlashAttention is the default flash-attention mode. "auto" lets
	// the backend turn it on when both the model and the backend support it.
	DefaultFlashAttention = "auto"
)
|
||||||
115
core/config/generic_defaults.go
Normal file
115
core/config/generic_defaults.go
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
package config
|
||||||
|
|
||||||
|
import "os"
|
||||||
|
|
||||||
|
// ApplyGenericDefaults fills the generic fallback values applied after the
|
||||||
|
// higher-priority tiers (ApplyInferenceDefaults for the model family,
|
||||||
|
// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
|
||||||
|
// policy): sampling parameters and a few runtime flags. Like the other tiers it
|
||||||
|
// only fills values still left unset, so model-family / explicit config wins.
|
||||||
|
func ApplyGenericDefaults(cfg *ModelConfig) {
|
||||||
|
if cfg == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
||||||
|
defaultTopP := 0.95
|
||||||
|
defaultTopK := 40
|
||||||
|
defaultMinP := 0.0
|
||||||
|
defaultTemp := 0.9
|
||||||
|
// https://github.com/mudler/LocalAI/issues/2780
|
||||||
|
defaultMirostat := 0
|
||||||
|
defaultMirostatTAU := 5.0
|
||||||
|
defaultMirostatETA := 0.1
|
||||||
|
defaultTypicalP := 1.0
|
||||||
|
defaultTFZ := 1.0
|
||||||
|
defaultZero := 0
|
||||||
|
|
||||||
|
trueV := true
|
||||||
|
falseV := false
|
||||||
|
|
||||||
|
if cfg.Seed == nil {
|
||||||
|
// random number generator seed
|
||||||
|
defaultSeed := RAND_SEED
|
||||||
|
cfg.Seed = &defaultSeed
|
||||||
|
}
|
||||||
|
|
||||||
|
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
|
||||||
|
// native default differs (issue #6632). Only inject it for the llama.cpp
|
||||||
|
// family and the empty/auto backend; leave TopK nil for known non-llama
|
||||||
|
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
|
||||||
|
// is 0 rather than a silently-changed 40.
|
||||||
|
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
|
||||||
|
cfg.TopK = &defaultTopK
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.MinP == nil {
|
||||||
|
cfg.MinP = &defaultMinP
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.TypicalP == nil {
|
||||||
|
cfg.TypicalP = &defaultTypicalP
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.TFZ == nil {
|
||||||
|
cfg.TFZ = &defaultTFZ
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.MMap == nil {
|
||||||
|
// MMap is enabled by default
|
||||||
|
|
||||||
|
// Only exception is for Intel GPUs
|
||||||
|
if os.Getenv("XPU") != "" {
|
||||||
|
cfg.MMap = &falseV
|
||||||
|
} else {
|
||||||
|
cfg.MMap = &trueV
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.MMlock == nil {
|
||||||
|
// MMlock is disabled by default
|
||||||
|
cfg.MMlock = &falseV
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.TopP == nil {
|
||||||
|
cfg.TopP = &defaultTopP
|
||||||
|
}
|
||||||
|
if cfg.Temperature == nil {
|
||||||
|
cfg.Temperature = &defaultTemp
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Maxtokens == nil {
|
||||||
|
cfg.Maxtokens = &defaultZero
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Mirostat == nil {
|
||||||
|
cfg.Mirostat = &defaultMirostat
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.MirostatETA == nil {
|
||||||
|
cfg.MirostatETA = &defaultMirostatETA
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.MirostatTAU == nil {
|
||||||
|
cfg.MirostatTAU = &defaultMirostatTAU
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.LowVRAM == nil {
|
||||||
|
cfg.LowVRAM = &falseV
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Embeddings == nil {
|
||||||
|
cfg.Embeddings = &falseV
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Reranking == nil {
|
||||||
|
cfg.Reranking = &falseV
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.PromptCacheAll == nil {
|
||||||
|
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
||||||
|
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
||||||
|
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
||||||
|
cfg.PromptCacheAll = &trueV
|
||||||
|
}
|
||||||
|
}
|
||||||
36
core/config/generic_defaults_test.go
Normal file
36
core/config/generic_defaults_test.go
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
package config_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
. "github.com/mudler/LocalAI/core/config"
|
||||||
|
. "github.com/onsi/ginkgo/v2"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
|
||||||
|
It("fills sampling + runtime fallbacks when unset", func() {
|
||||||
|
cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
|
||||||
|
ApplyGenericDefaults(cfg)
|
||||||
|
Expect(cfg.TopP).ToNot(BeNil())
|
||||||
|
Expect(*cfg.TopP).To(Equal(0.95))
|
||||||
|
Expect(*cfg.TopK).To(Equal(40))
|
||||||
|
Expect(*cfg.Temperature).To(Equal(0.9))
|
||||||
|
Expect(*cfg.MMap).To(BeTrue())
|
||||||
|
Expect(*cfg.MMlock).To(BeFalse())
|
||||||
|
Expect(*cfg.PromptCacheAll).To(BeTrue())
|
||||||
|
})
|
||||||
|
|
||||||
|
It("never overrides explicit values", func() {
|
||||||
|
tk := 7
|
||||||
|
tp := 0.5
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
cfg.TopK = &tk
|
||||||
|
cfg.TopP = &tp
|
||||||
|
ApplyGenericDefaults(cfg)
|
||||||
|
Expect(*cfg.TopK).To(Equal(7))
|
||||||
|
Expect(*cfg.TopP).To(Equal(0.5))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("no-ops on nil", func() {
|
||||||
|
Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
|
||||||
|
})
|
||||||
|
})
|
||||||
@@ -14,11 +14,6 @@ import (
|
|||||||
"github.com/gpustack/gguf-parser-go/util/ptr"
|
"github.com/gpustack/gguf-parser-go/util/ptr"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
|
||||||
defaultContextSize = 1024
|
|
||||||
defaultNGPULayers = 99999999
|
|
||||||
)
|
|
||||||
|
|
||||||
// reservedNonChatModel reports whether the operator reserved this model for an
|
// reservedNonChatModel reports whether the operator reserved this model for an
|
||||||
// internal primitive — the router score classifier or the PII NER
|
// internal primitive — the router score classifier or the PII NER
|
||||||
// token_classify tier. Such a model has no chat template and must not be
|
// token_classify tier. Such a model has no chat template and must not be
|
||||||
@@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
|
|||||||
cSize := int(ctxSize)
|
cSize := int(ctxSize)
|
||||||
cfg.ContextSize = &cSize
|
cfg.ContextSize = &cSize
|
||||||
} else {
|
} else {
|
||||||
defaultCtx = defaultContextSize
|
defaultCtx = GGUFFallbackContextSize
|
||||||
cfg.ContextSize = &defaultCtx
|
cfg.ContextSize = &defaultCtx
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
|
|||||||
|
|
||||||
if cfg.NGPULayers == nil {
|
if cfg.NGPULayers == nil {
|
||||||
// we assume we want to offload all layers
|
// we assume we want to offload all layers
|
||||||
defaultHigh := defaultNGPULayers
|
defaultHigh := DefaultNGPULayers
|
||||||
cfg.NGPULayers = &defaultHigh
|
cfg.NGPULayers = &defaultHigh
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
180
core/config/hardware_defaults.go
Normal file
180
core/config/hardware_defaults.go
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
||||||
|
"github.com/mudler/xlog"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Hardware-driven model-config defaults.
|
||||||
|
//
|
||||||
|
// This sits alongside the other config overriders (ApplyInferenceDefaults for
|
||||||
|
// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
|
||||||
|
// heuristically fill ModelConfig values the user left unset. Hardware tuning is
|
||||||
|
// the same domain — "adjust the config from the device that will run it" — so
|
||||||
|
// it lives here rather than scattered into the backend or a separate package.
|
||||||
|
//
|
||||||
|
// The heuristics are parameterized on a GPU descriptor (not on direct
|
||||||
|
// detection) so they apply in both deployment shapes: SetDefaults passes the
|
||||||
|
// LocalGPU on a single host, and the distributed router passes the *selected
|
||||||
|
// node's* reported GPU before loading there (the frontend that loaded the
|
||||||
|
// config may have no GPU at all).
|
||||||
|
|
||||||
|
// GPU is a descriptor of the device a model will run on.
type GPU struct {
	// Vendor names the GPU vendor, e.g. "nvidia" or "amd"; it matches the
	// xsysinfo vendor constants.
	Vendor string

	// ComputeCapability holds the NVIDIA compute capability in "major.minor"
	// form (for example "12.1" for GB10 / DGX Spark). It is empty for
	// non-NVIDIA or unknown devices.
	ComputeCapability string

	// VRAM is the total device memory in bytes; 0 means unknown.
	VRAM uint64
}
|
||||||
|
|
||||||
|
// Defaults for the physical batch size (n_batch / n_ubatch).
const (
	// DefaultPhysicalBatch is the conservative value used when no
	// hardware-specific tuning applies. core/backend.DefaultBatchSize refers
	// to it, so the number has a single source.
	DefaultPhysicalBatch = 512

	// BlackwellPhysicalBatch is the value used on NVIDIA Blackwell consumer
	// GPUs (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger
	// physical batch materially lifts MoE prefill there because per-expert
	// GEMM tiles fill better; measured on a GB10 with Qwen3-30B-A3B, the gain
	// saturates around 2048.
	BlackwellPhysicalBatch = 2_048
)
|
||||||
|
|
||||||
|
// IsNVIDIABlackwell reports whether the GPU is in the NVIDIA Blackwell consumer
|
||||||
|
// family (sm_12x). Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0)
|
||||||
|
// reports a different compute capability and is intentionally not matched.
|
||||||
|
func (g GPU) IsNVIDIABlackwell() bool {
|
||||||
|
maj, _ := parseComputeCapability(g.ComputeCapability)
|
||||||
|
return maj >= 12
|
||||||
|
}
|
||||||
|
|
||||||
|
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
||||||
|
// given hardware, used when the model config leaves batch unset.
|
||||||
|
func PhysicalBatch(g GPU) int {
|
||||||
|
if g.IsNVIDIABlackwell() {
|
||||||
|
return BlackwellPhysicalBatch
|
||||||
|
}
|
||||||
|
return DefaultPhysicalBatch
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
||||||
|
// Callers that re-tune a value chosen by an upstream host (the distributed
|
||||||
|
// router correcting the frontend's guess) use this to avoid clobbering an
|
||||||
|
// explicit user batch such as 1024.
|
||||||
|
func IsManagedPhysicalBatch(n int) bool {
|
||||||
|
return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
|
||||||
|
}
|
||||||
|
|
||||||
|
// VRAM thresholds for the default parallel-slot count (n_parallel).
//
// llama.cpp serializes requests at n_parallel=1 (the backend default) and only
// auto-enables continuous batching when n_parallel > 1, so a single-slot
// default makes concurrent requests queue. The slot count is therefore
// defaulted by GPU size so multi-user serving works out of the box. With the
// backend's unified KV cache the slots SHARE the context budget, so more slots
// add concurrency without multiplying KV memory.
const (
	parallelSlotsVRAMHigh uint64 = 32 << 30 // 32 GiB and up -> 8 slots
	parallelSlotsVRAMMid  uint64 = 8 << 30  // 8 GiB and up -> 4 slots
	parallelSlotsVRAMLow  uint64 = 4 << 30  // 4 GiB and up -> 2 slots
)
|
||||||
|
|
||||||
|
// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns
|
||||||
|
// 1 (no concurrency) when VRAM is unknown or too small, so we never change
|
||||||
|
// behavior on CPU-only / tiny devices.
|
||||||
|
func DefaultParallelSlots(g GPU) int {
|
||||||
|
switch {
|
||||||
|
case g.VRAM >= parallelSlotsVRAMHigh:
|
||||||
|
return 8
|
||||||
|
case g.VRAM >= parallelSlotsVRAMMid:
|
||||||
|
return 4
|
||||||
|
case g.VRAM >= parallelSlotsVRAMLow:
|
||||||
|
return 2
|
||||||
|
default:
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
|
||||||
|
// model doesn't already set one (and the GPU warrants concurrency). Returns the
|
||||||
|
// possibly-extended options. Shared by the single-host config path
|
||||||
|
// (ApplyHardwareDefaults) and the distributed router (per selected node).
|
||||||
|
func EnsureParallelOption(opts []string, gpu GPU) []string {
|
||||||
|
if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
|
||||||
|
return append(opts, fmt.Sprintf("parallel:%d", slots))
|
||||||
|
}
|
||||||
|
return opts
|
||||||
|
}
|
||||||
|
|
||||||
|
// hasParallelOption reports whether the model already sets parallel/n_parallel
|
||||||
|
// so we never override an explicit value (helper shared with serving_defaults.go).
|
||||||
|
func hasParallelOption(opts []string) bool {
|
||||||
|
return backendOptionSet(opts, "parallel", "n_parallel")
|
||||||
|
}
|
||||||
|
|
||||||
|
// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
|
||||||
|
// a single host (the distributed router builds it from the selected node's
|
||||||
|
// reported info instead). It is a package var so tests can inject a
|
||||||
|
// deterministic device — detection does a live nvidia-smi call.
|
||||||
|
var localGPU = func() GPU {
|
||||||
|
vendor, _ := xsysinfo.DetectGPUVendor()
|
||||||
|
vram, _ := xsysinfo.TotalAvailableVRAM()
|
||||||
|
return GPU{
|
||||||
|
Vendor: vendor,
|
||||||
|
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
||||||
|
VRAM: vram,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
|
||||||
|
// and were left unset by the user. Currently: a larger physical batch on
|
||||||
|
// Blackwell. Explicit config always wins (we only touch zero values).
|
||||||
|
func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||||
|
if cfg == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
|
||||||
|
cfg.Batch = BlackwellPhysicalBatch
|
||||||
|
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||||
|
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enable concurrent serving by default on a capable GPU: without this the
|
||||||
|
// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
|
||||||
|
// (continuous batching stays off). Unified KV means the slots share the
|
||||||
|
// context budget, so this is concurrency without extra KV memory. Explicit
|
||||||
|
// parallel/n_parallel in the model options always wins.
|
||||||
|
if before := len(cfg.Options); true {
|
||||||
|
cfg.Options = EnsureParallelOption(cfg.Options, gpu)
|
||||||
|
if len(cfg.Options) > before {
|
||||||
|
xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
|
||||||
|
"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseComputeCapability splits a compute-capability string of the form
// "major.minor" into its integer parts. Surrounding whitespace is ignored.
//
// It returns (-1, -1) when cc is empty or the major part is not an integer.
// The minor part is optional: when it is missing or not an integer it is
// reported as 0 rather than failing the whole parse (e.g. "12" -> (12, 0)).
func parseComputeCapability(cc string) (int, int) {
	// Cut at the first dot; without a dot the whole string is the major part
	// and the minor part is empty.
	majorStr, minorStr, _ := strings.Cut(strings.TrimSpace(cc), ".")

	major, err := strconv.Atoi(strings.TrimSpace(majorStr))
	if err != nil {
		return -1, -1
	}

	minor, err := strconv.Atoi(strings.TrimSpace(minorStr))
	if err != nil {
		// A missing or malformed minor version is tolerated.
		minor = 0
	}
	return major, minor
}
|
||||||
37
core/config/hardware_defaults_internal_test.go
Normal file
37
core/config/hardware_defaults_internal_test.go
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
. "github.com/onsi/ginkgo/v2"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Single-instance path: SetDefaults applies hardware defaults from the local
|
||||||
|
// GPU. The detection seam (localGPU) is injected so the path is deterministic
|
||||||
|
// without a real GPU.
|
||||||
|
var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
|
||||||
|
var orig func() GPU
|
||||||
|
BeforeEach(func() { orig = localGPU })
|
||||||
|
AfterEach(func() { localGPU = orig })
|
||||||
|
|
||||||
|
It("sets the physical batch on a local Blackwell GPU", func() {
|
||||||
|
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
cfg.SetDefaults()
|
||||||
|
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("leaves batch unset on a non-Blackwell local GPU", func() {
|
||||||
|
localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
cfg.SetDefaults()
|
||||||
|
Expect(cfg.Batch).To(Equal(0))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("never overrides an explicit batch", func() {
|
||||||
|
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
cfg.Batch = 1024
|
||||||
|
cfg.SetDefaults()
|
||||||
|
Expect(cfg.Batch).To(Equal(1024))
|
||||||
|
})
|
||||||
|
})
|
||||||
97
core/config/hardware_defaults_test.go
Normal file
97
core/config/hardware_defaults_test.go
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
package config_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
. "github.com/mudler/LocalAI/core/config"
|
||||||
|
. "github.com/onsi/ginkgo/v2"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _ = Describe("Hardware-driven config defaults", func() {
|
||||||
|
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
||||||
|
func(cc string, want bool) {
|
||||||
|
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
||||||
|
},
|
||||||
|
Entry("GB10 12.1", "12.1", true),
|
||||||
|
Entry("RTX 50 12.0", "12.0", true),
|
||||||
|
Entry("future 13.0", "13.0", true),
|
||||||
|
Entry("Hopper 9.0", "9.0", false),
|
||||||
|
Entry("Ada 8.9", "8.9", false),
|
||||||
|
Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
|
||||||
|
Entry("unknown", "", false),
|
||||||
|
)
|
||||||
|
|
||||||
|
Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
|
||||||
|
It("returns the Blackwell batch on Blackwell", func() {
|
||||||
|
Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
|
||||||
|
})
|
||||||
|
It("returns the default batch otherwise", func() {
|
||||||
|
Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
|
||||||
|
Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
|
||||||
|
})
|
||||||
|
It("recognizes managed defaults but not explicit values", func() {
|
||||||
|
Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
|
||||||
|
Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
|
||||||
|
Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
Describe("ApplyHardwareDefaults", func() {
|
||||||
|
It("raises an unset batch to 2048 on Blackwell", func() {
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||||
|
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||||
|
})
|
||||||
|
It("leaves batch unset on non-Blackwell", func() {
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
|
||||||
|
Expect(cfg.Batch).To(Equal(0))
|
||||||
|
})
|
||||||
|
It("never overrides an explicit batch", func() {
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
cfg.Batch = 1024
|
||||||
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||||
|
Expect(cfg.Batch).To(Equal(1024))
|
||||||
|
})
|
||||||
|
It("no-ops on nil", func() {
|
||||||
|
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
const gib = uint64(1) << 30
|
||||||
|
|
||||||
|
DescribeTable("DefaultParallelSlots (by VRAM)",
|
||||||
|
func(vramGiB uint64, want int) {
|
||||||
|
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
||||||
|
},
|
||||||
|
Entry("GB10 119 GiB", uint64(119), 8),
|
||||||
|
Entry("48 GiB", uint64(48), 8),
|
||||||
|
Entry("24 GiB", uint64(24), 4),
|
||||||
|
Entry("8 GiB", uint64(8), 4),
|
||||||
|
Entry("6 GiB", uint64(6), 2),
|
||||||
|
Entry("2 GiB", uint64(2), 1),
|
||||||
|
Entry("unknown 0", uint64(0), 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
Describe("ApplyHardwareDefaults parallel slots", func() {
|
||||||
|
It("adds a VRAM-scaled parallel option on a capable GPU", func() {
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||||
|
Expect(cfg.Options).To(ContainElement("parallel:8"))
|
||||||
|
})
|
||||||
|
It("scales the slot count down with VRAM", func() {
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
|
||||||
|
Expect(cfg.Options).To(ContainElement("parallel:4"))
|
||||||
|
})
|
||||||
|
It("adds no parallel option on small/unknown VRAM", func() {
|
||||||
|
cfg := &ModelConfig{}
|
||||||
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
|
||||||
|
Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
|
||||||
|
})
|
||||||
|
It("never overrides an explicit parallel option", func() {
|
||||||
|
cfg := &ModelConfig{Options: []string{"parallel:2"}}
|
||||||
|
ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
|
||||||
|
Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
})
|
||||||
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
|
|||||||
// Default context size if not set, regardless of whether GGUF parsing succeeds
|
// Default context size if not set, regardless of whether GGUF parsing succeeds
|
||||||
defer func() {
|
defer func() {
|
||||||
if cfg.ContextSize == nil {
|
if cfg.ContextSize == nil {
|
||||||
ctx := defaultContextSize
|
ctx := GGUFFallbackContextSize
|
||||||
cfg.ContextSize = &ctx
|
cfg.ContextSize = &ctx
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ var UsecaseOptions = []FieldOption{
|
|||||||
{Value: "face_recognition", Label: "Face Recognition"},
|
{Value: "face_recognition", Label: "Face Recognition"},
|
||||||
{Value: "transcript", Label: "Transcript"},
|
{Value: "transcript", Label: "Transcript"},
|
||||||
{Value: "diarization", Label: "Diarization"},
|
{Value: "diarization", Label: "Diarization"},
|
||||||
|
{Value: "sound_classification", Label: "Sound Classification"},
|
||||||
{Value: "speaker_recognition", Label: "Speaker Recognition"},
|
{Value: "speaker_recognition", Label: "Speaker Recognition"},
|
||||||
{Value: "tts", Label: "TTS"},
|
{Value: "tts", Label: "TTS"},
|
||||||
{Value: "sound_generation", Label: "Sound Generation"},
|
{Value: "sound_generation", Label: "Sound Generation"},
|
||||||
|
|||||||
@@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
|||||||
Order: 45,
|
Order: 45,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// --- Alias ---
|
||||||
|
"alias": {
|
||||||
|
Section: "alias",
|
||||||
|
Label: "Alias target",
|
||||||
|
Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.",
|
||||||
|
Component: "model-select",
|
||||||
|
Order: 0,
|
||||||
|
},
|
||||||
|
|
||||||
// --- Pipeline ---
|
// --- Pipeline ---
|
||||||
"pipeline.llm": {
|
"pipeline.llm": {
|
||||||
Section: "pipeline",
|
Section: "pipeline",
|
||||||
@@ -319,6 +328,30 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
|||||||
AutocompleteProvider: ProviderModelsVAD,
|
AutocompleteProvider: ProviderModelsVAD,
|
||||||
Order: 63,
|
Order: 63,
|
||||||
},
|
},
|
||||||
|
"pipeline.sound_detection": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Sound Detection Model",
|
||||||
|
Description: "Model to use for sound-event classification (audio tagging, e.g. ced) in the pipeline. When set, committed realtime audio is also classified and the scored AudioSet tags are emitted as a conversation.item.sound_detection event.",
|
||||||
|
Component: "model-select",
|
||||||
|
AutocompleteProvider: ProviderModels,
|
||||||
|
Order: 64,
|
||||||
|
},
|
||||||
|
"pipeline.sound_detection_window_ms": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Sound Detection Window (ms)",
|
||||||
|
Description: "Server-side windowing for a sound-only realtime session: length in ms of the audio window classified each hop. 0 = client-driven (the client commits windows).",
|
||||||
|
Component: "number",
|
||||||
|
Min: f64(0),
|
||||||
|
Order: 65,
|
||||||
|
},
|
||||||
|
"pipeline.sound_detection_hop_ms": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Sound Detection Hop (ms)",
|
||||||
|
Description: "Server-side windowing hop in ms: how often the server classifies the last window. 0 = client-driven.",
|
||||||
|
Component: "number",
|
||||||
|
Min: f64(0),
|
||||||
|
Order: 66,
|
||||||
|
},
|
||||||
"pipeline.reasoning_effort": {
|
"pipeline.reasoning_effort": {
|
||||||
Section: "pipeline",
|
Section: "pipeline",
|
||||||
Label: "Reasoning Effort",
|
Label: "Reasoning Effort",
|
||||||
@@ -448,6 +481,55 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
|||||||
Component: "json-editor",
|
Component: "json-editor",
|
||||||
Order: 78,
|
Order: 78,
|
||||||
},
|
},
|
||||||
|
"pipeline.voice_recognition.enforce": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Voice Gate Enforce",
|
||||||
|
Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.",
|
||||||
|
Component: "toggle",
|
||||||
|
Order: 80,
|
||||||
|
},
|
||||||
|
"pipeline.voice_recognition.identity.announce": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Speaker Identity Announce",
|
||||||
|
Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.",
|
||||||
|
Component: "toggle",
|
||||||
|
Order: 81,
|
||||||
|
},
|
||||||
|
"pipeline.voice_recognition.identity.announce_unknown": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Speaker Identity Announce Unknown",
|
||||||
|
Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.",
|
||||||
|
Component: "toggle",
|
||||||
|
Order: 82,
|
||||||
|
},
|
||||||
|
"pipeline.voice_recognition.identity.personalize": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Speaker Identity Personalize",
|
||||||
|
Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.",
|
||||||
|
Component: "toggle",
|
||||||
|
Order: 83,
|
||||||
|
},
|
||||||
|
"pipeline.voice_recognition.identity.inject_name": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Speaker Identity Inject Name",
|
||||||
|
Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.",
|
||||||
|
Component: "toggle",
|
||||||
|
Order: 84,
|
||||||
|
},
|
||||||
|
"pipeline.voice_recognition.identity.inject_system_note": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Speaker Identity Inject System Note",
|
||||||
|
Description: "Personalization: append a 'The current speaker is <name>.' note to the system message reflecting the latest speaker.",
|
||||||
|
Component: "toggle",
|
||||||
|
Order: 85,
|
||||||
|
},
|
||||||
|
"pipeline.voice_recognition.identity.note_unknown": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Speaker Identity Note Unknown",
|
||||||
|
Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.",
|
||||||
|
Component: "toggle",
|
||||||
|
Order: 86,
|
||||||
|
},
|
||||||
"pipeline.max_history_items": {
|
"pipeline.max_history_items": {
|
||||||
Section: "pipeline",
|
Section: "pipeline",
|
||||||
Label: "Max History Items",
|
Label: "Max History Items",
|
||||||
@@ -455,6 +537,36 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
|||||||
Component: "number",
|
Component: "number",
|
||||||
Order: 79,
|
Order: 79,
|
||||||
},
|
},
|
||||||
|
"pipeline.compaction.enabled": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Compaction Enabled",
|
||||||
|
Description: "Fold conversation items that age out of the live window (Max History Items) into a rolling summary instead of dropping them, so long realtime sessions stay cheap without losing earlier context. Off by default.",
|
||||||
|
Component: "toggle",
|
||||||
|
Order: 80,
|
||||||
|
},
|
||||||
|
"pipeline.compaction.trigger_items": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Compaction Trigger Items",
|
||||||
|
Description: "High-water mark: once the live conversation exceeds this many items, the overflow above Max History Items is summarized and evicted. Must be greater than Max History Items; defaults to twice it. The gap controls how often summarization runs.",
|
||||||
|
Component: "number",
|
||||||
|
Order: 81,
|
||||||
|
},
|
||||||
|
"pipeline.compaction.summary_model": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Compaction Summary Model",
|
||||||
|
Description: "Optional smaller/cheaper model used to produce the rolling summary. Empty reuses the pipeline's own LLM. On CPU, a tiny model here keeps compaction from competing with the conversation LLM.",
|
||||||
|
Component: "input",
|
||||||
|
Advanced: true,
|
||||||
|
Order: 82,
|
||||||
|
},
|
||||||
|
"pipeline.compaction.max_summary_tokens": {
|
||||||
|
Section: "pipeline",
|
||||||
|
Label: "Compaction Max Summary Tokens",
|
||||||
|
Description: "Advisory cap on the rolling summary length (fed to the summarizer prompt). Defaults to 512.",
|
||||||
|
Component: "number",
|
||||||
|
Advanced: true,
|
||||||
|
Order: 83,
|
||||||
|
},
|
||||||
|
|
||||||
// --- Functions ---
|
// --- Functions ---
|
||||||
"function.grammar.parallel_calls": {
|
"function.grammar.parallel_calls": {
|
||||||
|
|||||||
28
core/config/meta/registry_test.go
Normal file
28
core/config/meta/registry_test.go
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
package meta_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/mudler/LocalAI/core/config/meta"
|
||||||
|
|
||||||
|
. "github.com/onsi/ginkgo/v2"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _ = Describe("alias field metadata", func() {
|
||||||
|
It("registers the alias field as a model-select in the alias section", func() {
|
||||||
|
reg := meta.DefaultRegistry()
|
||||||
|
f, ok := reg["alias"]
|
||||||
|
Expect(ok).To(BeTrue(), "alias field should have a registry override")
|
||||||
|
Expect(f.Section).To(Equal("alias"))
|
||||||
|
Expect(f.Component).To(Equal("model-select"))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("defines an alias section", func() {
|
||||||
|
var found bool
|
||||||
|
for _, s := range meta.DefaultSections() {
|
||||||
|
if s.ID == "alias" {
|
||||||
|
found = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Expect(found).To(BeTrue(), "DefaultSections should include an alias section")
|
||||||
|
})
|
||||||
|
})
|
||||||
@@ -69,6 +69,7 @@ type FieldMetaOverride struct {
|
|||||||
func DefaultSections() []Section {
|
func DefaultSections() []Section {
|
||||||
return []Section{
|
return []Section{
|
||||||
{ID: "general", Label: "General", Icon: "settings", Order: 0},
|
{ID: "general", Label: "General", Icon: "settings", Order: 0},
|
||||||
|
{ID: "alias", Label: "Alias", Icon: "git-merge", Order: 5},
|
||||||
{ID: "llm", Label: "LLM", Icon: "cpu", Order: 10},
|
{ID: "llm", Label: "LLM", Icon: "cpu", Order: 10},
|
||||||
{ID: "parameters", Label: "Parameters", Icon: "sliders", Order: 20},
|
{ID: "parameters", Label: "Parameters", Icon: "sliders", Order: 20},
|
||||||
{ID: "templates", Label: "Templates", Icon: "file-text", Order: 30},
|
{ID: "templates", Label: "Templates", Icon: "file-text", Order: 30},
|
||||||
|
|||||||
@@ -37,6 +37,12 @@ type ModelConfig struct {
|
|||||||
schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
|
schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
|
||||||
Name string `yaml:"name,omitempty" json:"name,omitempty"`
|
Name string `yaml:"name,omitempty" json:"name,omitempty"`
|
||||||
|
|
||||||
|
// Alias, when set, makes this config a pure redirect: every request for
|
||||||
|
// Name is served by the model named here. All other fields are ignored.
|
||||||
|
// The target must be an existing, non-alias model (enforced at load and
|
||||||
|
// at create/swap time). See docs/content for Model Aliases.
|
||||||
|
Alias string `yaml:"alias,omitempty" json:"alias,omitempty"`
|
||||||
|
|
||||||
F16 *bool `yaml:"f16,omitempty" json:"f16,omitempty"`
|
F16 *bool `yaml:"f16,omitempty" json:"f16,omitempty"`
|
||||||
Threads *int `yaml:"threads,omitempty" json:"threads,omitempty"`
|
Threads *int `yaml:"threads,omitempty" json:"threads,omitempty"`
|
||||||
Debug *bool `yaml:"debug,omitempty" json:"debug,omitempty"`
|
Debug *bool `yaml:"debug,omitempty" json:"debug,omitempty"`
|
||||||
@@ -391,6 +397,10 @@ func (c *ModelConfig) HasRouter() bool {
|
|||||||
return len(c.Router.Candidates) > 0
|
return len(c.Router.Candidates) > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IsAlias reports whether this config is a pure redirect to another model.
|
||||||
|
// Value receiver so it is callable on non-addressable config values too.
|
||||||
|
func (c ModelConfig) IsAlias() bool { return c.Alias != "" }
|
||||||
|
|
||||||
// @Description PII filtering configuration. PII redaction is per-model so
|
// @Description PII filtering configuration. PII redaction is per-model so
|
||||||
// that local models don't pay the latency or behaviour change of regex
|
// that local models don't pay the latency or behaviour change of regex
|
||||||
// scanning, while cloud-bound traffic (cloud-proxy backend) can default to
|
// scanning, while cloud-bound traffic (cloud-proxy backend) can default to
|
||||||
@@ -594,6 +604,20 @@ type Pipeline struct {
|
|||||||
LLM string `yaml:"llm,omitempty" json:"llm,omitempty"`
|
LLM string `yaml:"llm,omitempty" json:"llm,omitempty"`
|
||||||
Transcription string `yaml:"transcription,omitempty" json:"transcription,omitempty"`
|
Transcription string `yaml:"transcription,omitempty" json:"transcription,omitempty"`
|
||||||
VAD string `yaml:"vad,omitempty" json:"vad,omitempty"`
|
VAD string `yaml:"vad,omitempty" json:"vad,omitempty"`
|
||||||
|
// SoundDetection names a sound-event-classification model (e.g. ced). When
|
||||||
|
// set, each VAD-committed realtime utterance is also run through it and the
|
||||||
|
// scored AudioSet tags are emitted as a conversation.item.sound_detection
|
||||||
|
// server event, alongside (and independent of) transcription.
|
||||||
|
SoundDetection string `yaml:"sound_detection,omitempty" json:"sound_detection,omitempty"`
|
||||||
|
|
||||||
|
// SoundDetectionWindowMs / SoundDetectionHopMs enable server-side windowing
|
||||||
|
// for a sound-detection-only realtime session: instead of the client
|
||||||
|
// committing audio buffers, the server classifies the last WindowMs of
|
||||||
|
// streamed audio every HopMs and emits a sound_detection event per hop. Both
|
||||||
|
// must be > 0 to activate; otherwise the session stays client-driven (the
|
||||||
|
// client commits windows via input_audio_buffer.commit).
|
||||||
|
SoundDetectionWindowMs int `yaml:"sound_detection_window_ms,omitempty" json:"sound_detection_window_ms,omitempty"`
|
||||||
|
SoundDetectionHopMs int `yaml:"sound_detection_hop_ms,omitempty" json:"sound_detection_hop_ms,omitempty"`
|
||||||
|
|
||||||
// ReasoningEffort sets the reasoning effort (none|minimal|low|medium|high) for
|
// ReasoningEffort sets the reasoning effort (none|minimal|low|medium|high) for
|
||||||
// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
|
// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
|
||||||
@@ -617,11 +641,32 @@ type Pipeline struct {
|
|||||||
// context fills.
|
// context fills.
|
||||||
MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`
|
MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`
|
||||||
|
|
||||||
|
// Compaction folds conversation items that age out of the live window
|
||||||
|
// (max_history_items) into a rolling summary instead of dropping them, so
|
||||||
|
// long realtime sessions stay cheap without losing earlier context. Nil
|
||||||
|
// (block absent) means disabled, preserving existing behavior.
|
||||||
|
Compaction *PipelineCompaction `yaml:"compaction,omitempty" json:"compaction,omitempty"`
|
||||||
|
|
||||||
// VoiceRecognition gates the pipeline behind speaker verification. Nil
|
// VoiceRecognition gates the pipeline behind speaker verification. Nil
|
||||||
// (block absent) means no gate, preserving existing behavior.
|
// (block absent) means no gate, preserving existing behavior.
|
||||||
VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
|
VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PipelineCompaction holds the summarize-then-drop settings of a realtime
// pipeline.
type PipelineCompaction struct {
	// Enabled switches summarize-then-drop on. It defaults to false.
	Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
	// TriggerItems is the high-water mark for live items: when the count goes
	// past it, the overflow above max_history_items is summarized and
	// evicted. It must be larger than max_history_items and is clamped up
	// when it is not. Default: 2x max_history_items.
	TriggerItems int `yaml:"trigger_items,omitempty" json:"trigger_items,omitempty"`
	// SummaryModel optionally selects a smaller/cheaper model for the summary
	// call. When empty, the pipeline's own LLM is used.
	SummaryModel string `yaml:"summary_model,omitempty" json:"summary_model,omitempty"`
	// MaxSummaryTokens is an advisory summary length that is fed to the
	// prompt. Default 512.
	MaxSummaryTokens int `yaml:"max_summary_tokens,omitempty" json:"max_summary_tokens,omitempty"`
}
|
||||||
|
|
||||||
// ApplyReasoningEffort resolves the effective reasoning effort — a per-request
|
// ApplyReasoningEffort resolves the effective reasoning effort — a per-request
|
||||||
// value (requestEffort) overrides the config's own ReasoningEffort default —
|
// value (requestEffort) overrides the config's own ReasoningEffort default —
|
||||||
// stores it on the config so gRPCPredictOpts forwards it to the backend as the
|
// stores it on the config so gRPCPredictOpts forwards it to the backend as the
|
||||||
@@ -759,6 +804,13 @@ type PipelineVoiceRecognition struct {
|
|||||||
Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
|
Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
|
||||||
// References are the authorized reference speakers (verify mode).
|
// References are the authorized reference speakers (verify mode).
|
||||||
References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
|
References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
|
||||||
|
// Enforce controls the authorization gate. A nil value or true rejects
|
||||||
|
// unauthorized speakers (the historical behavior). false resolves the
|
||||||
|
// speaker's identity for surfacing/personalization but never drops a turn.
|
||||||
|
Enforce *bool `yaml:"enforce,omitempty" json:"enforce,omitempty"`
|
||||||
|
// Identity surfaces the recognized speaker to the client and the LLM. It is
|
||||||
|
// independent of Enforce: identity can be surfaced without gating.
|
||||||
|
Identity *VoiceIdentityConfig `yaml:"identity,omitempty" json:"identity,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Description VoiceRecognitionAllow filters authorized registry identities.
|
// @Description VoiceRecognitionAllow filters authorized registry identities.
|
||||||
@@ -775,6 +827,25 @@ type VoiceReference struct {
|
|||||||
Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
|
Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// @Description VoiceIdentityConfig controls how the recognized speaker is surfaced
// to the realtime client and to the LLM. When it is set, identity is resolved on
// every turn, even if the gate's When is "first" (the gate itself still
// authorizes only once).
type VoiceIdentityConfig struct {
	// Announce sends a conversation.item.speaker event to the client.
	Announce bool `yaml:"announce,omitempty" json:"announce,omitempty"`
	// AnnounceUnknown sends that event even when no confident match exists.
	AnnounceUnknown bool `yaml:"announce_unknown,omitempty" json:"announce_unknown,omitempty"`
	// Personalize tells the LLM who is speaking.
	Personalize bool `yaml:"personalize,omitempty" json:"personalize,omitempty"`
	// InjectName fills the per-message name field on every user turn.
	InjectName bool `yaml:"inject_name,omitempty" json:"inject_name,omitempty"`
	// InjectSystemNote keeps a "current speaker" note in the system message.
	InjectSystemNote bool `yaml:"inject_system_note,omitempty" json:"inject_system_note,omitempty"`
	// NoteUnknown adds a "the current speaker is unknown" note, which lets
	// the model ask who it is talking to.
	NoteUnknown bool `yaml:"note_unknown,omitempty" json:"note_unknown,omitempty"`
}
|
||||||
|
|
||||||
// VoiceGateEnabled reports whether a voice-recognition gate is configured. The
|
// VoiceGateEnabled reports whether a voice-recognition gate is configured. The
|
||||||
// mere presence of the block is the intent signal: a present-but-incomplete
|
// mere presence of the block is the intent signal: a present-but-incomplete
|
||||||
// block (e.g. missing model) must fail closed at construction, not be silently
|
// block (e.g. missing model) must fail closed at construction, not be silently
|
||||||
@@ -783,6 +854,28 @@ func (p Pipeline) VoiceGateEnabled() bool {
|
|||||||
return p.VoiceRecognition != nil
|
return p.VoiceRecognition != nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// EnforceGate reports whether the gate rejects unauthorized speakers. A nil
|
||||||
|
// Enforce means "enforce" so existing configs keep gating.
|
||||||
|
func (p PipelineVoiceRecognition) EnforceGate() bool {
|
||||||
|
return p.Enforce == nil || *p.Enforce
|
||||||
|
}
|
||||||
|
|
||||||
|
// IdentityEnabled reports whether the speaker's identity must be resolved for
|
||||||
|
// surfacing or personalization.
|
||||||
|
func (p PipelineVoiceRecognition) IdentityEnabled() bool {
|
||||||
|
return p.Identity != nil && (p.Identity.Announce || p.Identity.Personalize)
|
||||||
|
}
|
||||||
|
|
||||||
|
// AnnounceEnabled reports whether to emit the conversation.item.speaker event.
|
||||||
|
func (p PipelineVoiceRecognition) AnnounceEnabled() bool {
|
||||||
|
return p.Identity != nil && p.Identity.Announce
|
||||||
|
}
|
||||||
|
|
||||||
|
// PersonalizeEnabled reports whether to inform the LLM of the speaker.
|
||||||
|
func (p PipelineVoiceRecognition) PersonalizeEnabled() bool {
|
||||||
|
return p.Identity != nil && p.Identity.Personalize
|
||||||
|
}
|
||||||
|
|
||||||
// Normalize fills in defaults in place for omitted fields.
|
// Normalize fills in defaults in place for omitted fields.
|
||||||
func (v *PipelineVoiceRecognition) Normalize() {
|
func (v *PipelineVoiceRecognition) Normalize() {
|
||||||
if v.Mode == "" {
|
if v.Mode == "" {
|
||||||
@@ -1111,107 +1204,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
|||||||
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
|
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
|
||||||
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
|
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
|
||||||
|
|
||||||
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
|
||||||
defaultTopP := 0.95
|
// Uses the local GPU here; in distributed mode the router re-applies the same
|
||||||
defaultTopK := 40
|
// heuristics for the selected node's GPU before loading. Explicit config wins.
|
||||||
defaultMinP := 0.0
|
ApplyHardwareDefaults(cfg, localGPU())
|
||||||
defaultTemp := 0.9
|
|
||||||
// https://github.com/mudler/LocalAI/issues/2780
|
// Apply serving-policy defaults (device-independent): cross-request prefix
|
||||||
defaultMirostat := 0
|
// caching. Propagates to distributed nodes via the model options.
|
||||||
defaultMirostatTAU := 5.0
|
ApplyServingDefaults(cfg)
|
||||||
defaultMirostatETA := 0.1
|
|
||||||
defaultTypicalP := 1.0
|
// Generic fallback defaults (sampling params + runtime flags), applied after
|
||||||
defaultTFZ := 1.0
|
// the model-family / hardware / serving tiers above. Only fills unset values.
|
||||||
defaultZero := 0
|
ApplyGenericDefaults(cfg)
|
||||||
|
|
||||||
trueV := true
|
trueV := true
|
||||||
falseV := false
|
falseV := false
|
||||||
|
|
||||||
if cfg.Seed == nil {
|
|
||||||
// random number generator seed
|
|
||||||
defaultSeed := RAND_SEED
|
|
||||||
cfg.Seed = &defaultSeed
|
|
||||||
}
|
|
||||||
|
|
||||||
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
|
|
||||||
// native default differs (issue #6632). Only inject it for the llama.cpp
|
|
||||||
// family and the empty/auto backend; leave TopK nil for known non-llama
|
|
||||||
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
|
|
||||||
// is 0 rather than a silently-changed 40.
|
|
||||||
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
|
|
||||||
cfg.TopK = &defaultTopK
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.MinP == nil {
|
|
||||||
cfg.MinP = &defaultMinP
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.TypicalP == nil {
|
|
||||||
cfg.TypicalP = &defaultTypicalP
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.TFZ == nil {
|
|
||||||
cfg.TFZ = &defaultTFZ
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.MMap == nil {
|
|
||||||
// MMap is enabled by default
|
|
||||||
|
|
||||||
// Only exception is for Intel GPUs
|
|
||||||
if os.Getenv("XPU") != "" {
|
|
||||||
cfg.MMap = &falseV
|
|
||||||
} else {
|
|
||||||
cfg.MMap = &trueV
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.MMlock == nil {
|
|
||||||
// MMlock is disabled by default
|
|
||||||
cfg.MMlock = &falseV
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.TopP == nil {
|
|
||||||
cfg.TopP = &defaultTopP
|
|
||||||
}
|
|
||||||
if cfg.Temperature == nil {
|
|
||||||
cfg.Temperature = &defaultTemp
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.Maxtokens == nil {
|
|
||||||
cfg.Maxtokens = &defaultZero
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.Mirostat == nil {
|
|
||||||
cfg.Mirostat = &defaultMirostat
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.MirostatETA == nil {
|
|
||||||
cfg.MirostatETA = &defaultMirostatETA
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.MirostatTAU == nil {
|
|
||||||
cfg.MirostatTAU = &defaultMirostatTAU
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.LowVRAM == nil {
|
|
||||||
cfg.LowVRAM = &falseV
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.Embeddings == nil {
|
|
||||||
cfg.Embeddings = &falseV
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.Reranking == nil {
|
|
||||||
cfg.Reranking = &falseV
|
|
||||||
}
|
|
||||||
|
|
||||||
if cfg.PromptCacheAll == nil {
|
|
||||||
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
|
||||||
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
|
||||||
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
|
||||||
cfg.PromptCacheAll = &trueV
|
|
||||||
}
|
|
||||||
|
|
||||||
if threads == 0 {
|
if threads == 0 {
|
||||||
// Threads can't be 0
|
// Threads can't be 0
|
||||||
threads = 4
|
threads = 4
|
||||||
@@ -1243,6 +1251,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (c *ModelConfig) Validate() (bool, error) {
|
func (c *ModelConfig) Validate() (bool, error) {
|
||||||
|
// An alias is a pure redirect: validate only its own shape here. Target
|
||||||
|
// existence and the no-chain rule need the full config set, so the loader
|
||||||
|
// (load-time) and the create/swap endpoints enforce those.
|
||||||
|
if c.IsAlias() {
|
||||||
|
if c.Name == "" {
|
||||||
|
return false, fmt.Errorf("alias config requires a name")
|
||||||
|
}
|
||||||
|
if c.Alias == c.Name {
|
||||||
|
return false, fmt.Errorf("alias %q cannot point to itself", c.Name)
|
||||||
|
}
|
||||||
|
if c.Backend != "" || c.Model != "" {
|
||||||
|
return false, fmt.Errorf("alias config %q must not set backend or parameters.model: an alias is a pure redirect", c.Name)
|
||||||
|
}
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
||||||
downloadedFileNames := []string{}
|
downloadedFileNames := []string{}
|
||||||
for _, f := range c.DownloadFiles {
|
for _, f := range c.DownloadFiles {
|
||||||
downloadedFileNames = append(downloadedFileNames, f.Filename)
|
downloadedFileNames = append(downloadedFileNames, f.Filename)
|
||||||
@@ -1463,6 +1487,11 @@ const (
|
|||||||
// so it may combine freely with other usecases.
|
// so it may combine freely with other usecases.
|
||||||
FLAG_TOKEN_CLASSIFY ModelConfigUsecase = 0b1000000000000000000000
|
FLAG_TOKEN_CLASSIFY ModelConfigUsecase = 0b1000000000000000000000
|
||||||
|
|
||||||
|
// Marks a model as wired for the SoundDetection gRPC primitive
|
||||||
|
// (audio tagging / sound-event classification — scored AudioSet
|
||||||
|
// labels via the SoundDetection RPC, e.g. ced).
|
||||||
|
FLAG_SOUND_CLASSIFICATION ModelConfigUsecase = 0b10000000000000000000000
|
||||||
|
|
||||||
// Common Subsets
|
// Common Subsets
|
||||||
FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
|
FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
|
||||||
)
|
)
|
||||||
@@ -1471,12 +1500,12 @@ const (
|
|||||||
// Flags within the same group are NOT orthogonal (e.g., chat and completion are
|
// Flags within the same group are NOT orthogonal (e.g., chat and completion are
|
||||||
// both text/language). A model is multimodal when its usecases span 2+ groups.
|
// both text/language). A model is multimodal when its usecases span 2+ groups.
|
||||||
var ModalityGroups = []ModelConfigUsecase{
|
var ModalityGroups = []ModelConfigUsecase{
|
||||||
FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT, // text/language
|
FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT, // text/language
|
||||||
FLAG_VISION | FLAG_DETECTION, // visual understanding
|
FLAG_VISION | FLAG_DETECTION, // visual understanding
|
||||||
FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO, // speech input — realtime_audio is any-to-any, so it counts here too
|
FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO | FLAG_SOUND_CLASSIFICATION, // audio input — realtime_audio is any-to-any, so it counts here too
|
||||||
FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
|
FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
|
||||||
FLAG_AUDIO_TRANSFORM, // audio in/out transforms
|
FLAG_AUDIO_TRANSFORM, // audio in/out transforms
|
||||||
FLAG_IMAGE | FLAG_VIDEO, // visual generation
|
FLAG_IMAGE | FLAG_VIDEO, // visual generation
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsMultimodal returns true if the given usecases span two or more orthogonal
|
// IsMultimodal returns true if the given usecases span two or more orthogonal
|
||||||
@@ -1499,29 +1528,30 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
|
|||||||
return map[string]ModelConfigUsecase{
|
return map[string]ModelConfigUsecase{
|
||||||
// Note: FLAG_ANY is intentionally excluded from this map
|
// Note: FLAG_ANY is intentionally excluded from this map
|
||||||
// because it's 0 and would always match in HasUsecases checks
|
// because it's 0 and would always match in HasUsecases checks
|
||||||
"FLAG_CHAT": FLAG_CHAT,
|
"FLAG_CHAT": FLAG_CHAT,
|
||||||
"FLAG_COMPLETION": FLAG_COMPLETION,
|
"FLAG_COMPLETION": FLAG_COMPLETION,
|
||||||
"FLAG_EDIT": FLAG_EDIT,
|
"FLAG_EDIT": FLAG_EDIT,
|
||||||
"FLAG_EMBEDDINGS": FLAG_EMBEDDINGS,
|
"FLAG_EMBEDDINGS": FLAG_EMBEDDINGS,
|
||||||
"FLAG_RERANK": FLAG_RERANK,
|
"FLAG_RERANK": FLAG_RERANK,
|
||||||
"FLAG_IMAGE": FLAG_IMAGE,
|
"FLAG_IMAGE": FLAG_IMAGE,
|
||||||
"FLAG_TRANSCRIPT": FLAG_TRANSCRIPT,
|
"FLAG_TRANSCRIPT": FLAG_TRANSCRIPT,
|
||||||
"FLAG_TTS": FLAG_TTS,
|
"FLAG_TTS": FLAG_TTS,
|
||||||
"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
|
"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
|
||||||
"FLAG_TOKENIZE": FLAG_TOKENIZE,
|
"FLAG_TOKENIZE": FLAG_TOKENIZE,
|
||||||
"FLAG_VAD": FLAG_VAD,
|
"FLAG_VAD": FLAG_VAD,
|
||||||
"FLAG_LLM": FLAG_LLM,
|
"FLAG_LLM": FLAG_LLM,
|
||||||
"FLAG_VIDEO": FLAG_VIDEO,
|
"FLAG_VIDEO": FLAG_VIDEO,
|
||||||
"FLAG_DETECTION": FLAG_DETECTION,
|
"FLAG_DETECTION": FLAG_DETECTION,
|
||||||
"FLAG_VISION": FLAG_VISION,
|
"FLAG_VISION": FLAG_VISION,
|
||||||
"FLAG_FACE_RECOGNITION": FLAG_FACE_RECOGNITION,
|
"FLAG_FACE_RECOGNITION": FLAG_FACE_RECOGNITION,
|
||||||
"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
|
"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
|
||||||
"FLAG_AUDIO_TRANSFORM": FLAG_AUDIO_TRANSFORM,
|
"FLAG_AUDIO_TRANSFORM": FLAG_AUDIO_TRANSFORM,
|
||||||
"FLAG_DIARIZATION": FLAG_DIARIZATION,
|
"FLAG_DIARIZATION": FLAG_DIARIZATION,
|
||||||
"FLAG_REALTIME_AUDIO": FLAG_REALTIME_AUDIO,
|
"FLAG_SOUND_CLASSIFICATION": FLAG_SOUND_CLASSIFICATION,
|
||||||
"FLAG_SCORE": FLAG_SCORE,
|
"FLAG_REALTIME_AUDIO": FLAG_REALTIME_AUDIO,
|
||||||
"FLAG_DEPTH": FLAG_DEPTH,
|
"FLAG_SCORE": FLAG_SCORE,
|
||||||
"FLAG_TOKEN_CLASSIFY": FLAG_TOKEN_CLASSIFY,
|
"FLAG_DEPTH": FLAG_DEPTH,
|
||||||
|
"FLAG_TOKEN_CLASSIFY": FLAG_TOKEN_CLASSIFY,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1724,6 +1754,16 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (u & FLAG_SOUND_CLASSIFICATION) == FLAG_SOUND_CLASSIFICATION {
|
||||||
|
// ced is a sound-event tagger (AudioSet labels) surfaced via the
|
||||||
|
// SoundDetection gRPC. Models without an explicit known_usecases
|
||||||
|
// still surface when they run on one of these backends.
|
||||||
|
soundClassificationBackends := []string{"ced"}
|
||||||
|
if !slices.Contains(soundClassificationBackends, c.Backend) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (u & FLAG_REALTIME_AUDIO) == FLAG_REALTIME_AUDIO {
|
if (u & FLAG_REALTIME_AUDIO) == FLAG_REALTIME_AUDIO {
|
||||||
// Backends that own a single any-to-any loop and implement
|
// Backends that own a single any-to-any loop and implement
|
||||||
// AudioToAudioStream — listed here so models without an explicit
|
// AudioToAudioStream — listed here so models without an explicit
|
||||||
|
|||||||
@@ -294,6 +294,44 @@ func (bcl *ModelConfigLoader) UpdateModelConfig(m string, updater func(*ModelCon
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ResolveAlias follows a one-hop alias to its target config. Returns
|
||||||
|
// (resolved, wasAlias, err). Non-alias configs return (cfg, false, nil)
|
||||||
|
// unchanged. Strict: the target must exist and must not itself be an alias
|
||||||
|
// (chains are rejected). The returned config is a copy of the target.
|
||||||
|
func (bcl *ModelConfigLoader) ResolveAlias(cfg *ModelConfig) (*ModelConfig, bool, error) {
|
||||||
|
if cfg == nil || !cfg.IsAlias() {
|
||||||
|
return cfg, false, nil
|
||||||
|
}
|
||||||
|
target, exists := bcl.GetModelConfig(cfg.Alias)
|
||||||
|
if !exists {
|
||||||
|
return nil, true, fmt.Errorf("alias %q points to unknown model %q", cfg.Name, cfg.Alias)
|
||||||
|
}
|
||||||
|
if target.IsAlias() {
|
||||||
|
return nil, true, fmt.Errorf("alias %q points to another alias %q (chains are not allowed)", cfg.Name, cfg.Alias)
|
||||||
|
}
|
||||||
|
return &target, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ValidateAliasTarget checks an alias config's target at create/swap time:
|
||||||
|
// the target must exist, must not be an alias, and must not be disabled.
|
||||||
|
// Returns nil for non-alias configs.
|
||||||
|
func (bcl *ModelConfigLoader) ValidateAliasTarget(cfg *ModelConfig) error {
|
||||||
|
if cfg == nil || !cfg.IsAlias() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
target, exists := bcl.GetModelConfig(cfg.Alias)
|
||||||
|
if !exists {
|
||||||
|
return fmt.Errorf("alias target %q does not exist", cfg.Alias)
|
||||||
|
}
|
||||||
|
if target.IsAlias() {
|
||||||
|
return fmt.Errorf("alias target %q is itself an alias (chains are not allowed)", cfg.Alias)
|
||||||
|
}
|
||||||
|
if target.IsDisabled() {
|
||||||
|
return fmt.Errorf("alias target %q is disabled", cfg.Alias)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Preload prepare models if they are not local but url or huggingface repositories
|
// Preload prepare models if they are not local but url or huggingface repositories
|
||||||
func (bcl *ModelConfigLoader) Preload(modelPath string) error {
|
func (bcl *ModelConfigLoader) Preload(modelPath string) error {
|
||||||
bcl.Lock()
|
bcl.Lock()
|
||||||
@@ -475,5 +513,21 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Surface aliases whose targets are missing or themselves aliases. These
|
||||||
|
// resolve to a clear request-time error; warning here gives operators
|
||||||
|
// visibility without failing startup.
|
||||||
|
for name, c := range bcl.configs {
|
||||||
|
if !c.IsAlias() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
target, ok := bcl.configs[c.Alias]
|
||||||
|
switch {
|
||||||
|
case !ok:
|
||||||
|
xlog.Warn("alias points to unknown model", "alias", name, "target", c.Alias)
|
||||||
|
case target.IsAlias():
|
||||||
|
xlog.Warn("alias points to another alias (chains are not allowed)", "alias", name, "target", c.Alias)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -61,3 +61,51 @@ var _ = Describe("ModelConfigLoader.GetModelsConflictingWith", func() {
|
|||||||
Expect(bcl.GetModelsConflictingWith("a")).To(ConsistOf("b"))
|
Expect(bcl.GetModelsConflictingWith("a")).To(ConsistOf("b"))
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
var _ = Describe("ModelConfigLoader alias resolution", func() {
|
||||||
|
var loader *ModelConfigLoader
|
||||||
|
|
||||||
|
BeforeEach(func() {
|
||||||
|
loader = NewModelConfigLoader("")
|
||||||
|
loader.configs["real"] = ModelConfig{Name: "real", Backend: "llama-cpp"}
|
||||||
|
loader.configs["gpt-4"] = ModelConfig{Name: "gpt-4", Alias: "real"}
|
||||||
|
loader.configs["chain"] = ModelConfig{Name: "chain", Alias: "gpt-4"}
|
||||||
|
loader.configs["dangling"] = ModelConfig{Name: "dangling", Alias: "nope"}
|
||||||
|
})
|
||||||
|
|
||||||
|
It("returns non-alias configs unchanged", func() {
|
||||||
|
cfg := loader.configs["real"]
|
||||||
|
got, was, err := loader.ResolveAlias(&cfg)
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(was).To(BeFalse())
|
||||||
|
Expect(got.Name).To(Equal("real"))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("resolves an alias to its target", func() {
|
||||||
|
cfg := loader.configs["gpt-4"]
|
||||||
|
got, was, err := loader.ResolveAlias(&cfg)
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(was).To(BeTrue())
|
||||||
|
Expect(got.Name).To(Equal("real"))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("rejects an alias chain", func() {
|
||||||
|
cfg := loader.configs["chain"]
|
||||||
|
_, was, err := loader.ResolveAlias(&cfg)
|
||||||
|
Expect(was).To(BeTrue())
|
||||||
|
Expect(err).To(MatchError(ContainSubstring("chains are not allowed")))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("rejects a dangling alias", func() {
|
||||||
|
cfg := loader.configs["dangling"]
|
||||||
|
_, _, err := loader.ResolveAlias(&cfg)
|
||||||
|
Expect(err).To(MatchError(ContainSubstring("unknown model")))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("ValidateAliasTarget passes for a real target and fails for a chain", func() {
|
||||||
|
good := loader.configs["gpt-4"]
|
||||||
|
Expect(loader.ValidateAliasTarget(&good)).ToNot(HaveOccurred())
|
||||||
|
bad := loader.configs["chain"]
|
||||||
|
Expect(loader.ValidateAliasTarget(&bad)).To(MatchError(ContainSubstring("itself an alias")))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|||||||
@@ -787,3 +787,32 @@ var _ = Describe("pattern detector config", func() {
|
|||||||
Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\"")))
|
Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\"")))
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
var _ = Describe("ModelConfig alias", func() {
|
||||||
|
It("reports IsAlias when alias is set", func() {
|
||||||
|
c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
|
||||||
|
Expect(c.IsAlias()).To(BeTrue())
|
||||||
|
Expect(ModelConfig{Name: "real"}.IsAlias()).To(BeFalse())
|
||||||
|
})
|
||||||
|
|
||||||
|
It("validates a minimal alias config", func() {
|
||||||
|
c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
|
||||||
|
ok, err := c.Validate()
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(ok).To(BeTrue())
|
||||||
|
})
|
||||||
|
|
||||||
|
It("rejects an alias pointing to itself", func() {
|
||||||
|
c := ModelConfig{Name: "loop", Alias: "loop"}
|
||||||
|
ok, err := c.Validate()
|
||||||
|
Expect(ok).To(BeFalse())
|
||||||
|
Expect(err).To(MatchError(ContainSubstring("itself")))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("rejects an alias that also sets a backend", func() {
|
||||||
|
c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3", Backend: "llama-cpp"}
|
||||||
|
ok, err := c.Validate()
|
||||||
|
Expect(ok).To(BeFalse())
|
||||||
|
Expect(err).To(MatchError(ContainSubstring("pure redirect")))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ type RuntimeSettings struct {
|
|||||||
|
|
||||||
// Eviction settings
|
// Eviction settings
|
||||||
ForceEvictionWhenBusy *bool `json:"force_eviction_when_busy,omitempty"` // Force eviction even when models have active API calls (default: false for safety)
|
ForceEvictionWhenBusy *bool `json:"force_eviction_when_busy,omitempty"` // Force eviction even when models have active API calls (default: false for safety)
|
||||||
|
SizeAwareEviction *bool `json:"size_aware_eviction,omitempty"` // Evict largest models first rather than least-recently-used (default: false)
|
||||||
LRUEvictionMaxRetries *int `json:"lru_eviction_max_retries,omitempty"` // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
LRUEvictionMaxRetries *int `json:"lru_eviction_max_retries,omitempty"` // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
||||||
LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
|
LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"reflect"
|
||||||
)
|
)
|
||||||
|
|
||||||
// runtimeSettingsFile is the on-disk filename inside DynamicConfigsDir.
|
// runtimeSettingsFile is the on-disk filename inside DynamicConfigsDir.
|
||||||
@@ -33,6 +34,35 @@ func (o *ApplicationConfig) ReadPersistedSettings() (RuntimeSettings, error) {
|
|||||||
return settings, nil
|
return settings, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MergeNonNil overlays every set (non-nil) field of overlay onto the
|
||||||
|
// receiver, leaving the receiver's value untouched wherever overlay left a
|
||||||
|
// field unset. Every RuntimeSettings field is a pointer precisely so "set"
|
||||||
|
// can be told apart from "absent" (see the type doc), which makes this a
|
||||||
|
// faithful partial update: a caller that submits only the field it owns
|
||||||
|
// changes exactly that field and never clobbers unrelated settings.
|
||||||
|
//
|
||||||
|
// This is the read-modify-write contract the persistence helpers exist for.
|
||||||
|
// UpdateSettingsEndpoint reads the on-disk settings, merges the request body
|
||||||
|
// on top, and writes the result — so a focused admin page that POSTs only its
|
||||||
|
// own field (the Middleware page sends only mitm_listen; the detector table
|
||||||
|
// only pii_default_detectors) no longer nulls every other setting.
|
||||||
|
//
|
||||||
|
// Reflection keeps the merge total over the struct: a field added to
|
||||||
|
// RuntimeSettings later is merged automatically, so the persistence path can
|
||||||
|
// never silently drop a new setting the way a hand-maintained field list
|
||||||
|
// would. Non-pointer fields (none today) are skipped — they cannot express
|
||||||
|
// "absent", so the receiver wins.
|
||||||
|
func (s *RuntimeSettings) MergeNonNil(overlay RuntimeSettings) {
|
||||||
|
dst := reflect.ValueOf(s).Elem()
|
||||||
|
src := reflect.ValueOf(overlay)
|
||||||
|
for i := 0; i < src.NumField(); i++ {
|
||||||
|
f := src.Field(i)
|
||||||
|
if f.Kind() == reflect.Pointer && !f.IsNil() {
|
||||||
|
dst.Field(i).Set(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// WritePersistedSettings serialises the given RuntimeSettings to
|
// WritePersistedSettings serialises the given RuntimeSettings to
|
||||||
// runtime_settings.json with restricted permissions (it may carry API
|
// runtime_settings.json with restricted permissions (it may carry API
|
||||||
// keys and P2P tokens).
|
// keys and P2P tokens).
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func strPtr(s string) *string { return &s }
|
func strPtr(s string) *string { return &s }
|
||||||
|
// boolPtr returns a pointer to a fresh bool holding b, so optional *bool
// fields can be populated inline in a composite literal.
func boolPtr(b bool) *bool {
	p := new(bool)
	*p = b
	return p
}
|
||||||
|
|
||||||
var _ = Describe("RuntimeSettings persistence helpers", func() {
|
var _ = Describe("RuntimeSettings persistence helpers", func() {
|
||||||
var (
|
var (
|
||||||
@@ -51,6 +52,47 @@ var _ = Describe("RuntimeSettings persistence helpers", func() {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// MergeNonNil is the partial-update primitive UpdateSettingsEndpoint
|
||||||
|
// relies on: a focused admin page POSTs only the field it owns, and the
|
||||||
|
// handler reads the on-disk settings and overlays the request on top.
|
||||||
|
// Without it, the body would be written verbatim and every field the
|
||||||
|
// caller omitted would be nulled (the reported regression: changing
|
||||||
|
// mitm_listen wiped the galleries, api keys, watchdog config, etc.).
|
||||||
|
Describe("MergeNonNil partial update", func() {
|
||||||
|
It("overlays set fields and preserves unset ones", func() {
|
||||||
|
base := config.RuntimeSettings{
|
||||||
|
MITMListen: strPtr(":9000"),
|
||||||
|
Galleries: &[]config.Gallery{{Name: "g1", URL: "http://example/g1"}},
|
||||||
|
WatchdogIdleEnabled: boolPtr(true),
|
||||||
|
ApiKeys: &[]string{"persisted-key"},
|
||||||
|
PIIDefaultDetectors: &[]string{"det-a"},
|
||||||
|
}
|
||||||
|
|
||||||
|
// Simulate the Middleware proxy tab: only mitm_listen is sent.
|
||||||
|
overlay := config.RuntimeSettings{MITMListen: strPtr(":8443")}
|
||||||
|
base.MergeNonNil(overlay)
|
||||||
|
|
||||||
|
Expect(base.MITMListen).ToNot(BeNil())
|
||||||
|
Expect(*base.MITMListen).To(Equal(":8443"), "set field should be overlaid")
|
||||||
|
// Everything the overlay left unset must survive untouched.
|
||||||
|
Expect(base.Galleries).ToNot(BeNil(), "galleries were clobbered")
|
||||||
|
Expect(*base.Galleries).To(HaveLen(1))
|
||||||
|
Expect(base.WatchdogIdleEnabled).ToNot(BeNil())
|
||||||
|
Expect(*base.WatchdogIdleEnabled).To(BeTrue())
|
||||||
|
Expect(base.ApiKeys).ToNot(BeNil(), "api_keys were clobbered")
|
||||||
|
Expect(*base.ApiKeys).To(Equal([]string{"persisted-key"}))
|
||||||
|
Expect(base.PIIDefaultDetectors).ToNot(BeNil(), "pii_default_detectors were clobbered")
|
||||||
|
Expect(*base.PIIDefaultDetectors).To(Equal([]string{"det-a"}))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("lets an explicit empty slice clear a field", func() {
|
||||||
|
base := config.RuntimeSettings{PIIDefaultDetectors: &[]string{"det-a"}}
|
||||||
|
base.MergeNonNil(config.RuntimeSettings{PIIDefaultDetectors: &[]string{}})
|
||||||
|
Expect(base.PIIDefaultDetectors).ToNot(BeNil())
|
||||||
|
Expect(*base.PIIDefaultDetectors).To(BeEmpty(), "an explicit empty slice should clear, not preserve")
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
// MITM round trip pins the contract that loadRuntimeSettingsFromFile
|
// MITM round trip pins the contract that loadRuntimeSettingsFromFile
|
||||||
// MITM listener address must survive a write/read round trip so the
|
// MITM listener address must survive a write/read round trip so the
|
||||||
// next process restart can bring the listener back up. (Intercept
|
// next process restart can bring the listener back up. (Intercept
|
||||||
|
|||||||
56
core/config/serving_defaults.go
Normal file
56
core/config/serving_defaults.go
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/mudler/xlog"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Serving-policy model-config defaults.
|
||||||
|
//
|
||||||
|
// Sibling to hardware_defaults.go: those fill values driven by the target
|
||||||
|
// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
|
||||||
|
// that improve multi-request / multi-user *serving* regardless of the GPU. They
|
||||||
|
// run together from SetDefaults and only ever fill values the user left unset.
|
||||||
|
|
||||||
|
// DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend
|
||||||
|
// reuses across requests via KV-cache shifting. The llama.cpp backend ships this
|
||||||
|
// disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system
|
||||||
|
// prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed.
|
||||||
|
// This is the universally-useful part of "paged attention" (cross-request prefix
|
||||||
|
// sharing) and needs none of the block-KV machinery.
|
||||||
|
const DefaultCacheReuse = 256
|
||||||
|
|
||||||
|
// ApplyServingDefaults fills serving-policy ModelConfig values the user left
|
||||||
|
// unset. Currently: enable cross-request prefix caching. Explicit
|
||||||
|
// cache_reuse/n_cache_reuse in the model options always wins.
|
||||||
|
func ApplyServingDefaults(cfg *ModelConfig) {
|
||||||
|
if cfg == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
|
||||||
|
cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse))
|
||||||
|
xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
|
||||||
|
"cache_reuse", DefaultCacheReuse)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// backendOptionSet reports whether the backend options already set any of names.
//
// Each option is a "name:value" string or a bare "name". Only the part before
// the first ':' is considered; it is lower-cased and stripped of surrounding
// whitespace before being compared. The entries of names are compared verbatim,
// so callers must pass them already lower-case and trimmed.
//
// Used so defaults never override an explicit value. Shared with
// hardware_defaults.go.
func backendOptionSet(opts []string, names ...string) bool {
	for _, opt := range opts {
		// Cut yields the whole option when there is no ':' (a bare name).
		key, _, _ := strings.Cut(opt, ":")
		key = strings.TrimSpace(strings.ToLower(key))
		for _, want := range names {
			if key == want {
				return true
			}
		}
	}
	return false
}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user