mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 08:08:52 -04:00
Compare commits
53 Commits
worktree-f
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
961eb8ce14 | ||
|
|
10184b5e28 | ||
|
|
fdf475ec5f | ||
|
|
9d54a599b0 | ||
|
|
63bcbf6c12 | ||
|
|
95b058e1c5 | ||
|
|
f2abcc7503 | ||
|
|
62c99c10b3 | ||
|
|
7226bb9f30 | ||
|
|
569d9bbd9e | ||
|
|
682fb2718c | ||
|
|
20c643e1f6 | ||
|
|
64a4351f3a | ||
|
|
b7d67f5779 | ||
|
|
600dafd20b | ||
|
|
ce8a3e9266 | ||
|
|
a88d9d2de3 | ||
|
|
1cf1bf32e1 | ||
|
|
f45c6acc54 | ||
|
|
1a1bd57469 | ||
|
|
1f29e96030 | ||
|
|
64560a974b | ||
|
|
32c47706ae | ||
|
|
e58870a573 | ||
|
|
8fab1d2e45 | ||
|
|
7b462a0d51 | ||
|
|
aed181e6c1 | ||
|
|
a556cd9afc | ||
|
|
b50b1fe418 | ||
|
|
b4c0dc67fe | ||
|
|
01fa12e0de | ||
|
|
cf7f9573a2 | ||
|
|
c6303104c7 | ||
|
|
3e96d811b7 | ||
|
|
23f225260c | ||
|
|
aef10723c9 | ||
|
|
9565db5f94 | ||
|
|
e19c43cf04 | ||
|
|
b081247d95 | ||
|
|
1be959ce30 | ||
|
|
518381278e | ||
|
|
93706fec57 | ||
|
|
11aee03a80 | ||
|
|
8915f2ab91 | ||
|
|
f143d7f688 | ||
|
|
dd928f0bdd | ||
|
|
c43a752afc | ||
|
|
079ac0e15a | ||
|
|
2e734bf560 | ||
|
|
72d46c1115 | ||
|
|
606128e4e9 | ||
|
|
59c7ad5153 | ||
|
|
78d682224a |
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
|
||||
- If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
|
||||
- Check similar backends to determine the correct context
|
||||
|
||||
## Documenting the backend (README + docs)
|
||||
|
||||
A backend is not "added" until it is discoverable. Update the user-facing docs:
|
||||
|
||||
- **`docs/content/features/backends.md`** - add the backend to the right
|
||||
category in the "LocalAI supports various types of backends" list (and add a
|
||||
new category if it introduces a new modality, e.g. sound classification).
|
||||
- If the backend introduces a **new API surface** (a new endpoint or a realtime
|
||||
capability), document it under `docs/content/` where its area lives (audio,
|
||||
vision, etc.) and follow the api-endpoints checklist in
|
||||
[api-endpoints-and-auth.md](api-endpoints-and-auth.md).
|
||||
|
||||
**If the backend is a native C/C++/GGML engine created and maintained by the
|
||||
LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
|
||||
`vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
|
||||
ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
|
||||
engines ... developed and maintained by the LocalAI project itself". Add a row
|
||||
linking the upstream engine repo with a one-line description. This is the
|
||||
project's showcase of its own engines; a new in-house backend that is missing
|
||||
from it is a documentation bug.
|
||||
|
||||
## 5. Verification Checklist
|
||||
|
||||
After adding a new backend, verify:
|
||||
@@ -211,6 +232,8 @@ After adding a new backend, verify:
|
||||
- [ ] No YAML syntax errors (check with linter)
|
||||
- [ ] No Makefile syntax errors (check with linter)
|
||||
- [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
|
||||
- [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
|
||||
- [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`
|
||||
|
||||
## Bundling runtime shared libraries (`package.sh`)
|
||||
|
||||
|
||||
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
|
||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
|
||||
# manifests. The LunarG SDK below only provides the loader and shader
|
||||
# tooling, not hardware drivers — without Mesa the packaged Vulkan backend
|
||||
# would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
|
||||
# .so files plus their deps into the backend so it stays self-contained.
|
||||
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||
if [ "amd64" = "${TARGETARCH:-}" ]; then
|
||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
|
||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
|
||||
|
||||
152
.github/backend-matrix.yml
vendored
152
.github/backend-matrix.yml
vendored
@@ -3575,6 +3575,154 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# ced
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-12-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-13-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
skip-drivers: 'false'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
|
||||
base-image: "ubuntu:24.04"
|
||||
ubuntu-version: '2404'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-ced'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f32'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f32-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f16'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-intel-sycl-f16-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-ced'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'vulkan'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-vulkan-ced'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
skip-drivers: 'false'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-arm64-ced'
|
||||
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2204'
|
||||
- build-type: 'hipblas'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-rocm-hipblas-ced'
|
||||
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||
runs-on: 'ubuntu-latest'
|
||||
skip-drivers: 'false'
|
||||
backend: "ced"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# acestep-cpp
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
@@ -4754,6 +4902,10 @@ includeDarwin:
|
||||
tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "ced"
|
||||
tag-suffix: "-metal-darwin-arm64-ced"
|
||||
build-type: "metal"
|
||||
lang: "go"
|
||||
- backend: "acestep-cpp"
|
||||
tag-suffix: "-metal-darwin-arm64-acestep-cpp"
|
||||
build-type: "metal"
|
||||
|
||||
2
.github/workflows/backend.yml
vendored
2
.github/workflows/backend.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
||||
has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Setup Bun
|
||||
uses: oven-sh/setup-bun@v2
|
||||
|
||||
2
.github/workflows/backend_build.yml
vendored
2
.github/workflows/backend_build.yml
vendored
@@ -101,7 +101,7 @@ jobs:
|
||||
steps:
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
|
||||
2
.github/workflows/backend_build_darwin.yml
vendored
2
.github/workflows/backend_build_darwin.yml
vendored
@@ -57,7 +57,7 @@ jobs:
|
||||
HOMEBREW_NO_ANALYTICS: '1'
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
|
||||
2
.github/workflows/backend_merge.yml
vendored
2
.github/workflows/backend_merge.yml
vendored
@@ -49,7 +49,7 @@ jobs:
|
||||
# Sparse checkout: the merge job needs `.github/scripts/` (for the
|
||||
# keepalive cleanup script) but none of the source tree.
|
||||
- name: Checkout (.github/scripts only)
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
sparse-checkout: |
|
||||
.github/scripts
|
||||
|
||||
2
.github/workflows/backend_pr.yml
vendored
2
.github/workflows/backend_pr.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
has-merges-singlearch: ${{ steps.set-matrix.outputs['has-merges-singlearch'] }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Setup Bun
|
||||
uses: oven-sh/setup-bun@v2
|
||||
|
||||
2
.github/workflows/base-images.yml
vendored
2
.github/workflows/base-images.yml
vendored
@@ -127,7 +127,7 @@ jobs:
|
||||
# the original l4t matrix entry which set skip-drivers: 'true'.
|
||||
skip-drivers: 'true'
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: false
|
||||
- name: Free disk space
|
||||
|
||||
6
.github/workflows/build-test.yaml
vendored
6
.github/workflows/build-test.yaml
vendored
@@ -11,7 +11,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Go
|
||||
@@ -25,7 +25,7 @@ jobs:
|
||||
runs-on: macos-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Go
|
||||
@@ -47,7 +47,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Configure apt mirror on runner
|
||||
|
||||
@@ -14,7 +14,7 @@ jobs:
|
||||
bump:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
|
||||
8
.github/workflows/bump_deps.yaml
vendored
8
.github/workflows/bump_deps.yaml
vendored
@@ -42,6 +42,10 @@ jobs:
|
||||
variable: "PARAKEET_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/go/parakeet-cpp/Makefile"
|
||||
- repository: "mudler/ced.cpp"
|
||||
variable: "CED_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/go/ced/Makefile"
|
||||
- repository: "mudler/depth-anything.cpp"
|
||||
variable: "DEPTHANYTHING_VERSION"
|
||||
branch: "master"
|
||||
@@ -88,7 +92,7 @@ jobs:
|
||||
file: "backend/go/vibevoice-cpp/Makefile"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Bump dependencies 🔧
|
||||
id: bump
|
||||
run: |
|
||||
@@ -124,7 +128,7 @@ jobs:
|
||||
if: github.repository == 'mudler/LocalAI'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Bump vLLM cu130 wheel pin 🔧
|
||||
id: bump
|
||||
run: |
|
||||
|
||||
2
.github/workflows/bump_docs.yaml
vendored
2
.github/workflows/bump_docs.yaml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
- repository: "mudler/LocalAI"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Bump dependencies 🔧
|
||||
run: |
|
||||
bash .github/bump_docs.sh ${{ matrix.repository }}
|
||||
|
||||
2
.github/workflows/checksum_checker.yaml
vendored
2
.github/workflows/checksum_checker.yaml
vendored
@@ -8,7 +8,7 @@ jobs:
|
||||
if: github.repository == 'mudler/LocalAI'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Configure apt mirror on runner
|
||||
uses: ./.github/actions/configure-apt-mirror
|
||||
- name: Install dependencies
|
||||
|
||||
2
.github/workflows/deploy-explorer.yaml
vendored
2
.github/workflows/deploy-explorer.yaml
vendored
@@ -16,7 +16,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- uses: actions/setup-go@v5
|
||||
|
||||
2
.github/workflows/gallery-agent.yaml
vendored
2
.github/workflows/gallery-agent.yaml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
2
.github/workflows/generate_intel_image.yaml
vendored
2
.github/workflows/generate_intel_image.yaml
vendored
@@ -44,7 +44,7 @@ jobs:
|
||||
uses: docker/setup-buildx-action@master
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Cache Intel images
|
||||
uses: docker/build-push-action@v7
|
||||
|
||||
2
.github/workflows/gh-pages.yml
vendored
2
.github/workflows/gh-pages.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
HUGO_VERSION: "0.146.3"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0 # needed for enableGitInfo
|
||||
submodules: true
|
||||
|
||||
2
.github/workflows/image_build.yml
vendored
2
.github/workflows/image_build.yml
vendored
@@ -80,7 +80,7 @@ jobs:
|
||||
steps:
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
|
||||
- name: Configure apt mirror on runner
|
||||
id: apt_mirror
|
||||
|
||||
2
.github/workflows/image_merge.yml
vendored
2
.github/workflows/image_merge.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
# Sparse checkout: needed for .github/scripts/ (the keepalive cleanup
|
||||
# script). Skips the rest of the source tree.
|
||||
- name: Checkout (.github/scripts only)
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
sparse-checkout: |
|
||||
.github/scripts
|
||||
|
||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -20,7 +20,7 @@ jobs:
|
||||
golangci-lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
with:
|
||||
# Full history so golangci-lint's new-from-merge-base can reach
|
||||
# origin/master and compute the diff against it.
|
||||
|
||||
6
.github/workflows/release.yaml
vendored
6
.github/workflows/release.yaml
vendored
@@ -10,7 +10,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Go
|
||||
@@ -28,7 +28,7 @@ jobs:
|
||||
runs-on: macos-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Set up Go
|
||||
@@ -46,7 +46,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Configure apt mirror on runner
|
||||
|
||||
2
.github/workflows/secscan.yaml
vendored
2
.github/workflows/secscan.yaml
vendored
@@ -14,7 +14,7 @@ jobs:
|
||||
GO111MODULE: on
|
||||
steps:
|
||||
- name: Checkout Source
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
- name: Run Gosec Security Scanner
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
|
||||
86
.github/workflows/test-extra.yml
vendored
86
.github/workflows/test-extra.yml
vendored
@@ -50,7 +50,7 @@ jobs:
|
||||
parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
- name: Setup Bun
|
||||
uses: oven-sh/setup-bun@v2
|
||||
- name: Install dependencies
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -113,7 +113,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -158,7 +158,7 @@ jobs:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -178,7 +178,7 @@ jobs:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -240,7 +240,7 @@ jobs:
|
||||
# sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
|
||||
# df -h
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -265,7 +265,7 @@ jobs:
|
||||
# runs-on: ubuntu-latest
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -288,7 +288,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -309,7 +309,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -330,7 +330,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -351,7 +351,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -373,7 +373,7 @@ jobs:
|
||||
# timeout-minutes: 45
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -394,7 +394,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -415,7 +415,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -436,7 +436,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -462,7 +462,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -484,7 +484,7 @@ jobs:
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -513,7 +513,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -530,7 +530,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -552,7 +552,7 @@ jobs:
|
||||
timeout-minutes: 20
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -579,7 +579,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -604,7 +604,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -625,7 +625,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -645,7 +645,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -664,7 +664,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -681,7 +681,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -698,7 +698,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -741,7 +741,7 @@ jobs:
|
||||
# timeout-minutes: 90
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -783,7 +783,7 @@ jobs:
|
||||
# timeout-minutes: 90
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v6
|
||||
# uses: actions/checkout@v7
|
||||
# with:
|
||||
# submodules: true
|
||||
# - name: Dependencies
|
||||
@@ -808,7 +808,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -840,7 +840,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -876,7 +876,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -915,7 +915,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -952,7 +952,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -987,7 +987,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -1013,7 +1013,7 @@ jobs:
|
||||
timeout-minutes: 150
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -1042,7 +1042,7 @@ jobs:
|
||||
timeout-minutes: 60
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go
|
||||
@@ -1058,7 +1058,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -1091,7 +1091,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -1114,7 +1114,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -1140,7 +1140,7 @@ jobs:
|
||||
timeout-minutes: 90
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
|
||||
8
.github/workflows/test.yml
vendored
8
.github/workflows/test.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
go-version: ['1.26.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Free disk space
|
||||
@@ -71,7 +71,7 @@ jobs:
|
||||
if-no-files-found: ignore
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.23
|
||||
uses: mxschmitt/action-tmate@v3.24
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
go-version: ['1.26.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Setup Go ${{ matrix.go-version }}
|
||||
@@ -116,7 +116,7 @@ jobs:
|
||||
PATH="$PATH:$HOME/go/bin" BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.23
|
||||
uses: mxschmitt/action-tmate@v3.24
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
4
.github/workflows/tests-aio.yml
vendored
4
.github/workflows/tests-aio.yml
vendored
@@ -62,7 +62,7 @@ jobs:
|
||||
sudo rm -rfv build || true
|
||||
df -h
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
@@ -79,7 +79,7 @@ jobs:
|
||||
PATH="$PATH:$HOME/go/bin" make backends/local-store backends/silero-vad backends/llama-cpp backends/whisper backends/piper backends/stablediffusion-ggml docker-build-e2e e2e-aio
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.23
|
||||
uses: mxschmitt/action-tmate@v3.24
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
4
.github/workflows/tests-e2e.yml
vendored
4
.github/workflows/tests-e2e.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
go-version: ['1.25.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Configure apt mirror on runner
|
||||
@@ -57,7 +57,7 @@ jobs:
|
||||
PATH="$PATH:$HOME/go/bin" make build-mock-backend test-e2e
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.23
|
||||
uses: mxschmitt/action-tmate@v3.24
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
97
.github/workflows/tests-pii-ner-e2e.yml
vendored
Normal file
97
.github/workflows/tests-pii-ner-e2e.yml
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
---
|
||||
name: 'PII NER tier E2E (live GGUF, CPU)'
|
||||
|
||||
# Runs the real privacy-filter GGUF NER tier end-to-end on CPU — the gap the
|
||||
# hermetic tests/e2e suite cannot cover (it only exercises the in-process
|
||||
# pattern tier). Heavy (builds the C++ backend image + downloads a ~2.7 GB
|
||||
# GGUF), so it is path-filtered on PRs and otherwise runs nightly / on demand.
|
||||
#
|
||||
# This drives the container-level harness (tests/e2e-backends) via
|
||||
# `make test-extra-backend-privacy-filter`: it builds the privacy-filter image,
|
||||
# downloads the model, loads it on CPU, and asserts byte-correct, UTF-8-aligned
|
||||
# TokenClassify spans. The complementary HTTP-path specs in tests/e2e
|
||||
# (e2e_pii_ner_test.go) Skip unless PII_NER_MODEL_GGUF is wired.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 3 * * *'
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- 'backend/cpp/privacy-filter/**'
|
||||
- 'backend/Dockerfile.privacy-filter'
|
||||
- 'core/services/routing/pii/**'
|
||||
- 'core/services/routing/piidetector/**'
|
||||
- 'core/backend/token_classify.go'
|
||||
- 'core/http/endpoints/localai/pii.go'
|
||||
- 'core/schema/pii.go'
|
||||
- 'tests/e2e-backends/**'
|
||||
- 'tests/e2e/e2e_pii_ner_test.go'
|
||||
- 'tests/e2e/e2e_suite_test.go'
|
||||
- '.github/workflows/tests-pii-ner-e2e.yml'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'backend/cpp/privacy-filter/**'
|
||||
- 'backend/Dockerfile.privacy-filter'
|
||||
- 'core/services/routing/pii/**'
|
||||
- 'core/services/routing/piidetector/**'
|
||||
- 'core/backend/token_classify.go'
|
||||
- 'core/http/endpoints/localai/pii.go'
|
||||
- 'core/schema/pii.go'
|
||||
- 'tests/e2e-backends/**'
|
||||
- 'tests/e2e/e2e_pii_ner_test.go'
|
||||
- 'tests/e2e/e2e_suite_test.go'
|
||||
- '.github/workflows/tests-pii-ner-e2e.yml'
|
||||
|
||||
concurrency:
|
||||
group: ci-tests-pii-ner-e2e-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
jobs:
|
||||
tests-pii-ner-e2e:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
go-version: ['1.25.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Free disk space
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL || true
|
||||
sudo docker image prune --all --force || true
|
||||
df -h
|
||||
- name: Configure apt mirror on runner
|
||||
uses: ./.github/actions/configure-apt-mirror
|
||||
- name: Setup Go ${{ matrix.go-version }}
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: ${{ matrix.go-version }}
|
||||
cache: false
|
||||
- name: Proto Dependencies
|
||||
run: |
|
||||
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
|
||||
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||
rm protoc.zip
|
||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||
PATH="$PATH:$HOME/go/bin" make protogen-go
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential
|
||||
# Builds local-ai-backend:privacy-filter, downloads the GGUF, loads it on
|
||||
# CPU and runs the token_classify capability spec (byte-offset contract).
|
||||
- name: Run live PII NER backend E2E
|
||||
run: PATH="$PATH:$HOME/go/bin" make test-extra-backend-privacy-filter
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.24
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
limit-access-to-actor: true
|
||||
4
.github/workflows/tests-ui-e2e.yml
vendored
4
.github/workflows/tests-ui-e2e.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
go-version: ['1.26.x']
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v7
|
||||
with:
|
||||
submodules: true
|
||||
- name: Configure apt mirror on runner
|
||||
@@ -75,7 +75,7 @@ jobs:
|
||||
retention-days: 7
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.23
|
||||
uses: mxschmitt/action-tmate@v3.24
|
||||
with:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
|
||||
2
.github/workflows/update_swagger.yaml
vendored
2
.github/workflows/update_swagger.yaml
vendored
@@ -10,7 +10,7 @@ jobs:
|
||||
fail-fast: false
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v7
|
||||
- name: Configure apt mirror on runner
|
||||
uses: ./.github/actions/configure-apt-mirror
|
||||
- uses: actions/setup-go@v5
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -91,3 +91,6 @@ core/http/react-ui/test-results/
|
||||
|
||||
# Local worktrees
|
||||
.worktrees/
|
||||
|
||||
# SDD / brainstorm scratch (agent-driven development)
|
||||
.superpowers/
|
||||
|
||||
10
Makefile
10
Makefile
@@ -690,6 +690,16 @@ test-extra-backend-llama-cpp-transcription: docker-build-llama-cpp
|
||||
BACKEND_TEST_CTX_SIZE=2048 \
|
||||
$(MAKE) test-extra-backend
|
||||
|
||||
## privacy-filter: the PII/NER token-classification backend. Exercises the
|
||||
## TokenClassify RPC and asserts byte-correct, UTF-8-aligned span offsets
|
||||
## against the openai-privacy-filter multilingual GGUF (CPU-runnable, ~50M
|
||||
## active params). This is the live-backend coverage for the PII NER tier.
|
||||
test-extra-backend-privacy-filter: docker-build-privacy-filter
|
||||
BACKEND_IMAGE=local-ai-backend:privacy-filter \
|
||||
BACKEND_TEST_MODEL_URL=https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF/resolve/main/privacy-filter-multilingual-f16.gguf \
|
||||
BACKEND_TEST_CAPS=health,load,token_classify \
|
||||
$(MAKE) test-extra-backend
|
||||
|
||||
## vllm is resolved from a HuggingFace model id (no file download) and
|
||||
## exercises Predict + streaming + tool-call extraction via the hermes parser.
|
||||
## Requires a host CPU with the SIMD instructions the prebuilt vllm CPU
|
||||
|
||||
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
|
||||
| Backend | What it does |
|
||||
|---------|-------------|
|
||||
| [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
|
||||
| [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
|
||||
| [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
|
||||
| [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
|
||||
| [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
|
||||
@@ -240,6 +241,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
|
||||
| [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
|
||||
| [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
|
||||
|
||||
We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
|
||||
|
||||
## Resources
|
||||
|
||||
- [Documentation](https://localai.io/)
|
||||
|
||||
@@ -65,7 +65,12 @@ RUN <<EOT bash
|
||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
|
||||
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
|
||||
# LunarG SDK below only provides the loader and shader tooling, not
|
||||
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
|
||||
# bundle and the packaged backend finds no GPU at runtime.
|
||||
if [ "amd64" = "$TARGETARCH" ]; then
|
||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||
|
||||
@@ -66,7 +66,12 @@ RUN <<EOT bash
|
||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
|
||||
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
|
||||
# LunarG SDK below only provides the loader and shader tooling, not
|
||||
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
|
||||
# bundle and the packaged backend finds no GPU at runtime.
|
||||
if [ "amd64" = "$TARGETARCH" ]; then
|
||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||
|
||||
@@ -24,6 +24,9 @@ service Backend {
|
||||
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
|
||||
rpc Status(HealthMessage) returns (StatusResponse) {}
|
||||
rpc Detect(DetectOptions) returns (DetectResponse) {}
|
||||
// SoundDetection runs an audio-tagging / sound-event-classification model
|
||||
// (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
|
||||
rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
|
||||
rpc Depth(DepthRequest) returns (DepthResponse) {}
|
||||
rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
|
||||
rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
|
||||
@@ -671,6 +674,24 @@ message DetectResponse {
|
||||
repeated Detection Detections = 1;
|
||||
}
|
||||
|
||||
// --- Sound-event classification / audio tagging messages (CED) ---
|
||||
|
||||
message SoundDetectionRequest {
|
||||
string src = 1; // audio file path (LocalAI writes the upload to disk)
|
||||
int32 top_k = 2; // number of top tags to return (0 = all classes)
|
||||
float threshold = 3; // optional: drop tags scoring below this
|
||||
}
|
||||
|
||||
message SoundClass {
|
||||
string label = 1; // AudioSet class name, e.g. "Baby cry, infant cry"
|
||||
float score = 2; // per-class probability (multi-label, independent)
|
||||
int32 index = 3; // class index in the model ontology
|
||||
}
|
||||
|
||||
message SoundDetectionResponse {
|
||||
repeated SoundClass detections = 1; // score-descending
|
||||
}
|
||||
|
||||
// --- Depth estimation messages (Depth Anything 3) ---
|
||||
|
||||
message DepthRequest {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
|
||||
IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
|
||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
|
||||
LLAMA_VERSION?=7c082bc417bbe53210a83df4ba5b49e18ce6193c
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -18,6 +18,18 @@
|
||||
#if __has_include("server-chat.cpp")
|
||||
#include "server-chat.cpp"
|
||||
#endif
|
||||
// server-schema.cpp exists only in llama.cpp after the upstream refactor that
|
||||
// extracted the JSON request-schema evaluation (previously the static
|
||||
// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
|
||||
// server-context.cpp and grpc-server.cpp both call into it, so its definitions
|
||||
// must be part of this translation unit or the link fails. __has_include keeps
|
||||
// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
|
||||
// predate the split and still expose params_from_json_cmpl (see the guarded
|
||||
// call sites below).
|
||||
#if __has_include("server-schema.cpp")
|
||||
#define LOCALAI_HAS_SERVER_SCHEMA 1
|
||||
#include "server-schema.cpp"
|
||||
#endif
|
||||
#include "server-context.cpp"
|
||||
|
||||
// LocalAI
|
||||
@@ -2102,7 +2114,11 @@ public:
|
||||
task.index = i;
|
||||
|
||||
task.tokens = std::move(inputs[i]);
|
||||
#ifdef LOCALAI_HAS_SERVER_SCHEMA
|
||||
task.params = server_schema::eval_llama_cmpl_schema(
|
||||
#else
|
||||
task.params = server_task::params_from_json_cmpl(
|
||||
#endif
|
||||
ctx_server.impl->vocab,
|
||||
params_base,
|
||||
ctx_server.get_meta().slot_n_ctx,
|
||||
@@ -2116,7 +2132,7 @@ public:
|
||||
// cannot detect tool calls or separate reasoning from content.
|
||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
// oaicompat_model is already populated by eval_llama_cmpl_schema
|
||||
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
@@ -2940,7 +2956,11 @@ public:
|
||||
task.index = i;
|
||||
|
||||
task.tokens = std::move(inputs[i]);
|
||||
#ifdef LOCALAI_HAS_SERVER_SCHEMA
|
||||
task.params = server_schema::eval_llama_cmpl_schema(
|
||||
#else
|
||||
task.params = server_task::params_from_json_cmpl(
|
||||
#endif
|
||||
ctx_server.impl->vocab,
|
||||
params_base,
|
||||
ctx_server.get_meta().slot_n_ctx,
|
||||
@@ -2952,7 +2972,7 @@ public:
|
||||
// reasoning, tool calls, and content are classified into ChatDeltas.
|
||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
// oaicompat_model is already populated by eval_llama_cmpl_schema
|
||||
|
||||
tasks.push_back(std::move(task));
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
# Local development: point at a working checkout instead of cloning, e.g.
|
||||
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
||||
|
||||
PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
|
||||
PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
|
||||
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
||||
PRIVACY_FILTER_SRC?=
|
||||
|
||||
|
||||
11
backend/go/ced/.gitignore
vendored
Normal file
11
backend/go/ced/.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
.cache/
|
||||
sources/
|
||||
build/
|
||||
package/
|
||||
ced-grpc
|
||||
# build artifacts staged in-tree by the Makefile (cp from sources/) or
|
||||
# symlinked for local dev; the real sources live in ced.cpp upstream.
|
||||
*.so
|
||||
*.so.*
|
||||
ced_capi.h
|
||||
compile_commands.json
|
||||
77
backend/go/ced/Makefile
Normal file
77
backend/go/ced/Makefile
Normal file
@@ -0,0 +1,77 @@
|
||||
# ced sound-classification backend Makefile.
|
||||
#
|
||||
# Upstream pin lives below as CED_VERSION?=<sha> so .github/bump_deps.sh can find
|
||||
# and update it (matches the parakeet-cpp / whisper.cpp convention).
|
||||
#
|
||||
# Local dev shortcut: symlink an out-of-tree ced.cpp shared build + header and
|
||||
# skip the clone/cmake steps entirely:
|
||||
# ln -sf /path/to/ced.cpp/build-shared/libced.so .
|
||||
# ln -sf /path/to/ced.cpp/include/ced_capi.h .
|
||||
# go build -o ced-grpc .
|
||||
|
||||
CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
|
||||
CED_REPO?=https://github.com/mudler/ced.cpp
|
||||
|
||||
GOCMD?=go
|
||||
GO_TAGS?=
|
||||
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||
|
||||
BUILD_TYPE?=
|
||||
NATIVE?=false
|
||||
|
||||
# Static-link ggml into libced.so (PIC) so the shared lib is self-contained:
|
||||
# dlopen needs no libggml*.so alongside it, only system libs the runtime image
|
||||
# already provides.
|
||||
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||
|
||||
ifeq ($(NATIVE),false)
|
||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||
endif
|
||||
|
||||
# ced.cpp gates its ggml backends behind CED_GGML_* options (set(... CACHE BOOL
|
||||
# "" FORCE)), so forward those instead of a bare -DGGML_CUDA=ON.
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DCED_GGML_HIP=ON
|
||||
else ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
|
||||
endif
|
||||
|
||||
.PHONY: ced-grpc package build clean purge test all
|
||||
|
||||
all: ced-grpc
|
||||
|
||||
sources/ced.cpp:
|
||||
mkdir -p sources/ced.cpp
|
||||
cd sources/ced.cpp && \
|
||||
git init -q && \
|
||||
git remote add origin $(CED_REPO) && \
|
||||
git fetch --depth 1 origin $(CED_VERSION) && \
|
||||
git checkout FETCH_HEAD && \
|
||||
git submodule update --init --recursive --depth 1 --single-branch
|
||||
|
||||
libced.so: sources/ced.cpp
|
||||
cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
|
||||
cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
|
||||
cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
|
||||
cp -fv sources/ced.cpp/include/ced_capi.h ./
|
||||
|
||||
ced-grpc: libced.so main.go goced.go
|
||||
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .
|
||||
|
||||
package: ced-grpc
|
||||
bash package.sh
|
||||
|
||||
build: package
|
||||
|
||||
test:
|
||||
LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
|
||||
|
||||
clean: purge
|
||||
rm -rf libced.so* ced_capi.h package ced-grpc
|
||||
|
||||
purge:
|
||||
rm -rf sources/ced.cpp
|
||||
130
backend/go/ced/goced.go
Normal file
130
backend/go/ced/goced.go
Normal file
@@ -0,0 +1,130 @@
|
||||
package main
|
||||
|
||||
// Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
|
||||
// SoundDetection implementation.
|
||||
//
|
||||
// SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
|
||||
// `make protogen-go`). The C side is single-threaded per ctx, so we guard the
|
||||
// engine with engineMu; LocalAI also serializes via base.SingleThread.
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
)
|
||||
|
||||
// purego-bound entry points from libced.so. Names match ced_capi.h exactly.
|
||||
var (
|
||||
CppAbiVersion func() int32
|
||||
CppLoad func(ggufPath string) uintptr
|
||||
CppFree func(ctx uintptr)
|
||||
CppLastError func(ctx uintptr) string
|
||||
CppNumClasses func(ctx uintptr) int32
|
||||
CppSampleRate func(ctx uintptr) int32
|
||||
CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr
|
||||
CppClassifyPcmJSON func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr
|
||||
CppFreeString func(s uintptr)
|
||||
)
|
||||
|
||||
// cstr copies a malloc'd C string (returned as uintptr) into a Go string and
|
||||
// frees the original via ced_capi_free_string. Empty/0 -> "".
|
||||
func cstr(p uintptr) string {
|
||||
if p == 0 {
|
||||
return ""
|
||||
}
|
||||
defer CppFreeString(p)
|
||||
var b []byte
|
||||
for i := 0; ; i++ {
|
||||
ch := *(*byte)(unsafe.Pointer(p + uintptr(i))) //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
|
||||
if ch == 0 {
|
||||
break
|
||||
}
|
||||
b = append(b, ch)
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
||||
// Ced is the gRPC backend. One loaded CED model per instance.
|
||||
type Ced struct {
|
||||
base.Base
|
||||
ctxPtr uintptr
|
||||
engineMu sync.Mutex
|
||||
}
|
||||
|
||||
// Load resolves the GGUF and opens the C-API context.
|
||||
func (c *Ced) Load(opts *pb.ModelOptions) error {
|
||||
if opts.ModelFile == "" {
|
||||
return errors.New("ced: ModelFile is required")
|
||||
}
|
||||
ctx := CppLoad(opts.ModelFile)
|
||||
if ctx == 0 {
|
||||
return fmt.Errorf("ced: ced_capi_load failed for %q: %s", opts.ModelFile, CppLastError(0))
|
||||
}
|
||||
c.ctxPtr = ctx
|
||||
return nil
|
||||
}
|
||||
|
||||
// jsonTag is the decoding target for one element of the JSON array returned by
// the classifier; SoundDetection converts each one into a pb.SoundClass.
type jsonTag struct {
	Index int     `json:"index"` // copied to SoundClass.Index
	Score float32 `json:"score"` // compared against the request threshold
	Label string  `json:"label"` // copied to SoundClass.Label
}
|
||||
|
||||
// SoundDetection classifies the clip at req.Src and returns scored AudioSet tags.
|
||||
func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
|
||||
if c.ctxPtr == 0 {
|
||||
return nil, errors.New("ced: model not loaded")
|
||||
}
|
||||
if req.GetSrc() == "" {
|
||||
return nil, errors.New("ced: SoundDetectionRequest.src (audio path) is required")
|
||||
}
|
||||
topK := req.GetTopK()
|
||||
if topK <= 0 {
|
||||
topK = 10 // sensible default for a tagging response
|
||||
}
|
||||
|
||||
c.engineMu.Lock()
|
||||
out := cstr(CppClassifyPathJSON(c.ctxPtr, req.GetSrc(), topK))
|
||||
lastErr := CppLastError(c.ctxPtr)
|
||||
c.engineMu.Unlock()
|
||||
|
||||
if out == "" {
|
||||
return nil, fmt.Errorf("ced: classification failed: %s", lastErr)
|
||||
}
|
||||
var tags []jsonTag
|
||||
if err := json.Unmarshal([]byte(out), &tags); err != nil {
|
||||
return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
|
||||
}
|
||||
|
||||
thr := req.GetThreshold()
|
||||
resp := &pb.SoundDetectionResponse{}
|
||||
for _, t := range tags {
|
||||
if t.Score < thr {
|
||||
continue
|
||||
}
|
||||
resp.Detections = append(resp.Detections, &pb.SoundClass{
|
||||
Label: t.Label, Score: t.Score, Index: int32(t.Index),
|
||||
})
|
||||
}
|
||||
sort.Slice(resp.Detections, func(i, j int) bool {
|
||||
return resp.Detections[i].Score > resp.Detections[j].Score
|
||||
})
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func (c *Ced) Free() error {
|
||||
c.engineMu.Lock()
|
||||
defer c.engineMu.Unlock()
|
||||
if c.ctxPtr != 0 {
|
||||
CppFree(c.ctxPtr)
|
||||
c.ctxPtr = 0
|
||||
}
|
||||
return nil
|
||||
}
|
||||
59
backend/go/ced/main.go
Normal file
59
backend/go/ced/main.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package main
|
||||
|
||||
// ced sound-classification backend. Started internally by LocalAI: one gRPC
|
||||
// server per loaded model. Loads libced.so via purego and registers the flat
|
||||
// C-API declared in ced_capi.h. The library name can be overridden with
|
||||
// CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default looks
|
||||
// for the .so next to this binary.
|
||||
//
|
||||
// SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
|
||||
// addition, and a built libced.so (see Makefile). See DESIGN.md.
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/ebitengine/purego"
|
||||
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||
)
|
||||
|
||||
var addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||
|
||||
// libFunc pairs a Go function variable with the C symbol it is bound to.
// main builds these with positional literals, so the field order matters.
type libFunc struct {
	ptr  any    // pointer to the Go func variable that receives the binding
	name string // exported symbol name in the shared library
}
|
||||
|
||||
func main() {
|
||||
libName := os.Getenv("CED_LIBRARY")
|
||||
if libName == "" {
|
||||
libName = "libced.so"
|
||||
}
|
||||
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
|
||||
}
|
||||
|
||||
// Bound 1:1 to ced_capi.h. char*-returning functions are declared uintptr
|
||||
// so we can free the same pointer with ced_capi_free_string after copying
|
||||
// (purego's string return would copy and leak the original).
|
||||
for _, lf := range []libFunc{
|
||||
{&CppAbiVersion, "ced_capi_abi_version"},
|
||||
{&CppLoad, "ced_capi_load"},
|
||||
{&CppFree, "ced_capi_free"},
|
||||
{&CppLastError, "ced_capi_last_error"},
|
||||
{&CppNumClasses, "ced_capi_num_classes"},
|
||||
{&CppSampleRate, "ced_capi_sample_rate"},
|
||||
{&CppClassifyPathJSON, "ced_capi_classify_path_json"},
|
||||
{&CppClassifyPcmJSON, "ced_capi_classify_pcm_json"},
|
||||
{&CppFreeString, "ced_capi_free_string"},
|
||||
} {
|
||||
purego.RegisterLibFunc(lf.ptr, lib, lf.name)
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
|
||||
flag.Parse()
|
||||
if err := grpc.StartServer(*addr, &Ced{}); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
60
backend/go/ced/package.sh
Executable file
60
backend/go/ced/package.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
#
# Build the self-contained package/ directory for the ced backend:
#   package/ced-grpc, package/run.sh   - the binary and its launcher
#   package/lib/libced.so*             - the classifier shared library
#   package/lib/ld.so + core libs      - loader, libc, libstdc++, libgomp, ...
#   GPU runtime libs                   - via scripts/build/package-gpu-libs.sh
# Mirrors backend/go/parakeet-cpp/package.sh; run.sh routes the (CGO_ENABLED=0)
# binary through lib/ld.so so the packaged libc is used.

set -e

CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

# Copy the dynamic loader ($1) as lib/ld.so, then the core runtime libraries
# found in the multiarch directory ($2), dereferencing symlinks.
ced_bundle_core_libs() {
    local loader="$1" libdir="$2" lib
    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
    for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 \
               libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
        cp -arfLv "$libdir/$lib" "$CURDIR/package/lib/$lib"
    done
}

mkdir -p "$CURDIR/package/lib"

cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

# The library is staged next to this script by the Makefile; fail loudly if it
# is missing rather than shipping a package that cannot start.
if ! cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null; then
    echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
    exit 1
fi

# The architecture is detected from which dynamic loader exists on the build host.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    ced_bundle_core_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    ced_bundle_core_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
elif [ "$(uname -s)" = "Darwin" ]; then
    # Nothing is bundled on Darwin.
    echo "Detected Darwin"
else
    echo "Error: Could not detect architecture"
    exit 1
fi

# GPU runtime libraries are optional: only packaged when the helper exists.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
||||
15
backend/go/ced/run.sh
Executable file
15
backend/go/ced/run.sh
Executable file
@@ -0,0 +1,15 @@
|
||||
#!/bin/bash
# Launcher for the ced-grpc backend: puts the bundled libraries on the loader
# path and starts the binary, forwarding all arguments.
set -e

CURDIR=$(dirname "$(realpath "$0")")

# Bundled libs first, then this directory, then whatever the caller had set.
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"

# Without a packaged loader, run the binary directly against the host's.
if [ ! -f "$CURDIR/lib/ld.so" ]; then
    exec "$CURDIR/ced-grpc" "$@"
fi

# A packaged ld.so is present: start through it so the packaged libc /
# libstdc++ are used instead of the host's (matches the sibling backends).
echo "Using lib/ld.so"
exec "$CURDIR/lib/ld.so" "$CURDIR/ced-grpc" "$@"
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# CrispASR version (release tag)
|
||||
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
|
||||
CRISPASR_VERSION?=d745bda4386ae0f9d1d2f23fff8ec95d76428221
|
||||
CRISPASR_VERSION?=7a8cb80907341c0204bd0488c1244764f4163883
|
||||
SO_TARGET?=libgocrispasr.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
@@ -67,7 +67,7 @@ sources/CrispASR:
|
||||
# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
|
||||
# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
|
||||
# which is correct both standalone and as a subproject. Idempotent.
|
||||
sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
|
||||
sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak
|
||||
|
||||
# Detect OS
|
||||
UNAME_S := $(shell uname -s)
|
||||
|
||||
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
|
||||
g_abort.store(v, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
// --- word-level timestamp accessors ---
|
||||
extern "C" {
|
||||
int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
|
||||
const char *crispasr_session_result_word_text(crispasr_session_result *r,
|
||||
int seg_i, int word_i);
|
||||
int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
|
||||
int word_i);
|
||||
int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
|
||||
int word_i);
|
||||
|
||||
// Parakeet-specific word accessors
|
||||
int crispasr_parakeet_result_n_words(void *r);
|
||||
const char *crispasr_parakeet_result_word_text(void *r, int word_i);
|
||||
int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
|
||||
int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
|
||||
}
|
||||
|
||||
void *get_result(void) { return g_result; }
|
||||
|
||||
int get_word_count(int seg_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_n_words(g_result, seg_i);
|
||||
}
|
||||
|
||||
const char *get_word_text(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return "";
|
||||
return crispasr_session_result_word_text(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
int64_t get_word_t0(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_word_t0(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
int64_t get_word_t1(int seg_i, int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_session_result_word_t1(g_result, seg_i, word_i);
|
||||
}
|
||||
|
||||
// Parakeet-specific word accessors
|
||||
int get_parakeet_word_count(void) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_n_words(g_result);
|
||||
}
|
||||
|
||||
const char *get_parakeet_word_text(int word_i) {
|
||||
if (!g_result)
|
||||
return "";
|
||||
return crispasr_parakeet_result_word_text(g_result, word_i);
|
||||
}
|
||||
|
||||
int64_t get_parakeet_word_t0(int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_word_t0(g_result, word_i);
|
||||
}
|
||||
|
||||
int64_t get_parakeet_word_t1(int word_i) {
|
||||
if (!g_result)
|
||||
return 0;
|
||||
return crispasr_parakeet_result_word_t1(g_result, word_i);
|
||||
}
|
||||
|
||||
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
||||
void *data) {
|
||||
const char *level_str;
|
||||
|
||||
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
|
||||
void tts_free(float *pcm);
|
||||
int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
|
||||
int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
|
||||
|
||||
// --- word-level timestamp accessors ---
|
||||
// Session-based (works for whisper-like backends)
|
||||
void *get_result(void);
|
||||
int get_word_count(int seg_i);
|
||||
const char *get_word_text(int seg_i, int word_i);
|
||||
int64_t get_word_t0(int seg_i, int word_i);
|
||||
int64_t get_word_t1(int seg_i, int word_i);
|
||||
|
||||
// Parakeet-specific (global word list, no segment index)
|
||||
int get_parakeet_word_count(void);
|
||||
const char *get_parakeet_word_text(int word_i);
|
||||
int64_t get_parakeet_word_t0(int word_i);
|
||||
int64_t get_parakeet_word_t1(int word_i);
|
||||
}
|
||||
|
||||
@@ -34,6 +34,18 @@ var (
|
||||
CppTTSFree func(ptr uintptr)
|
||||
CppTTSSetVoice func(name string) int
|
||||
CppTTSSetVoiceFile func(path string, refText string) int
|
||||
|
||||
// Word-level timestamp accessors (session-based, per-segment)
|
||||
CppGetWordCount func(segI int) int
|
||||
CppGetWordText func(segI int, wordI int) string
|
||||
CppGetWordT0 func(segI int, wordI int) int64
|
||||
CppGetWordT1 func(segI int, wordI int) int64
|
||||
|
||||
// Parakeet-specific word accessors (global, no segment index)
|
||||
CppGetParakeetWordCount func() int
|
||||
CppGetParakeetWordText func(wordI int) string
|
||||
CppGetParakeetWordT0 func(wordI int) int64
|
||||
CppGetParakeetWordT1 func(wordI int) int64
|
||||
)
|
||||
|
||||
type CrispASR struct {
|
||||
@@ -212,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
// isValidWord reports whether a TranscriptWord contains recognisable speech
|
||||
// content. The parakeet-specific word accessors can return stale initialisation
|
||||
// data (model name, binary blobs) when a segment has no real speech. A word is
|
||||
// considered valid only when:
|
||||
// - the text is non-empty after trimming,
|
||||
// - it contains no U+FFFD replacement characters (from binary data scrubbing),
|
||||
// - both timestamps are non-negative,
|
||||
// - the word has positive duration (end > start).
|
||||
func isValidWord(w *pb.TranscriptWord) bool {
|
||||
txt := strings.TrimSpace(w.Text)
|
||||
if txt == "" {
|
||||
return false
|
||||
}
|
||||
if strings.ContainsRune(txt, '\uFFFD') {
|
||||
return false
|
||||
}
|
||||
if w.Start < 0 || w.End < 0 || w.End <= w.Start {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
|
||||
@@ -290,15 +324,54 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
|
||||
// IDs, so Tokens is left empty.
|
||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
||||
|
||||
// Populate word-level timestamps. Try session-based functions first
|
||||
// (per-segment); fall back to parakeet-specific functions (global word
|
||||
// list with no segment index — only populated on the first segment to
|
||||
// avoid duplication).
|
||||
words := []*pb.TranscriptWord{}
|
||||
wordCount := CppGetWordCount(i)
|
||||
if wordCount == 0 && i == 0 {
|
||||
wordCount = CppGetParakeetWordCount()
|
||||
for j := 0; j < wordCount; j++ {
|
||||
w := &pb.TranscriptWord{
|
||||
Start: CppGetParakeetWordT0(j) * (10000000),
|
||||
End: CppGetParakeetWordT1(j) * (10000000),
|
||||
Text: strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "<22>"),
|
||||
}
|
||||
if isValidWord(w) {
|
||||
words = append(words, w)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for j := 0; j < wordCount; j++ {
|
||||
w := &pb.TranscriptWord{
|
||||
Start: CppGetWordT0(i, j) * (10000000),
|
||||
End: CppGetWordT1(i, j) * (10000000),
|
||||
Text: strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "<22>"),
|
||||
}
|
||||
if isValidWord(w) {
|
||||
words = append(words, w)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Skip empty segments with no recognisable content (e.g. trailing
|
||||
// silence segments that parakeet emits with stale init data).
|
||||
trimmed := strings.TrimSpace(txt)
|
||||
if trimmed == "" && len(words) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
segment := &pb.TranscriptSegment{
|
||||
Id: int32(i),
|
||||
Text: txt,
|
||||
Start: s, End: t,
|
||||
Words: words,
|
||||
}
|
||||
|
||||
segments = append(segments, segment)
|
||||
|
||||
text += " " + strings.TrimSpace(txt)
|
||||
text += " " + trimmed
|
||||
}
|
||||
|
||||
return pb.TranscriptResult{
|
||||
@@ -390,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
|
||||
s := CppGetSegmentStart(i) * 10000000
|
||||
t := CppGetSegmentEnd(i) * 10000000
|
||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
||||
|
||||
// Skip empty segments (e.g. trailing silence that parakeet emits
|
||||
// with stale init data).
|
||||
trimmed := strings.TrimSpace(txt)
|
||||
if trimmed == "" && s == t {
|
||||
continue
|
||||
}
|
||||
|
||||
segments = append(segments, &pb.TranscriptSegment{
|
||||
Id: int32(i),
|
||||
Text: txt,
|
||||
Start: s, End: t,
|
||||
})
|
||||
|
||||
trimmed := strings.TrimSpace(txt)
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -44,6 +44,14 @@ func main() {
|
||||
{&CppTTSFree, "tts_free"},
|
||||
{&CppTTSSetVoice, "tts_set_voice"},
|
||||
{&CppTTSSetVoiceFile, "tts_set_voice_file"},
|
||||
{&CppGetWordCount, "get_word_count"},
|
||||
{&CppGetWordText, "get_word_text"},
|
||||
{&CppGetWordT0, "get_word_t0"},
|
||||
{&CppGetWordT1, "get_word_t1"},
|
||||
{&CppGetParakeetWordCount, "get_parakeet_word_count"},
|
||||
{&CppGetParakeetWordText, "get_parakeet_word_text"},
|
||||
{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
|
||||
{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
|
||||
}
|
||||
|
||||
for _, lf := range libFuncs {
|
||||
|
||||
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# depth-anything.cpp. Pin to a specific commit for a stable build; a squash
|
||||
# merge upstream can orphan a branch, so the native version is pinned by SHA.
|
||||
# This SHA adds the nested two-file metric C-API (abi_version 4,
|
||||
# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
|
||||
# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
|
||||
# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
|
||||
# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
|
||||
# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
|
||||
# It is kept alive by the upstream tag da2-support (survives a squash-merge);
|
||||
# repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
|
||||
DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
|
||||
DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
|
||||
DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118
|
||||
|
||||
ifeq ($(NATIVE),false)
|
||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# omnivoice.cpp version
|
||||
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
||||
OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
|
||||
OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
|
||||
SO_TARGET?=libgomnivoicecpp.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# parakeet-cpp backend Makefile.
|
||||
#
|
||||
# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
|
||||
# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
|
||||
# (.github/bump_deps.sh) can find and update it - matches the
|
||||
# whisper.cpp / ds4 / vibevoice-cpp convention.
|
||||
#
|
||||
@@ -15,7 +15,7 @@
|
||||
# That's what the L0 smoke test uses. The default target below does the
|
||||
# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
|
||||
|
||||
PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
|
||||
PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
|
||||
PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
|
||||
|
||||
GOCMD?=go
|
||||
|
||||
@@ -1,23 +1,68 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
|
||||
# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
|
||||
# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
|
||||
# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
|
||||
# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
|
||||
# BUILD_TYPE so the package is self-contained. Mirrors
|
||||
# backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
|
||||
# through lib/ld.so so the packaged libc is used instead of the host's.
|
||||
|
||||
set -e
|
||||
|
||||
CURDIR=$(dirname "$(realpath "$0")")
|
||||
REPO_ROOT="${CURDIR}/../../.."
|
||||
|
||||
mkdir -p "$CURDIR/package/lib"
|
||||
|
||||
cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
|
||||
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
|
||||
|
||||
# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
|
||||
# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
|
||||
# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
|
||||
cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
|
||||
echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "L0 package layout (full ldd walk lands in L3):"
|
||||
# Detect architecture and copy the core runtime libs libparakeet.so links
|
||||
# against, plus the matching dynamic loader as lib/ld.so.
|
||||
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||
echo "Detected x86_64 architecture, copying x86_64 libraries..."
|
||||
cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
|
||||
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
|
||||
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
|
||||
echo "Detected ARM64 architecture, copying ARM64 libraries..."
|
||||
cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
|
||||
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
|
||||
elif [ "$(uname -s)" = "Darwin" ]; then
|
||||
echo "Detected Darwin"
|
||||
else
|
||||
echo "Error: Could not detect architecture"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
|
||||
# based on BUILD_TYPE so the backend can reach the GPU without the runtime
|
||||
# base image shipping those drivers.
|
||||
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
|
||||
if [ -f "$GPU_LIB_SCRIPT" ]; then
|
||||
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
|
||||
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
|
||||
package_gpu_libs
|
||||
fi
|
||||
|
||||
echo "Packaging completed successfully"
|
||||
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# qwentts.cpp version
|
||||
QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
|
||||
QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4
|
||||
QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
|
||||
SO_TARGET?=libgoqwen3ttscpp.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# stablediffusion.cpp (ggml)
|
||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||
STABLEDIFFUSION_GGML_VERSION?=7f0e728b7d42f2490dfa5dd9539082d904f2f6b2
|
||||
STABLEDIFFUSION_GGML_VERSION?=b12098f5d09fc83da36e65c784f7bdb16a5a5ebf
|
||||
|
||||
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84
|
||||
WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97
|
||||
SO_TARGET?=libgowhisper.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -178,6 +178,37 @@
|
||||
nvidia-cuda-12: "cuda12-parakeet-cpp"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
|
||||
- &ced
|
||||
name: "ced"
|
||||
alias: "ced"
|
||||
license: mit
|
||||
icon: https://avatars.githubusercontent.com/u/95302084
|
||||
description: |
|
||||
CED sound-event classification / audio tagging (527-class AudioSet).
|
||||
ced.cpp is a C++/ggml port that performs audio tagging over the AudioSet
|
||||
taxonomy, exposed through the SoundDetection gRPC rpc and the
|
||||
/v1/audio/classification REST endpoint. It runs on CPU, NVIDIA CUDA,
|
||||
AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
|
||||
urls:
|
||||
- https://github.com/mudler/ced.cpp
|
||||
tags:
|
||||
- audio-classification
|
||||
- CPU
|
||||
- GPU
|
||||
- CUDA
|
||||
- HIP
|
||||
capabilities:
|
||||
default: "cpu-ced"
|
||||
nvidia: "cuda12-ced"
|
||||
intel: "intel-sycl-f16-ced"
|
||||
metal: "metal-ced"
|
||||
amd: "rocm-ced"
|
||||
vulkan: "vulkan-ced"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-ced"
|
||||
nvidia-cuda-13: "cuda13-ced"
|
||||
nvidia-cuda-12: "cuda12-ced"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
|
||||
- &voxtral
|
||||
name: "voxtral"
|
||||
alias: "voxtral"
|
||||
@@ -2650,6 +2681,121 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
|
||||
## ced
|
||||
- !!merge <<: *ced
|
||||
name: "ced-development"
|
||||
capabilities:
|
||||
default: "cpu-ced-development"
|
||||
nvidia: "cuda12-ced-development"
|
||||
intel: "intel-sycl-f16-ced-development"
|
||||
metal: "metal-ced-development"
|
||||
amd: "rocm-ced-development"
|
||||
vulkan: "vulkan-ced-development"
|
||||
nvidia-l4t: "nvidia-l4t-arm64-ced-development"
|
||||
nvidia-cuda-13: "cuda13-ced-development"
|
||||
nvidia-cuda-12: "cuda12-ced-development"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced-development"
|
||||
- !!merge <<: *ced
|
||||
name: "nvidia-l4t-arm64-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "nvidia-l4t-arm64-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda13-nvidia-l4t-arm64-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda13-nvidia-l4t-arm64-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cpu-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cpu-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-ced
|
||||
- !!merge <<: *ced
|
||||
name: "metal-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "metal-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda12-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-12-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda12-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-12-ced
|
||||
- !!merge <<: *ced
|
||||
name: "rocm-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-rocm-hipblas-ced
|
||||
- !!merge <<: *ced
|
||||
name: "rocm-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-rocm-hipblas-ced
|
||||
- !!merge <<: *ced
|
||||
name: "intel-sycl-f32-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sycl-f32-ced
|
||||
- !!merge <<: *ced
|
||||
name: "intel-sycl-f32-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sycl-f32-ced
|
||||
- !!merge <<: *ced
|
||||
name: "intel-sycl-f16-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-intel-sycl-f16-ced
|
||||
- !!merge <<: *ced
|
||||
name: "intel-sycl-f16-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-intel-sycl-f16-ced
|
||||
- !!merge <<: *ced
|
||||
name: "vulkan-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-vulkan-ced
|
||||
- !!merge <<: *ced
|
||||
name: "vulkan-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-vulkan-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda13-ced"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-13-ced
|
||||
- !!merge <<: *ced
|
||||
name: "cuda13-ced-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-ced
|
||||
## stablediffusion-ggml
|
||||
- !!merge <<: *stablediffusionggml
|
||||
name: "cpu-stablediffusion-ggml"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
torchvision==0.22.1
|
||||
accelerate
|
||||
git+https://github.com/xhinker/sd_embed
|
||||
@@ -10,9 +10,15 @@ sentencepiece
|
||||
torch==2.7.1
|
||||
optimum-quanto
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu121
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
torchvision
|
||||
accelerate
|
||||
git+https://github.com/xhinker/sd_embed
|
||||
@@ -10,9 +10,15 @@ sentencepiece
|
||||
torch
|
||||
ftfy
|
||||
optimum-quanto
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu130
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
torchvision
|
||||
accelerate
|
||||
git+https://github.com/xhinker/sd_embed
|
||||
@@ -10,9 +10,15 @@ sentencepiece
|
||||
torch
|
||||
ftfy
|
||||
optimum-quanto
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
|
||||
@@ -1,17 +1,23 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm7.0
|
||||
torch==2.10.0+rocm7.0
|
||||
torchvision==0.25.0+rocm7.0
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
peft
|
||||
sentencepiece
|
||||
optimum-quanto
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
@@ -3,18 +3,24 @@ torch
|
||||
torchvision
|
||||
optimum[openvino]
|
||||
setuptools
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
git+https://github.com/xhinker/sd_embed
|
||||
peft
|
||||
sentencepiece
|
||||
optimum-quanto
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
|
||||
torch
|
||||
git+https://github.com/huggingface/diffusers
|
||||
transformers
|
||||
diffusers==0.38.0
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
peft
|
||||
optimum-quanto
|
||||
@@ -9,9 +9,15 @@ numpy<2
|
||||
sentencepiece
|
||||
torchvision
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu130
|
||||
torch
|
||||
git+https://github.com/huggingface/diffusers
|
||||
transformers
|
||||
diffusers==0.38.0
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
peft
|
||||
optimum-quanto
|
||||
@@ -10,9 +10,15 @@ sentencepiece
|
||||
torchvision
|
||||
ftfy
|
||||
chardet
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
|
||||
@@ -1,16 +1,22 @@
|
||||
torch==2.7.1
|
||||
torchvision==0.22.1
|
||||
git+https://github.com/huggingface/diffusers
|
||||
diffusers==0.38.0
|
||||
opencv-python
|
||||
transformers
|
||||
transformers==4.57.6
|
||||
accelerate
|
||||
peft
|
||||
sentencepiece
|
||||
optimum-quanto
|
||||
ftfy
|
||||
# TODO: re-add compel once it supports transformers >= 5.
|
||||
# Tracking: https://github.com/damian0815/compel/pull/129
|
||||
# https://github.com/damian0815/compel/issues/128
|
||||
# compel currently pins transformers~=4.25, which forced pip into multi-hour
|
||||
# resolver backtracking storms in CI. backend.py imports it lazily and gates
|
||||
# the COMPEL=1 env var on the import succeeding, so dropping it here is safe.
|
||||
# diffusers and transformers are pinned together on purpose. transformers v5
|
||||
# restructured CLIPTextModel and dropped the `.text_model` attribute, which
|
||||
# breaks single-file Stable Diffusion loading on every released diffusers
|
||||
# (<=0.38.0); only unreleased diffusers main supports transformers v5. Tracking
|
||||
# main via git froze whichever broken pair existed at image-build time. Pin the
|
||||
# last known-good released pair so builds are reproducible and can't drift into
|
||||
# the broken window. See https://github.com/mudler/LocalAI/issues/9979
|
||||
#
|
||||
# compel is intentionally omitted: it pins transformers~=4.25, which conflicts
|
||||
# with this pin and previously forced pip into multi-hour resolver backtracking
|
||||
# storms in CI. backend.py imports it lazily and gates the COMPEL=1 env var on
|
||||
# the import succeeding, so dropping it here is safe.
|
||||
@@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def _get_stride_seconds(self):
|
||||
"""Compute the seconds-per-frame stride for the loaded model.
|
||||
|
||||
stride = preprocessor_window_stride * encoder_subsampling_factor
|
||||
"""
|
||||
try:
|
||||
preprocessor = self.model.preprocessor
|
||||
window_stride = preprocessor._cfg.get('window_stride', 0.01)
|
||||
subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8)
|
||||
return window_stride * subsampling_factor
|
||||
except (AttributeError, KeyError, TypeError) as err:
|
||||
print(
|
||||
f"Warning: could not compute stride from model config ({err}), "
|
||||
f"falling back to 0.08s/frame",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 0.08
|
||||
|
||||
def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None):
|
||||
"""Build TranscriptSegment list from a NeMo Hypothesis with timestamps.
|
||||
|
||||
Supports two granularity modes:
|
||||
- "word": one TranscriptSegment per word, each with a single TranscriptWord entry
|
||||
- "segment" (default): merge consecutive words into sentence-level segments,
|
||||
splitting at word-level time gaps that exceed a dynamic threshold.
|
||||
"""
|
||||
if not hypothesis or not isinstance(hypothesis.timestamp, dict):
|
||||
return []
|
||||
|
||||
word_offsets = hypothesis.timestamp.get('word', [])
|
||||
if not word_offsets:
|
||||
return []
|
||||
|
||||
granularities = list(timestamp_granularities) if timestamp_granularities else []
|
||||
granularity = "word" if "word" in granularities else "segment"
|
||||
|
||||
# Build a flat list of (text, start_ns, end_ns) from NeMo word offsets
|
||||
transcript_words = []
|
||||
for wo in word_offsets:
|
||||
word_text = wo.get('word', '')
|
||||
if not word_text:
|
||||
continue
|
||||
start_offset = wo.get('start_offset', 0)
|
||||
end_offset = wo.get('end_offset', start_offset)
|
||||
start_ns = int(start_offset * stride * 1_000_000_000)
|
||||
end_ns = int(end_offset * stride * 1_000_000_000)
|
||||
transcript_words.append({
|
||||
'text': word_text,
|
||||
'start': start_ns,
|
||||
'end': end_ns,
|
||||
})
|
||||
|
||||
if not transcript_words:
|
||||
return []
|
||||
|
||||
if granularity == "word":
|
||||
# One segment per word
|
||||
result = []
|
||||
for idx, tw in enumerate(transcript_words):
|
||||
word = backend_pb2.TranscriptWord(
|
||||
start=tw['start'], end=tw['end'], text=tw['text']
|
||||
)
|
||||
result.append(backend_pb2.TranscriptSegment(
|
||||
id=idx,
|
||||
start=tw['start'],
|
||||
end=tw['end'],
|
||||
text=tw['text'],
|
||||
words=[word],
|
||||
))
|
||||
return result
|
||||
|
||||
# segment mode — merge at word-level time-gap boundaries
|
||||
# Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s
|
||||
gaps = []
|
||||
for i in range(1, len(transcript_words)):
|
||||
gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000
|
||||
if gap > 0:
|
||||
gaps.append(gap)
|
||||
if gaps:
|
||||
gaps.sort()
|
||||
median_gap = gaps[len(gaps) // 2]
|
||||
threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000)
|
||||
else:
|
||||
threshold_ns = int(0.5 * 1_000_000_000)
|
||||
|
||||
result = []
|
||||
buf_words = [] # list of TranscriptWord protobuf
|
||||
buf_start = None
|
||||
buf_end = 0
|
||||
buf_text = []
|
||||
prev_end = None
|
||||
|
||||
for tw in transcript_words:
|
||||
# Detect word-level time gap
|
||||
if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text:
|
||||
seg_text = ' '.join(buf_text)
|
||||
result.append(backend_pb2.TranscriptSegment(
|
||||
id=len(result),
|
||||
start=buf_start,
|
||||
end=buf_end,
|
||||
text=seg_text,
|
||||
words=list(buf_words),
|
||||
))
|
||||
buf_words = []
|
||||
buf_text = []
|
||||
buf_start = None
|
||||
|
||||
if buf_start is None:
|
||||
buf_start = tw['start']
|
||||
buf_end = tw['end']
|
||||
buf_text.append(tw['text'])
|
||||
buf_words.append(backend_pb2.TranscriptWord(
|
||||
start=tw['start'], end=tw['end'], text=tw['text']
|
||||
))
|
||||
prev_end = tw['end']
|
||||
|
||||
# flush remaining
|
||||
if buf_text and buf_start is not None:
|
||||
seg_text = ' '.join(buf_text)
|
||||
result.append(backend_pb2.TranscriptSegment(
|
||||
id=len(result),
|
||||
start=buf_start,
|
||||
end=buf_end,
|
||||
text=seg_text,
|
||||
words=list(buf_words),
|
||||
))
|
||||
|
||||
return result
|
||||
|
||||
def AudioTranscription(self, request, context):
|
||||
result_segments = []
|
||||
text = ""
|
||||
@@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
print(f"Error: Audio file not found: {audio_path}", file=sys.stderr)
|
||||
return backend_pb2.TranscriptResult(segments=[], text="")
|
||||
|
||||
# NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
|
||||
results = self.model.transcribe([audio_path])
|
||||
# Determine requested timestamp granularity
|
||||
timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
|
||||
want_timestamps = bool(timestamp_granularities)
|
||||
|
||||
if not results or len(results) == 0:
|
||||
return backend_pb2.TranscriptResult(segments=[], text="")
|
||||
if want_timestamps:
|
||||
# Request timestamps from NeMo.
|
||||
# timestamps=True forces NeMo to return Hypothesis objects with
|
||||
# the timestamp dict populated, so we omit return_hypotheses to
|
||||
# let NeMo choose the correct return type.
|
||||
results = self.model.transcribe([audio_path], timestamps=True)
|
||||
|
||||
# Get the transcript text from the first result.
|
||||
# CTC models return List[str], TDT/RNNT models return List[Hypothesis]
|
||||
# where the actual text lives in Hypothesis.text.
|
||||
result = results[0]
|
||||
if isinstance(result, str):
|
||||
text = result
|
||||
if results and len(results) > 0:
|
||||
hypotheses = results[0] if isinstance(results[0], list) else results
|
||||
if hypotheses and len(hypotheses) > 0:
|
||||
hypothesis = hypotheses[0]
|
||||
|
||||
# Hypothesis object should have .timestamp populated
|
||||
if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict):
|
||||
print(
|
||||
"Warning: timestamps were requested but NeMo did not return "
|
||||
"Hypothesis objects; falling back to untimestamped output",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
# Extract text
|
||||
if hasattr(hypothesis, 'text'):
|
||||
text = hypothesis.text or ""
|
||||
elif isinstance(hypothesis, str):
|
||||
text = hypothesis
|
||||
|
||||
# Build segments with word-level timestamps
|
||||
stride = self._get_stride_seconds()
|
||||
result_segments = self._build_segments_with_words(
|
||||
hypothesis, stride, timestamp_granularities
|
||||
)
|
||||
|
||||
# If no word offsets but we have text, fall back to single segment
|
||||
if not result_segments and text:
|
||||
result_segments.append(backend_pb2.TranscriptSegment(
|
||||
id=0, start=0, end=0, text=text
|
||||
))
|
||||
else:
|
||||
text = getattr(result, 'text', None) or ""
|
||||
# Simple transcription without timestamps
|
||||
# NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts
|
||||
results = self.model.transcribe([audio_path])
|
||||
|
||||
if text:
|
||||
# Create a single segment with the full transcription
|
||||
result_segments.append(backend_pb2.TranscriptSegment(
|
||||
id=0, start=0, end=0, text=text
|
||||
))
|
||||
if results and len(results) > 0:
|
||||
# Get the transcript text from the first result.
|
||||
# CTC models return List[str], TDT/RNNT models return List[Hypothesis]
|
||||
# where the actual text lives in Hypothesis.text.
|
||||
result = results[0]
|
||||
if isinstance(result, str):
|
||||
text = result
|
||||
else:
|
||||
text = getattr(result, 'text', None) or ""
|
||||
|
||||
if text:
|
||||
# Create a single segment with the full transcription
|
||||
result_segments.append(backend_pb2.TranscriptSegment(
|
||||
id=0, start=0, end=0, text=text
|
||||
))
|
||||
|
||||
except Exception as err:
|
||||
print(f"Error in AudioTranscription: {err}", file=sys.stderr)
|
||||
|
||||
@@ -309,6 +309,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
dataset_split = request.dataset_split or "train"
|
||||
if os.path.exists(request.dataset_source):
|
||||
_allowed_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_DATASET_DIR", os.getcwd())))
|
||||
_real_path = os.path.realpath(os.path.abspath(request.dataset_source))
|
||||
if not (_real_path == _allowed_dir or _real_path.startswith(_allowed_dir + os.sep)):
|
||||
raise ValueError("Dataset source path is outside the allowed directory")
|
||||
if request.dataset_source.endswith('.json') or request.dataset_source.endswith('.jsonl'):
|
||||
dataset = load_dataset("json", data_files=request.dataset_source, split=dataset_split)
|
||||
elif request.dataset_source.endswith('.csv'):
|
||||
@@ -687,6 +691,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def ExportModel(self, request, context):
|
||||
export_format = request.export_format or "lora"
|
||||
output_path = request.output_path
|
||||
_allowed_output_dir = os.path.realpath(os.path.abspath(os.environ.get("LOCALAI_OUTPUT_DIR", os.getcwd())))
|
||||
_real_output_path = os.path.realpath(os.path.abspath(output_path))
|
||||
if not (_real_output_path == _allowed_output_dir or _real_output_path.startswith(_allowed_output_dir + os.sep)):
|
||||
raise ValueError("Output path is outside the allowed directory")
|
||||
output_path = _real_output_path
|
||||
checkpoint_path = request.checkpoint_path
|
||||
|
||||
# Extract HF token for gated model access
|
||||
@@ -807,7 +816,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
env = os.environ.copy()
|
||||
env["NO_LOCAL_GGUF"] = "1"
|
||||
cmd = [sys.executable, convert_script, merge_dir, "--outtype", outtype, "--outfile", gguf_path]
|
||||
conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env)
|
||||
conv_result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600, env=env, shell=False) # nosemgrep: python.django.security.injection.command.subprocess-injection.subprocess-injection
|
||||
if conv_result.returncode != 0:
|
||||
diag = f"stdout: {conv_result.stdout[-300:]}\nstderr: {conv_result.stderr[-500:]}"
|
||||
return backend_pb2.Result(success=False,
|
||||
|
||||
@@ -48,8 +48,10 @@ try:
|
||||
except ImportError:
|
||||
HAS_REASONING_PARSERS = False
|
||||
|
||||
# vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the
|
||||
# SamplingParams field guided_decoding -> structured_outputs.
|
||||
try:
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
HAS_GUIDED_DECODING = True
|
||||
except ImportError:
|
||||
HAS_GUIDED_DECODING = False
|
||||
@@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
if value not in (None, 0, [], False):
|
||||
setattr(sampling_params, param_field, value)
|
||||
|
||||
# Guided decoding: use Grammar field to pass JSON schema or BNF
|
||||
# Structured-output decoding: use Grammar field to pass JSON schema or BNF
|
||||
if HAS_GUIDED_DECODING and request.Grammar:
|
||||
try:
|
||||
json.loads(request.Grammar) # valid JSON = JSON schema
|
||||
sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar)
|
||||
sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar)
|
||||
except json.JSONDecodeError:
|
||||
sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar)
|
||||
sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar)
|
||||
|
||||
# Extract image paths and process images
|
||||
prompt = request.Prompt
|
||||
@@ -596,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
|
||||
# Stream the results
|
||||
generated_text = ""
|
||||
generated_token_ids: list[int] = []
|
||||
last_output = None
|
||||
|
||||
# Tool-parsing strategy decision (made once, before the loop):
|
||||
#
|
||||
# When a tool parser is active, the model's raw tool-call markup
|
||||
# (e.g. <tool_call>...) must not be streamed verbatim as delta.content
|
||||
# — clients would see the unparsed syntax. Two paths:
|
||||
#
|
||||
# (A) native streaming via parser.extract_tool_calls_streaming. All
|
||||
# concrete tool parsers shipped with vLLM 0.23+ implement this
|
||||
# (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes,
|
||||
# llama3_json, mistral, …). The parser decides per-delta whether
|
||||
# to emit content or suppress tool-call markup, and emits a
|
||||
# structured DeltaMessage(tool_calls=[...]) when a call is ready.
|
||||
# (B) buffer fallback — used only when the parser surprisingly lacks
|
||||
# the streaming method or it raises mid-stream. The post-loop
|
||||
# extract_tool_calls assembles the final chat_delta. Same correctness
|
||||
# guarantee as a non-streaming response, at the cost of a delayed
|
||||
# final chunk.
|
||||
has_tool_parser = bool(self.tool_parser_cls and request.Tools)
|
||||
tp_instance = None
|
||||
tp_request = None
|
||||
native_streaming = False
|
||||
native_streaming_error = False
|
||||
if has_tool_parser:
|
||||
try:
|
||||
tools_for_parser = json.loads(request.Tools)
|
||||
except json.JSONDecodeError:
|
||||
tools_for_parser = []
|
||||
try:
|
||||
tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser)
|
||||
except TypeError:
|
||||
tp_instance = self.tool_parser_cls(self.tokenizer)
|
||||
# Build a minimal ChatCompletionRequest so the streaming method
|
||||
# sees the tools list. We do not need any other request fields —
|
||||
# parsers only read .tools (and sometimes .tool_choice, which we
|
||||
# leave at default).
|
||||
try:
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionRequest as _CCR,
|
||||
)
|
||||
tp_request = _CCR(
|
||||
model="local",
|
||||
messages=[{"role": "user", "content": ""}],
|
||||
tools=tools_for_parser or None,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Could not build ChatCompletionRequest for streaming parser: {e}",
|
||||
file=sys.stderr)
|
||||
tp_request = None
|
||||
native_streaming = (
|
||||
tp_request is not None
|
||||
and hasattr(tp_instance, "extract_tool_calls_streaming")
|
||||
)
|
||||
|
||||
try:
|
||||
async for request_output in outputs:
|
||||
iteration_text = request_output.outputs[0].text
|
||||
last_output = request_output
|
||||
|
||||
if streaming:
|
||||
# Remove text already sent as vllm concatenates the text from previous yields
|
||||
delta_iteration_text = iteration_text.removeprefix(generated_text)
|
||||
# Send the partial result
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(delta_iteration_text, encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
|
||||
)
|
||||
new_token_ids = list(request_output.outputs[0].token_ids)
|
||||
delta_token_ids = new_token_ids[len(generated_token_ids):]
|
||||
|
||||
# Keep track of text generated
|
||||
if not has_tool_parser:
|
||||
# Plain streaming — unchanged from pre-tool-parser path.
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(delta_iteration_text, encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)],
|
||||
)
|
||||
elif native_streaming and not native_streaming_error:
|
||||
# (A) Native vLLM extract_tool_calls_streaming.
|
||||
try:
|
||||
msg = tp_instance.extract_tool_calls_streaming(
|
||||
previous_text=generated_text,
|
||||
current_text=iteration_text,
|
||||
delta_text=delta_iteration_text,
|
||||
previous_token_ids=generated_token_ids,
|
||||
current_token_ids=new_token_ids,
|
||||
delta_token_ids=delta_token_ids,
|
||||
request=tp_request,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Streaming tool parser error (falling back to "
|
||||
f"buffer for the rest of the stream): {e}",
|
||||
file=sys.stderr)
|
||||
native_streaming_error = True
|
||||
msg = None
|
||||
if msg is not None:
|
||||
tc_protos = []
|
||||
for tc in (msg.tool_calls or []):
|
||||
fn = tc.function or None
|
||||
tc_protos.append(backend_pb2.ToolCallDelta(
|
||||
index=tc.index,
|
||||
id=tc.id or "",
|
||||
name=(fn.name if fn and fn.name else "") or "",
|
||||
arguments=(fn.arguments if fn and fn.arguments else "") or "",
|
||||
))
|
||||
cd_kwargs = {}
|
||||
if msg.content:
|
||||
cd_kwargs["content"] = msg.content
|
||||
if msg.reasoning:
|
||||
cd_kwargs["reasoning_content"] = msg.reasoning
|
||||
if tc_protos:
|
||||
cd_kwargs["tool_calls"] = tc_protos
|
||||
if cd_kwargs:
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(msg.content or "", encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)],
|
||||
)
|
||||
# (B) buffer fallback — emit nothing during the stream.
|
||||
# The post-loop extract_tool_calls block builds the final chunk.
|
||||
|
||||
# Keep track of text + token_ids generated
|
||||
generated_text = iteration_text
|
||||
generated_token_ids = list(request_output.outputs[0].token_ids)
|
||||
finally:
|
||||
await outputs.aclose()
|
||||
|
||||
@@ -637,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
except Exception as e:
|
||||
print(f"Reasoning parser error: {e}", file=sys.stderr)
|
||||
|
||||
if self.tool_parser_cls and request.Tools:
|
||||
# When (A) native streaming ran cleanly, per-delta yields above already
|
||||
# delivered everything — do NOT extract again on the full text or we'd
|
||||
# duplicate content/tool_calls into the final chunk.
|
||||
if has_tool_parser and not (native_streaming and not native_streaming_error):
|
||||
try:
|
||||
tools = json.loads(request.Tools)
|
||||
# Some concrete parsers only accept the tokenizer; only the
|
||||
# abstract base declares the tools kwarg. Try with tools first,
|
||||
# fall back to tokenizer-only.
|
||||
try:
|
||||
tp = self.tool_parser_cls(self.tokenizer, tools=tools)
|
||||
except TypeError:
|
||||
tp = self.tool_parser_cls(self.tokenizer)
|
||||
tp = tp_instance
|
||||
if tp is None:
|
||||
# Defensive: tp_instance build failed earlier; reconstruct.
|
||||
tools = json.loads(request.Tools)
|
||||
try:
|
||||
tp = self.tool_parser_cls(self.tokenizer, tools=tools)
|
||||
except TypeError:
|
||||
tp = self.tool_parser_cls(self.tokenizer)
|
||||
info = tp.extract_tool_calls(content, request=None)
|
||||
if info.tools_called:
|
||||
content = info.content or ""
|
||||
@@ -659,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
))
|
||||
except Exception as e:
|
||||
print(f"Tool parser error: {e}", file=sys.stderr)
|
||||
elif native_streaming and not native_streaming_error:
|
||||
# Per-delta path already emitted content + tool_calls; the final
|
||||
# chat_delta should carry only metadata (token counts, logprobs).
|
||||
content = ""
|
||||
|
||||
# Extract token counts
|
||||
prompt_tokens = 0
|
||||
@@ -698,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
)
|
||||
|
||||
if streaming:
|
||||
# Final chunk with structured data
|
||||
# Final chunk with structured data.
|
||||
#
|
||||
# If we used the buffer fallback (has_tool_parser=True AND native
|
||||
# streaming did NOT run cleanly) and the parser found no tool call,
|
||||
# flush the buffered content as ONE content delta — and clear the
|
||||
# final chat_delta's content so the metadata chunk does not repeat
|
||||
# what we just sent. This is the plain-text-with-tool-parser path.
|
||||
buffered_fallback = (
|
||||
has_tool_parser
|
||||
and not (native_streaming and not native_streaming_error)
|
||||
)
|
||||
if buffered_fallback and not tool_calls_proto and content:
|
||||
yield backend_pb2.Reply(
|
||||
message=bytes(content, encoding='utf-8'),
|
||||
chat_deltas=[backend_pb2.ChatDelta(content=content)],
|
||||
)
|
||||
chat_delta = backend_pb2.ChatDelta(
|
||||
reasoning_content=reasoning_content,
|
||||
tool_calls=tool_calls_proto,
|
||||
)
|
||||
yield backend_pb2.Reply(
|
||||
message=b"",
|
||||
prompt_tokens=prompt_tokens,
|
||||
|
||||
@@ -278,4 +278,261 @@ class TestBackendServicer(unittest.TestCase):
|
||||
print(err)
|
||||
self.fail("Embedding service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
self.tearDown()
|
||||
|
||||
|
||||
class TestStreamingToolParser(unittest.TestCase):
|
||||
"""
|
||||
Server-less unit tests for the streaming + tool-parser machinery in
|
||||
BackendServicer._predict. These tests instantiate BackendServicer
|
||||
directly and mock the vLLM engine + tool parser, so they do not need
|
||||
a GPU, a model, or a running gRPC server. Kept in a separate class to
|
||||
avoid the parent setUp() which spawns a subprocess.
|
||||
|
||||
Covers #582 (follow-up to #10346):
|
||||
1. Markup-leak prevention with a non-streaming parser (buffer fallback)
|
||||
2. No content duplication on the plain-text path with the buffer fallback
|
||||
3. Native streaming progressive plain-text emission
|
||||
4. Native streaming structured tool_call, no markup leak
|
||||
5. Parser exception → graceful fallback to buffer, still no markup
|
||||
6. No-tool-parser regression: unchanged per-delta content stream
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _make_generate(chunks):
|
||||
"""Build a fake vLLM engine.generate that yields cumulative chunks."""
|
||||
from types import SimpleNamespace
|
||||
async def gen(*a, **k):
|
||||
for i, t in enumerate(chunks):
|
||||
yield SimpleNamespace(
|
||||
outputs=[SimpleNamespace(
|
||||
text=t,
|
||||
token_ids=list(range(i + 1)),
|
||||
logprobs=None,
|
||||
)],
|
||||
prompt_token_ids=[0],
|
||||
)
|
||||
return lambda *a, **k: gen()
|
||||
|
||||
@staticmethod
|
||||
def _collect(servicer, req):
|
||||
import asyncio
|
||||
async def run():
|
||||
return [r async for r in servicer._predict(req, None, streaming=True)]
|
||||
return asyncio.run(run())
|
||||
|
||||
def _new_servicer(self):
    """Create a BackendServicer with its parser and tokenizer attributes cleared.

    This file's directory is prepended to ``sys.path`` before importing
    ``backend``. The returned instance has ``reasoning_parser_cls``,
    ``tool_parser_cls`` and ``tokenizer`` all set to ``None``; tests assign
    whatever they need afterwards.
    """
    import os
    import sys

    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, here)
    from backend import BackendServicer

    servicer = BackendServicer()
    for attr in ("reasoning_parser_cls", "tool_parser_cls", "tokenizer"):
        setattr(servicer, attr, None)
    return servicer
|
||||
|
||||
# ── Case 1+2: parser without streaming method → buffer fallback ──
|
||||
def test_buffer_path_no_markup_no_duplication(self):
    """Parser without a streaming method: no markup leak, no duplicated text.

    Scenario one feeds tool-call markup and checks that no content delta
    contains "<tool_call" while a "calc" tool call is reported. Scenario two
    feeds plain text with tools configured and checks that the full sentence
    appears exactly once in the concatenated content deltas.
    """
    from types import SimpleNamespace

    tools_json = '[{"type":"function","function":{"name":"calc","parameters":{}}}]'

    def parser_cls(called, content_text, calls):
        # Factory for a parser class that deliberately defines NO
        # extract_tool_calls_streaming, so the servicer takes the buffer path.
        # extract_tool_calls always answers with the canned values.
        canned = dict(tools_called=called, content=content_text, tool_calls=calls)

        class _P:
            def __init__(self, tokenizer, tools=None):
                pass

            def extract_tool_calls(self, c, request=None):
                return SimpleNamespace(**canned)

        return _P

    # Tool-call case: no raw markup in any delta.content
    tool_servicer = self._new_servicer()
    tool_chunks = [
        '<tool_call>\n{"name": "calc"',
        '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
    ]
    tool_servicer.llm = SimpleNamespace(generate=self._make_generate(tool_chunks))
    calc_fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
    call = SimpleNamespace(id="call_1", function=calc_fn)
    tool_servicer.tool_parser_cls = parser_cls(True, "", [call])
    req = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
    replies = self._collect(tool_servicer, req)

    deltas = [cd for r in replies for cd in r.chat_deltas]
    contents = [cd.content for cd in deltas if cd.content]
    leaked = any("<tool_call" in c for c in contents)
    self.assertFalse(leaked, f"markup leaked: {contents!r}")
    names = [tc.name for cd in deltas for tc in cd.tool_calls]
    self.assertIn("calc", names, "tool_call missing from final chunk")

    # Plain-text-with-tools case: full content delivered exactly once
    text_servicer = self._new_servicer()
    text_chunks = [
        "The capital ",
        "The capital of France is Paris.",
    ]
    text_servicer.llm = SimpleNamespace(generate=self._make_generate(text_chunks))
    text_servicer.tool_parser_cls = parser_cls(False, "", [])
    req2 = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json)
    pieces = []
    for r in self._collect(text_servicer, req2):
        for cd in r.chat_deltas:
            if cd.content:
                pieces.append(cd.content)
    joined = "".join(pieces)
    occurrences = joined.count("The capital of France is Paris.")
    self.assertEqual(
        occurrences, 1,
        f"buffered content duplicated: {joined!r}",
    )
|
||||
|
||||
# ── Case 3: native streaming, progressive plain text ──
|
||||
def test_native_streaming_progressive_plain_text(self):
    """Streaming-capable parser: plain text arrives progressively and intact.

    The fake parser echoes each non-empty delta as content and raises if
    extract_tool_calls is ever invoked. The test expects at least one content
    delta before the last reply, and the concatenation of all content deltas
    to equal the full sentence.
    """
    from types import SimpleNamespace

    class _DeltaMsg:
        # Minimal message object exposing the three attributes that are set here.
        def __init__(self, content=None, reasoning=None, tool_calls=None):
            self.content = content
            self.reasoning = reasoning
            self.tool_calls = tool_calls or []

    class StreamingParser:
        def __init__(self, tokenizer, tools=None):
            pass

        def extract_tool_calls(self, c, request=None):
            # Should NOT be called when native streaming runs successfully.
            raise AssertionError("extract_tool_calls invoked on native-streaming path")

        def extract_tool_calls_streaming(
            self, previous_text, current_text, delta_text,
            previous_token_ids, current_token_ids, delta_token_ids, request,
        ):
            # Echo every non-empty delta as plain content.
            return _DeltaMsg(content=delta_text) if delta_text else None

    servicer = self._new_servicer()
    cumulative_chunks = [
        "Paris ",
        "Paris is ",
        "Paris is the capital of France.",
    ]
    servicer.llm = SimpleNamespace(generate=self._make_generate(cumulative_chunks))
    servicer.tool_parser_cls = StreamingParser
    req = backend_pb2.PredictOptions(
        Prompt="x",
        Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
    )
    replies = self._collect(servicer, req)

    def _content_of(reply_list):
        # Non-empty content strings, in reply order.
        return [cd.content for r in reply_list for cd in r.chat_deltas if cd.content]

    intermediate_content = _content_of(replies[:-1])
    self.assertTrue(
        len(intermediate_content) > 0,
        "Plain-text response not streamed progressively (native streaming inactive?)",
    )
    assembled = "".join(_content_of(replies))
    self.assertEqual(
        assembled, "Paris is the capital of France.",
        f"Assembled content wrong: {assembled!r}",
    )
|
||||
|
||||
# ── Case 4: native streaming, structured tool_call, no markup ──
|
||||
def test_native_streaming_tool_call_no_markup_leak(self):
    """Streaming-capable parser: a tool call is reported without leaking markup.

    The fake parser emits nothing until "</tool_call>" shows up in the
    cumulative text, then returns a single structured "calc" call once. The
    test expects no content delta to contain the markup tags, and the call's
    name and arguments to appear among the reported tool calls.
    """
    from types import SimpleNamespace

    class _DeltaMsg:
        # Minimal message object exposing the three attributes that are set here.
        def __init__(self, content=None, reasoning=None, tool_calls=None):
            self.content = content
            self.reasoning = reasoning
            self.tool_calls = tool_calls or []

    class _ToolCallStreamer:
        def __init__(self, tokenizer, tools=None):
            self._emitted = False

        def extract_tool_calls(self, c, request=None):
            raise AssertionError("extract_tool_calls invoked on native-streaming path")

        def extract_tool_calls_streaming(
            self, previous_text, current_text, delta_text,
            previous_token_ids, current_token_ids, delta_token_ids, request,
        ):
            # Stay silent until the closing tag is seen, and emit only once.
            if self._emitted or "</tool_call>" not in current_text:
                return None
            self._emitted = True
            fn = SimpleNamespace(name="calc", arguments='{"x": 1}')
            tc = SimpleNamespace(id="call_1", type="function", index=0, function=fn)
            return _DeltaMsg(tool_calls=[tc])

    servicer = self._new_servicer()
    cumulative_chunks = [
        '<tool_call>\n',
        '<tool_call>\n{"name": "calc"',
        '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
    ]
    servicer.llm = SimpleNamespace(generate=self._make_generate(cumulative_chunks))
    servicer.tool_parser_cls = _ToolCallStreamer
    req = backend_pb2.PredictOptions(
        Prompt="x",
        Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
    )
    replies = self._collect(servicer, req)

    deltas = [cd for r in replies for cd in r.chat_deltas]
    contents = [cd.content for cd in deltas if cd.content]
    self.assertFalse(
        any("<tool_call" in c or "</tool_call>" in c for c in contents),
        f"markup leaked as content: {contents!r}",
    )
    reported_calls = [tc for cd in deltas for tc in cd.tool_calls]
    names = [tc.name for tc in reported_calls if tc.name]
    args = [tc.arguments for tc in reported_calls if tc.arguments]
    self.assertIn("calc", names, f"tool_call name missing; got {names!r}")
    self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}")
|
||||
|
||||
# ── Case 5: parser exception → fallback to buffer, no leak ──
def test_native_streaming_parser_exception_falls_back_to_buffer(self):
    """A streaming parser that raises must not leak tool-call markup.

    The stub parser raises from ``extract_tool_calls_streaming`` but
    answers ``extract_tool_calls`` with a single ``calc`` call. The test
    checks that no content delta contains ``<tool_call`` and that a tool
    call named ``calc`` still shows up in the collected replies.
    """
    from types import SimpleNamespace

    calc_function = SimpleNamespace(name="calc", arguments='{"x": 1}')
    call = SimpleNamespace(id="call_1", function=calc_function)

    class _BrokenStreamer:
        def __init__(self, tokenizer, tools=None):
            pass

        def extract_tool_calls(self, c, request=None):
            # Non-streaming extraction works and reports the call.
            return SimpleNamespace(tools_called=True, content="", tool_calls=[call])

        def extract_tool_calls_streaming(self, *a, **kw):
            # Streaming extraction always fails.
            raise RuntimeError("simulated parser bug")

    generated = [
        '<tool_call>\n{"name": "calc"',
        '<tool_call>\n{"name": "calc", "arguments": {"x": 1}}\n</tool_call>',
    ]
    s = self._new_servicer()
    s.llm = SimpleNamespace(generate=self._make_generate(generated))
    s.tool_parser_cls = _BrokenStreamer
    req = backend_pb2.PredictOptions(
        Prompt="x",
        Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]',
    )
    replies = self._collect(s, req)

    contents = []
    names = []
    for reply in replies:
        for delta in reply.chat_deltas:
            if delta.content:
                contents.append(delta.content)
            for tool_call in delta.tool_calls:
                names.append(tool_call.name)

    leaked = any("<tool_call" in c for c in contents)
    self.assertFalse(
        leaked,
        f"markup leaked after parser exception: {contents!r}",
    )
    self.assertIn("calc", names, "tool_call missing from final chunk after fallback")
|
||||
|
||||
# ── Case 6: no tool parser → unchanged per-delta content stream ──
def test_no_tool_parser_unchanged_per_delta_stream(self):
    """Without a tool parser, content is streamed as incremental pieces.

    The fake generator is given three growing texts; every reply except
    the last one is expected to carry exactly the newly added piece.
    """
    from types import SimpleNamespace

    servicer = self._new_servicer()  # tool_parser_cls already None
    servicer.llm = SimpleNamespace(generate=self._make_generate([
        "Hello ", "Hello world", "Hello world!",
    ]))
    request = backend_pb2.PredictOptions(Prompt="x", Tools="")
    replies = self._collect(servicer, request)

    # Gather the non-empty content of every reply but the final one.
    intermediate = []
    for reply in replies[:-1]:
        for delta in reply.chat_deltas:
            if delta.content:
                intermediate.append(delta.content)

    self.assertEqual(
        intermediate, ["Hello ", "world", "!"],
        f"plain streaming changed; got {intermediate!r}",
    )
|
||||
|
||||
@@ -341,11 +341,9 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
|
||||
}
|
||||
appCfg := a.ApplicationConfig()
|
||||
|
||||
if cfg.PII.Enabled != nil {
|
||||
enabled = *cfg.PII.Enabled
|
||||
} else {
|
||||
enabled = cfg.PIIIsEnabled() // backend default (cloud-proxy)
|
||||
}
|
||||
// PIIIsEnabled already encodes "explicit pii.enabled wins, else backend
|
||||
// default (cloud-proxy)" — the single source of that rule.
|
||||
enabled = cfg.PIIIsEnabled()
|
||||
if !enabled {
|
||||
return false, nil
|
||||
}
|
||||
@@ -354,7 +352,7 @@ func (a *Application) ResolvePIIPolicy(cfg *config.ModelConfig) (enabled bool, d
|
||||
if len(detectors) == 0 {
|
||||
detectors = append([]string(nil), appCfg.PIIDefaultDetectors...)
|
||||
}
|
||||
return enabled, detectors
|
||||
return true, detectors // enabled is necessarily true past the !enabled guard
|
||||
}
|
||||
|
||||
// PIIPolicyResolver adapts ResolvePIIPolicy to pii.PolicyResolver for
|
||||
|
||||
@@ -357,6 +357,15 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
|
||||
Pressure: pressure,
|
||||
})
|
||||
|
||||
// Wire staging-progress broadcasting so file-staging shows up on every
|
||||
// replica, not just the one performing the transfer. Without this, a
|
||||
// /api/operations poll that round-robins onto a peer sees no staging row and
|
||||
// the progress flickers. The origin publishes; peers mirror via the wildcard.
|
||||
router.StagingTracker().SetPublisher(natsClient)
|
||||
if _, err := router.StagingTracker().SubscribeBroadcasts(natsClient); err != nil {
|
||||
xlog.Warn("Failed to subscribe to staging progress broadcasts", "error", err)
|
||||
}
|
||||
|
||||
// Create ReplicaReconciler for auto-scaling model replicas. Adapter +
|
||||
// RegistrationToken feed the state-reconciliation passes: pending op
|
||||
// drain uses the adapter, and model health probes use the token to auth
|
||||
|
||||
@@ -25,6 +25,7 @@ import (
|
||||
"github.com/mudler/LocalAI/core/services/storage"
|
||||
coreStartup "github.com/mudler/LocalAI/core/startup"
|
||||
"github.com/mudler/LocalAI/internal"
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
"github.com/mudler/LocalAI/pkg/signals"
|
||||
"github.com/mudler/LocalAI/pkg/vram"
|
||||
|
||||
@@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to create ModelPath: %q", err)
|
||||
}
|
||||
|
||||
// Reap *.partial downloads abandoned by a previous run (killed mid-transfer
|
||||
// by an OOM/restart, or stalled before cleanup could run). The 24h window
|
||||
// is well beyond any legitimate in-flight download, so this never trims an
|
||||
// active transfer; it just stops dead partials accumulating on the volume.
|
||||
if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil {
|
||||
xlog.Warn("Failed to reap stale partial downloads", "error", cErr)
|
||||
} else if removed > 0 {
|
||||
xlog.Info("Reaped stale partial downloads", "count", removed)
|
||||
}
|
||||
if options.GeneratedContentDir != "" {
|
||||
err := os.MkdirAll(options.GeneratedContentDir, 0o750)
|
||||
if err != nil {
|
||||
@@ -633,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
|
||||
options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
||||
}
|
||||
}
|
||||
if settings.SizeAwareEviction != nil {
|
||||
// Only apply if current value is default (false), suggesting it wasn't set from env var
|
||||
if !options.SizeAwareEviction {
|
||||
options.SizeAwareEviction = *settings.SizeAwareEviction
|
||||
}
|
||||
}
|
||||
if settings.LRUEvictionMaxRetries != nil {
|
||||
// Only apply if current value is default (30), suggesting it wasn't set from env var
|
||||
if options.LRUEvictionMaxRetries == 0 {
|
||||
@@ -836,6 +853,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
|
||||
model.WithLRULimit(lruLimit),
|
||||
model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
|
||||
model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
|
||||
model.WithSizeAwareEviction(options.SizeAwareEviction),
|
||||
)
|
||||
application.ModelLoader().SetWatchDog(wd)
|
||||
|
||||
|
||||
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
|
||||
model.WithLRULimit(lruLimit),
|
||||
model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
|
||||
model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
|
||||
model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
|
||||
)
|
||||
|
||||
// Create new stop channel BEFORE setting up any goroutines
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math/rand/v2"
|
||||
@@ -12,7 +13,9 @@ import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/trace"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/vram"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
|
||||
})
|
||||
}
|
||||
|
||||
// estimateModelSizeBytes uses the unified EstimateModel entry point to compute
|
||||
// the total weight-file size for a model config. It collects all weight files
|
||||
// from DownloadFiles, Model, and MMProj, and also extracts the HuggingFace
|
||||
// repo ID so EstimateModel can fall back to the HF API when local file
|
||||
// metadata is unavailable (e.g. not-yet-downloaded models).
|
||||
func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
|
||||
seen := make(map[string]bool)
|
||||
input := vram.ModelEstimateInput{}
|
||||
|
||||
addFile := func(uri string) {
|
||||
if !vram.IsWeightFile(uri) {
|
||||
return
|
||||
}
|
||||
resolved := uri
|
||||
if !strings.Contains(uri, "://") {
|
||||
resolved = "file://" + filepath.Join(modelsPath, uri)
|
||||
}
|
||||
if seen[resolved] {
|
||||
return
|
||||
}
|
||||
seen[resolved] = true
|
||||
input.Files = append(input.Files, vram.FileInput{URI: resolved})
|
||||
}
|
||||
|
||||
// tryHFRepo resolves any huggingface:// or hf:// URI to an HTTPS URL and
|
||||
// then extracts the org/model repo ID for use as the HF fallback path.
|
||||
tryHFRepo := func(uri string) {
|
||||
if input.HFRepo != "" {
|
||||
return
|
||||
}
|
||||
resolved := downloader.URI(uri).ResolveURL()
|
||||
if repoID, ok := vram.ExtractHFRepoID(resolved); ok {
|
||||
input.HFRepo = repoID
|
||||
}
|
||||
}
|
||||
|
||||
for _, f := range c.DownloadFiles {
|
||||
uriStr := string(f.URI)
|
||||
addFile(uriStr)
|
||||
tryHFRepo(uriStr)
|
||||
}
|
||||
addFile(c.Model)
|
||||
tryHFRepo(c.Model)
|
||||
if c.MMProj != "" {
|
||||
addFile(c.MMProj)
|
||||
}
|
||||
|
||||
if len(input.Files) == 0 && input.HFRepo == "" {
|
||||
return 0
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
result, err := vram.EstimateModelMultiContext(ctx, input, nil)
|
||||
if err != nil || result.SizeBytes == 0 {
|
||||
return 0
|
||||
}
|
||||
return int64(result.SizeBytes)
|
||||
}
|
||||
|
||||
func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
|
||||
defOpts := []model.Option{
|
||||
model.WithBackendString(c.Backend),
|
||||
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
|
||||
defOpts = append(defOpts, model.WithExternalBackend(k, v))
|
||||
}
|
||||
|
||||
if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
|
||||
defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
|
||||
}
|
||||
|
||||
return append(defOpts, opts...)
|
||||
}
|
||||
|
||||
@@ -90,10 +158,11 @@ func getSeed(c config.ModelConfig) int32 {
|
||||
// DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a
|
||||
// model config leaves them unset. Exported so callers that must respect the
|
||||
// effective decode window — notably the router's prompt trimmer — resolve the
|
||||
// same numbers grpcModelOpts does instead of guessing.
|
||||
// same numbers grpcModelOpts does instead of guessing. The values are owned by
|
||||
// core/config (single source of truth shared with the config default tiers).
|
||||
const (
|
||||
DefaultContextSize = 4096
|
||||
DefaultBatchSize = 512
|
||||
DefaultContextSize = config.DefaultContextSize
|
||||
DefaultBatchSize = config.DefaultPhysicalBatch
|
||||
)
|
||||
|
||||
// EffectiveContextSize is the context window the backend will run with: the
|
||||
@@ -129,7 +198,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
|
||||
ctxSize := EffectiveContextSize(c)
|
||||
b := EffectiveBatchSize(c)
|
||||
|
||||
flashAttention := "auto"
|
||||
flashAttention := config.DefaultFlashAttention
|
||||
|
||||
if c.FlashAttention != nil {
|
||||
flashAttention = *c.FlashAttention
|
||||
@@ -175,7 +244,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
|
||||
mmlock = *c.MMlock
|
||||
}
|
||||
|
||||
nGPULayers := 9999999
|
||||
nGPULayers := config.DefaultNGPULayers
|
||||
if c.NGPULayers != nil {
|
||||
nGPULayers = *c.NGPULayers
|
||||
}
|
||||
|
||||
88
core/backend/sound_classification.go
Normal file
88
core/backend/sound_classification.go
Normal file
@@ -0,0 +1,88 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
grpcPkg "github.com/mudler/LocalAI/pkg/grpc"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
// SoundDetectionRequest carries the knobs the HTTP layer collects for an
|
||||
// audio-tagging / sound-event-classification call. Audio is the path to the
|
||||
// uploaded clip on disk; TopK and Threshold are optional (0 = backend default).
|
||||
type SoundDetectionRequest struct {
|
||||
Audio string
|
||||
TopK int32
|
||||
Threshold float32
|
||||
}
|
||||
|
||||
func (r *SoundDetectionRequest) toProto() *proto.SoundDetectionRequest {
|
||||
return &proto.SoundDetectionRequest{
|
||||
Src: r.Audio,
|
||||
TopK: r.TopK,
|
||||
Threshold: r.Threshold,
|
||||
}
|
||||
}
|
||||
|
||||
func loadSoundDetectionModel(ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (grpcPkg.Backend, error) {
|
||||
if modelConfig.Backend == "" {
|
||||
return nil, fmt.Errorf("sound classification: model %q has no backend set; supported backends include ced", modelConfig.Name)
|
||||
}
|
||||
opts := ModelOptions(modelConfig, appConfig)
|
||||
m, err := ml.Load(opts...)
|
||||
if err != nil {
|
||||
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
|
||||
return nil, err
|
||||
}
|
||||
if m == nil {
|
||||
return nil, fmt.Errorf("could not load sound classification model")
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// ModelSoundDetection runs the SoundDetection RPC against the configured
|
||||
// backend and returns a normalized schema.SoundClassificationResult.
|
||||
func ModelSoundDetection(ctx context.Context, req SoundDetectionRequest, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.SoundClassificationResult, error) {
|
||||
m, err := loadSoundDetectionModel(ml, modelConfig, appConfig)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
r, err := m.SoundDetection(ctx, req.toProto())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return soundClassificationResultFromProto(modelConfig.Name, r), nil
|
||||
}
|
||||
|
||||
// soundClassificationResultFromProto maps the backend detections to the
|
||||
// HTTP-facing schema, keeping the backend's score-descending order.
|
||||
func soundClassificationResultFromProto(modelName string, r *proto.SoundDetectionResponse) *schema.SoundClassificationResult {
|
||||
out := &schema.SoundClassificationResult{
|
||||
Model: modelName,
|
||||
Detections: []schema.SoundClassification{},
|
||||
}
|
||||
if r == nil {
|
||||
return out
|
||||
}
|
||||
for _, d := range r.Detections {
|
||||
if d == nil {
|
||||
continue
|
||||
}
|
||||
out.Detections = append(out.Detections, schema.SoundClassification{
|
||||
Index: int(d.Index),
|
||||
Label: d.Label,
|
||||
Score: d.Score,
|
||||
})
|
||||
}
|
||||
sort.SliceStable(out.Detections, func(i, j int) bool {
|
||||
return out.Detections[i].Score > out.Detections[j].Score
|
||||
})
|
||||
return out
|
||||
}
|
||||
@@ -93,6 +93,7 @@ type RunCMD struct {
|
||||
EnableMemoryReclaimer bool `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
|
||||
MemoryReclaimerThreshold float64 `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
|
||||
ForceEvictionWhenBusy bool `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
|
||||
SizeAwareEviction bool `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
|
||||
LRUEvictionMaxRetries int `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
|
||||
LRUEvictionRetryInterval string `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
|
||||
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
|
||||
@@ -564,6 +565,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
if r.ForceEvictionWhenBusy {
|
||||
opts = append(opts, config.WithForceEvictionWhenBusy(true))
|
||||
}
|
||||
if r.SizeAwareEviction {
|
||||
opts = append(opts, config.WithSizeAwareEviction(true))
|
||||
}
|
||||
if r.LRUEvictionMaxRetries > 0 {
|
||||
opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
|
||||
}
|
||||
|
||||
@@ -119,6 +119,7 @@ type ApplicationConfig struct {
|
||||
|
||||
// Eviction settings
|
||||
ForceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
||||
SizeAwareEviction bool // Evict largest models first rather than least-recently-used (default: false)
|
||||
LRUEvictionMaxRetries int // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
||||
LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
|
||||
|
||||
@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
|
||||
}
|
||||
}
|
||||
|
||||
// WithSizeAwareEviction enables size-aware eviction ordering.
|
||||
// When true, the watchdog evicts the largest loaded model first rather than the
|
||||
// least-recently-used one, keeping small utility models resident and maximizing
|
||||
// memory freed per eviction.
|
||||
func WithSizeAwareEviction(enabled bool) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.SizeAwareEviction = enabled
|
||||
}
|
||||
}
|
||||
|
||||
// WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
|
||||
func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
@@ -1028,6 +1039,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
|
||||
memoryReclaimerEnabled := o.MemoryReclaimerEnabled
|
||||
memoryReclaimerThreshold := o.MemoryReclaimerThreshold
|
||||
forceEvictionWhenBusy := o.ForceEvictionWhenBusy
|
||||
sizeAwareEviction := o.SizeAwareEviction
|
||||
lruEvictionMaxRetries := o.LRUEvictionMaxRetries
|
||||
threads := o.Threads
|
||||
contextSize := o.ContextSize
|
||||
@@ -1120,6 +1132,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
|
||||
MemoryReclaimerEnabled: &memoryReclaimerEnabled,
|
||||
MemoryReclaimerThreshold: &memoryReclaimerThreshold,
|
||||
ForceEvictionWhenBusy: &forceEvictionWhenBusy,
|
||||
SizeAwareEviction: &sizeAwareEviction,
|
||||
LRUEvictionMaxRetries: &lruEvictionMaxRetries,
|
||||
LRUEvictionRetryInterval: &lruEvictionRetryInterval,
|
||||
Threads: &threads,
|
||||
@@ -1244,6 +1257,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
|
||||
o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
}
|
||||
if settings.SizeAwareEviction != nil {
|
||||
o.SizeAwareEviction = *settings.SizeAwareEviction
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
}
|
||||
if settings.LRUEvictionMaxRetries != nil {
|
||||
o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
|
||||
@@ -8,27 +8,28 @@ import (
|
||||
// Usecase name constants — the canonical string values used in gallery entries,
|
||||
// model configs (known_usecases), and UsecaseInfoMap keys.
|
||||
const (
|
||||
UsecaseChat = "chat"
|
||||
UsecaseCompletion = "completion"
|
||||
UsecaseEdit = "edit"
|
||||
UsecaseVision = "vision"
|
||||
UsecaseEmbeddings = "embeddings"
|
||||
UsecaseTokenize = "tokenize"
|
||||
UsecaseImage = "image"
|
||||
UsecaseVideo = "video"
|
||||
UsecaseTranscript = "transcript"
|
||||
UsecaseTTS = "tts"
|
||||
UsecaseSoundGeneration = "sound_generation"
|
||||
UsecaseRerank = "rerank"
|
||||
UsecaseDetection = "detection"
|
||||
UsecaseDepth = "depth"
|
||||
UsecaseVAD = "vad"
|
||||
UsecaseAudioTransform = "audio_transform"
|
||||
UsecaseDiarization = "diarization"
|
||||
UsecaseRealtimeAudio = "realtime_audio"
|
||||
UsecaseFaceRecognition = "face_recognition"
|
||||
UsecaseSpeakerRecognition = "speaker_recognition"
|
||||
UsecaseTokenClassify = "token_classify"
|
||||
UsecaseChat = "chat"
|
||||
UsecaseCompletion = "completion"
|
||||
UsecaseEdit = "edit"
|
||||
UsecaseVision = "vision"
|
||||
UsecaseEmbeddings = "embeddings"
|
||||
UsecaseTokenize = "tokenize"
|
||||
UsecaseImage = "image"
|
||||
UsecaseVideo = "video"
|
||||
UsecaseTranscript = "transcript"
|
||||
UsecaseTTS = "tts"
|
||||
UsecaseSoundGeneration = "sound_generation"
|
||||
UsecaseRerank = "rerank"
|
||||
UsecaseDetection = "detection"
|
||||
UsecaseDepth = "depth"
|
||||
UsecaseVAD = "vad"
|
||||
UsecaseAudioTransform = "audio_transform"
|
||||
UsecaseDiarization = "diarization"
|
||||
UsecaseSoundClassification = "sound_classification"
|
||||
UsecaseRealtimeAudio = "realtime_audio"
|
||||
UsecaseFaceRecognition = "face_recognition"
|
||||
UsecaseSpeakerRecognition = "speaker_recognition"
|
||||
UsecaseTokenClassify = "token_classify"
|
||||
)
|
||||
|
||||
// GRPCMethod identifies a Backend service RPC from backend.proto.
|
||||
@@ -51,6 +52,7 @@ const (
|
||||
MethodVAD GRPCMethod = "VAD"
|
||||
MethodAudioTransform GRPCMethod = "AudioTransform"
|
||||
MethodDiarize GRPCMethod = "Diarize"
|
||||
MethodSoundDetection GRPCMethod = "SoundDetection"
|
||||
MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
|
||||
MethodFaceVerify GRPCMethod = "FaceVerify"
|
||||
MethodFaceAnalyze GRPCMethod = "FaceAnalyze"
|
||||
@@ -165,6 +167,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
|
||||
GRPCMethod: MethodDiarize,
|
||||
Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
|
||||
},
|
||||
UsecaseSoundClassification: {
|
||||
Flag: FLAG_SOUND_CLASSIFICATION,
|
||||
GRPCMethod: MethodSoundDetection,
|
||||
Description: "Sound-event classification / audio tagging (scored AudioSet labels like baby cry, glass breaking, alarms) via the SoundDetection RPC.",
|
||||
},
|
||||
UsecaseRealtimeAudio: {
|
||||
Flag: FLAG_REALTIME_AUDIO,
|
||||
GRPCMethod: MethodAudioToAudioStream,
|
||||
|
||||
30
core/config/defaults.go
Normal file
30
core/config/defaults.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package config
|
||||
|
||||
// Canonical default values.
|
||||
//
|
||||
// These are owned here so the two layers that need them share a single source
|
||||
// of truth: the config tiers (ApplyInference/Hardware/Serving/Generic — which
|
||||
// *decide* defaults) and core/backend/options.go (which *translates* a
|
||||
// ModelConfig to the backend wire format and supplies the same fallbacks
|
||||
// defensively). Previously these were duplicated as literals across both
|
||||
// packages and had drifted (e.g. n_gpu_layers 9999999 vs 99999999, two batch
|
||||
// constants of 512). core/backend imports core/config, so backend references
|
||||
// these; config never imports backend.
|
||||
const (
|
||||
// DefaultContextSize is the fallback context window when none is configured
|
||||
// or estimable from the model.
|
||||
DefaultContextSize = 4096
|
||||
|
||||
// GGUFFallbackContextSize is the context window for a GGUF model whose
|
||||
// metadata yields no usable estimate (see guessGGUFFromFile). Deliberately
|
||||
// smaller than DefaultContextSize to stay conservative on memory there.
|
||||
GGUFFallbackContextSize = 1024
|
||||
|
||||
// DefaultNGPULayers means "offload all layers"; the backend (fit_params)
|
||||
// clamps to what actually fits in device memory.
|
||||
DefaultNGPULayers = 99999999
|
||||
|
||||
// DefaultFlashAttention is the flash-attention mode default; "auto" lets the
|
||||
// backend enable it when the model + backend support it.
|
||||
DefaultFlashAttention = "auto"
|
||||
)
|
||||
115
core/config/generic_defaults.go
Normal file
115
core/config/generic_defaults.go
Normal file
@@ -0,0 +1,115 @@
|
||||
package config
|
||||
|
||||
import "os"
|
||||
|
||||
// ApplyGenericDefaults fills the generic fallback values applied after the
|
||||
// higher-priority tiers (ApplyInferenceDefaults for the model family,
|
||||
// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving
|
||||
// policy): sampling parameters and a few runtime flags. Like the other tiers it
|
||||
// only fills values still left unset, so model-family / explicit config wins.
|
||||
func ApplyGenericDefaults(cfg *ModelConfig) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
||||
defaultTopP := 0.95
|
||||
defaultTopK := 40
|
||||
defaultMinP := 0.0
|
||||
defaultTemp := 0.9
|
||||
// https://github.com/mudler/LocalAI/issues/2780
|
||||
defaultMirostat := 0
|
||||
defaultMirostatTAU := 5.0
|
||||
defaultMirostatETA := 0.1
|
||||
defaultTypicalP := 1.0
|
||||
defaultTFZ := 1.0
|
||||
defaultZero := 0
|
||||
|
||||
trueV := true
|
||||
falseV := false
|
||||
|
||||
if cfg.Seed == nil {
|
||||
// random number generator seed
|
||||
defaultSeed := RAND_SEED
|
||||
cfg.Seed = &defaultSeed
|
||||
}
|
||||
|
||||
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
|
||||
// native default differs (issue #6632). Only inject it for the llama.cpp
|
||||
// family and the empty/auto backend; leave TopK nil for known non-llama
|
||||
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
|
||||
// is 0 rather than a silently-changed 40.
|
||||
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
|
||||
cfg.TopK = &defaultTopK
|
||||
}
|
||||
|
||||
if cfg.MinP == nil {
|
||||
cfg.MinP = &defaultMinP
|
||||
}
|
||||
|
||||
if cfg.TypicalP == nil {
|
||||
cfg.TypicalP = &defaultTypicalP
|
||||
}
|
||||
|
||||
if cfg.TFZ == nil {
|
||||
cfg.TFZ = &defaultTFZ
|
||||
}
|
||||
|
||||
if cfg.MMap == nil {
|
||||
// MMap is enabled by default
|
||||
|
||||
// Only exception is for Intel GPUs
|
||||
if os.Getenv("XPU") != "" {
|
||||
cfg.MMap = &falseV
|
||||
} else {
|
||||
cfg.MMap = &trueV
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.MMlock == nil {
|
||||
// MMlock is disabled by default
|
||||
cfg.MMlock = &falseV
|
||||
}
|
||||
|
||||
if cfg.TopP == nil {
|
||||
cfg.TopP = &defaultTopP
|
||||
}
|
||||
if cfg.Temperature == nil {
|
||||
cfg.Temperature = &defaultTemp
|
||||
}
|
||||
|
||||
if cfg.Maxtokens == nil {
|
||||
cfg.Maxtokens = &defaultZero
|
||||
}
|
||||
|
||||
if cfg.Mirostat == nil {
|
||||
cfg.Mirostat = &defaultMirostat
|
||||
}
|
||||
|
||||
if cfg.MirostatETA == nil {
|
||||
cfg.MirostatETA = &defaultMirostatETA
|
||||
}
|
||||
|
||||
if cfg.MirostatTAU == nil {
|
||||
cfg.MirostatTAU = &defaultMirostatTAU
|
||||
}
|
||||
|
||||
if cfg.LowVRAM == nil {
|
||||
cfg.LowVRAM = &falseV
|
||||
}
|
||||
|
||||
if cfg.Embeddings == nil {
|
||||
cfg.Embeddings = &falseV
|
||||
}
|
||||
|
||||
if cfg.Reranking == nil {
|
||||
cfg.Reranking = &falseV
|
||||
}
|
||||
|
||||
if cfg.PromptCacheAll == nil {
|
||||
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
||||
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
||||
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
||||
cfg.PromptCacheAll = &trueV
|
||||
}
|
||||
}
|
||||
36
core/config/generic_defaults_test.go
Normal file
36
core/config/generic_defaults_test.go
Normal file
@@ -0,0 +1,36 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() {
|
||||
It("fills sampling + runtime fallbacks when unset", func() {
|
||||
cfg := &ModelConfig{} // empty backend uses the llama sampler defaults
|
||||
ApplyGenericDefaults(cfg)
|
||||
Expect(cfg.TopP).ToNot(BeNil())
|
||||
Expect(*cfg.TopP).To(Equal(0.95))
|
||||
Expect(*cfg.TopK).To(Equal(40))
|
||||
Expect(*cfg.Temperature).To(Equal(0.9))
|
||||
Expect(*cfg.MMap).To(BeTrue())
|
||||
Expect(*cfg.MMlock).To(BeFalse())
|
||||
Expect(*cfg.PromptCacheAll).To(BeTrue())
|
||||
})
|
||||
|
||||
It("never overrides explicit values", func() {
|
||||
tk := 7
|
||||
tp := 0.5
|
||||
cfg := &ModelConfig{}
|
||||
cfg.TopK = &tk
|
||||
cfg.TopP = &tp
|
||||
ApplyGenericDefaults(cfg)
|
||||
Expect(*cfg.TopK).To(Equal(7))
|
||||
Expect(*cfg.TopP).To(Equal(0.5))
|
||||
})
|
||||
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
@@ -14,11 +14,6 @@ import (
|
||||
"github.com/gpustack/gguf-parser-go/util/ptr"
|
||||
)
|
||||
|
||||
const (
|
||||
defaultContextSize = 1024
|
||||
defaultNGPULayers = 99999999
|
||||
)
|
||||
|
||||
// reservedNonChatModel reports whether the operator reserved this model for an
|
||||
// internal primitive — the router score classifier or the PII NER
|
||||
// token_classify tier. Such a model has no chat template and must not be
|
||||
@@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
|
||||
cSize := int(ctxSize)
|
||||
cfg.ContextSize = &cSize
|
||||
} else {
|
||||
defaultCtx = defaultContextSize
|
||||
defaultCtx = GGUFFallbackContextSize
|
||||
cfg.ContextSize = &defaultCtx
|
||||
}
|
||||
}
|
||||
@@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
|
||||
|
||||
if cfg.NGPULayers == nil {
|
||||
// we assume we want to offload all layers
|
||||
defaultHigh := defaultNGPULayers
|
||||
defaultHigh := DefaultNGPULayers
|
||||
cfg.NGPULayers = &defaultHigh
|
||||
}
|
||||
|
||||
|
||||
180
core/config/hardware_defaults.go
Normal file
180
core/config/hardware_defaults.go
Normal file
@@ -0,0 +1,180 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// Hardware-driven model-config defaults.
|
||||
//
|
||||
// This sits alongside the other config overriders (ApplyInferenceDefaults for
|
||||
// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
|
||||
// heuristically fill ModelConfig values the user left unset. Hardware tuning is
|
||||
// the same domain — "adjust the config from the device that will run it" — so
|
||||
// it lives here rather than scattered into the backend or a separate package.
|
||||
//
|
||||
// The heuristics are parameterized on a GPU descriptor (not on direct
|
||||
// detection) so they apply in both deployment shapes: SetDefaults passes the
|
||||
// LocalGPU on a single host, and the distributed router passes the *selected
|
||||
// node's* reported GPU before loading there (the frontend that loaded the
|
||||
// config may have no GPU at all).
|
||||
|
||||
// GPU is the descriptor of the device a model will run on. The hardware
// heuristics in this file take it as input instead of probing the machine
// themselves.
type GPU struct {
	// Vendor names the GPU vendor, e.g. "nvidia" or "amd" (matches the
	// xsysinfo vendor constants).
	Vendor string
	// ComputeCapability holds the NVIDIA compute capability in "major.minor"
	// form (e.g. "12.1" for GB10 / DGX Spark). It is empty for non-NVIDIA or
	// unknown devices.
	ComputeCapability string
	// VRAM is the total device memory in bytes; 0 means unknown.
	VRAM uint64
}
|
||||
|
||||
// Defaults for the physical batch size (n_batch / n_ubatch).
const (
	// DefaultPhysicalBatch is the conservative value used when no
	// hardware-specific tuning applies. core/backend.DefaultBatchSize
	// references this constant so there is a single source of truth.
	DefaultPhysicalBatch = 512

	// BlackwellPhysicalBatch is the value used on NVIDIA Blackwell consumer
	// GPUs (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger
	// physical batch materially lifts MoE prefill there (per-expert GEMM tiles
	// fill better); measured on a GB10 with Qwen3-30B-A3B to saturate around
	// 2048.
	BlackwellPhysicalBatch = 2048
)
|
||||
|
||||
// IsNVIDIABlackwell reports whether the GPU is in the NVIDIA Blackwell consumer
|
||||
// family (sm_12x). Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0)
|
||||
// reports a different compute capability and is intentionally not matched.
|
||||
func (g GPU) IsNVIDIABlackwell() bool {
|
||||
maj, _ := parseComputeCapability(g.ComputeCapability)
|
||||
return maj >= 12
|
||||
}
|
||||
|
||||
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
||||
// given hardware, used when the model config leaves batch unset.
|
||||
func PhysicalBatch(g GPU) int {
|
||||
if g.IsNVIDIABlackwell() {
|
||||
return BlackwellPhysicalBatch
|
||||
}
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
|
||||
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
||||
// Callers that re-tune a value chosen by an upstream host (the distributed
|
||||
// router correcting the frontend's guess) use this to avoid clobbering an
|
||||
// explicit user batch such as 1024.
|
||||
func IsManagedPhysicalBatch(n int) bool {
|
||||
return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
|
||||
}
|
||||
|
||||
// VRAM thresholds for the parallel-slot (n_parallel) default. llama.cpp
// serializes requests at n_parallel=1 (the backend default) and only
// auto-enables continuous batching when n_parallel > 1, so a single-slot
// default makes concurrent requests queue. The slot count is therefore scaled
// by GPU size so multi-user serving works out of the box. With the backend's
// unified KV cache the slots SHARE the context budget, so more slots add
// concurrency without multiplying KV memory.
const (
	parallelSlotsVRAMHigh uint64 = 32 << 30 // >=32 GiB -> 8 slots
	parallelSlotsVRAMMid  uint64 = 8 << 30  // >=8 GiB -> 4 slots
	parallelSlotsVRAMLow  uint64 = 4 << 30  // >=4 GiB -> 2 slots
)
|
||||
|
||||
// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns
|
||||
// 1 (no concurrency) when VRAM is unknown or too small, so we never change
|
||||
// behavior on CPU-only / tiny devices.
|
||||
func DefaultParallelSlots(g GPU) int {
|
||||
switch {
|
||||
case g.VRAM >= parallelSlotsVRAMHigh:
|
||||
return 8
|
||||
case g.VRAM >= parallelSlotsVRAMMid:
|
||||
return 4
|
||||
case g.VRAM >= parallelSlotsVRAMLow:
|
||||
return 2
|
||||
default:
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
|
||||
// model doesn't already set one (and the GPU warrants concurrency). Returns the
|
||||
// possibly-extended options. Shared by the single-host config path
|
||||
// (ApplyHardwareDefaults) and the distributed router (per selected node).
|
||||
func EnsureParallelOption(opts []string, gpu GPU) []string {
|
||||
if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
|
||||
return append(opts, fmt.Sprintf("parallel:%d", slots))
|
||||
}
|
||||
return opts
|
||||
}
|
||||
|
||||
// hasParallelOption reports whether the model already sets parallel/n_parallel
|
||||
// so we never override an explicit value (helper shared with serving_defaults.go).
|
||||
func hasParallelOption(opts []string) bool {
|
||||
return backendOptionSet(opts, "parallel", "n_parallel")
|
||||
}
|
||||
|
||||
// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
|
||||
// a single host (the distributed router builds it from the selected node's
|
||||
// reported info instead). It is a package var so tests can inject a
|
||||
// deterministic device — detection does a live nvidia-smi call.
|
||||
var localGPU = func() GPU {
|
||||
vendor, _ := xsysinfo.DetectGPUVendor()
|
||||
vram, _ := xsysinfo.TotalAvailableVRAM()
|
||||
return GPU{
|
||||
Vendor: vendor,
|
||||
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
||||
VRAM: vram,
|
||||
}
|
||||
}
|
||||
|
||||
// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
|
||||
// and were left unset by the user. Currently: a larger physical batch on
|
||||
// Blackwell. Explicit config always wins (we only touch zero values).
|
||||
func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
|
||||
cfg.Batch = BlackwellPhysicalBatch
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
|
||||
}
|
||||
|
||||
// Enable concurrent serving by default on a capable GPU: without this the
|
||||
// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
|
||||
// (continuous batching stays off). Unified KV means the slots share the
|
||||
// context budget, so this is concurrency without extra KV memory. Explicit
|
||||
// parallel/n_parallel in the model options always wins.
|
||||
if before := len(cfg.Options); true {
|
||||
cfg.Options = EnsureParallelOption(cfg.Options, gpu)
|
||||
if len(cfg.Options) > before {
|
||||
xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
|
||||
"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseComputeCapability splits a "major.minor" compute-capability string into
// its integer parts. Whitespace around the whole string and around each part
// is ignored, and a missing minor ("12") is read as 0.
//
// It returns (-1, -1) when the string is empty or the major part is not a
// non-negative integer. A minor part that is present but unparseable or
// negative degrades to 0 instead of invalidating an otherwise valid major.
func parseComputeCapability(cc string) (int, int) {
	cc = strings.TrimSpace(cc)
	if cc == "" {
		return -1, -1
	}

	// Cut splits on the first dot only, so "12.1.3" yields the minor "1.3",
	// which fails to parse and falls back to 0 below.
	majorStr, minorStr, _ := strings.Cut(cc, ".")

	major, err := strconv.Atoi(strings.TrimSpace(majorStr))
	if err != nil || major < 0 {
		return -1, -1
	}

	// Named "minor" rather than "min" so the predeclared min builtin is not
	// shadowed.
	minor, err := strconv.Atoi(strings.TrimSpace(minorStr))
	if err != nil || minor < 0 {
		minor = 0
	}
	return major, minor
}
|
||||
37
core/config/hardware_defaults_internal_test.go
Normal file
37
core/config/hardware_defaults_internal_test.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// Single-instance path: SetDefaults applies hardware defaults from the local
|
||||
// GPU. The detection seam (localGPU) is injected so the path is deterministic
|
||||
// without a real GPU.
|
||||
var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
|
||||
var orig func() GPU
|
||||
BeforeEach(func() { orig = localGPU })
|
||||
AfterEach(func() { localGPU = orig })
|
||||
|
||||
It("sets the physical batch on a local Blackwell GPU", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
|
||||
It("leaves batch unset on a non-Blackwell local GPU", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
|
||||
It("never overrides an explicit batch", func() {
|
||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
cfg.SetDefaults()
|
||||
Expect(cfg.Batch).To(Equal(1024))
|
||||
})
|
||||
})
|
||||
97
core/config/hardware_defaults_test.go
Normal file
97
core/config/hardware_defaults_test.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("Hardware-driven config defaults", func() {
|
||||
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
||||
func(cc string, want bool) {
|
||||
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
||||
},
|
||||
Entry("GB10 12.1", "12.1", true),
|
||||
Entry("RTX 50 12.0", "12.0", true),
|
||||
Entry("future 13.0", "13.0", true),
|
||||
Entry("Hopper 9.0", "9.0", false),
|
||||
Entry("Ada 8.9", "8.9", false),
|
||||
Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
|
||||
Entry("unknown", "", false),
|
||||
)
|
||||
|
||||
Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
|
||||
It("returns the Blackwell batch on Blackwell", func() {
|
||||
Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("returns the default batch otherwise", func() {
|
||||
Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
|
||||
Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
It("recognizes managed defaults but not explicit values", func() {
|
||||
Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
|
||||
Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
|
||||
Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ApplyHardwareDefaults", func() {
|
||||
It("raises an unset batch to 2048 on Blackwell", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("leaves batch unset on non-Blackwell", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
It("never overrides an explicit batch", func() {
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
Expect(cfg.Batch).To(Equal(1024))
|
||||
})
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
DescribeTable("DefaultParallelSlots (by VRAM)",
|
||||
func(vramGiB uint64, want int) {
|
||||
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
||||
},
|
||||
Entry("GB10 119 GiB", uint64(119), 8),
|
||||
Entry("48 GiB", uint64(48), 8),
|
||||
Entry("24 GiB", uint64(24), 4),
|
||||
Entry("8 GiB", uint64(8), 4),
|
||||
Entry("6 GiB", uint64(6), 2),
|
||||
Entry("2 GiB", uint64(2), 1),
|
||||
Entry("unknown 0", uint64(0), 1),
|
||||
)
|
||||
|
||||
Describe("ApplyHardwareDefaults parallel slots", func() {
|
||||
It("adds a VRAM-scaled parallel option on a capable GPU", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Options).To(ContainElement("parallel:8"))
|
||||
})
|
||||
It("scales the slot count down with VRAM", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
|
||||
Expect(cfg.Options).To(ContainElement("parallel:4"))
|
||||
})
|
||||
It("adds no parallel option on small/unknown VRAM", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
|
||||
Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
|
||||
})
|
||||
It("never overrides an explicit parallel option", func() {
|
||||
cfg := &ModelConfig{Options: []string{"parallel:2"}}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
|
||||
Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) {
|
||||
// Default context size if not set, regardless of whether GGUF parsing succeeds
|
||||
defer func() {
|
||||
if cfg.ContextSize == nil {
|
||||
ctx := defaultContextSize
|
||||
ctx := GGUFFallbackContextSize
|
||||
cfg.ContextSize = &ctx
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -68,6 +68,7 @@ var UsecaseOptions = []FieldOption{
|
||||
{Value: "face_recognition", Label: "Face Recognition"},
|
||||
{Value: "transcript", Label: "Transcript"},
|
||||
{Value: "diarization", Label: "Diarization"},
|
||||
{Value: "sound_classification", Label: "Sound Classification"},
|
||||
{Value: "speaker_recognition", Label: "Speaker Recognition"},
|
||||
{Value: "tts", Label: "TTS"},
|
||||
{Value: "sound_generation", Label: "Sound Generation"},
|
||||
|
||||
@@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
Order: 45,
|
||||
},
|
||||
|
||||
// --- Alias ---
|
||||
"alias": {
|
||||
Section: "alias",
|
||||
Label: "Alias target",
|
||||
Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.",
|
||||
Component: "model-select",
|
||||
Order: 0,
|
||||
},
|
||||
|
||||
// --- Pipeline ---
|
||||
"pipeline.llm": {
|
||||
Section: "pipeline",
|
||||
@@ -319,6 +328,30 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
AutocompleteProvider: ProviderModelsVAD,
|
||||
Order: 63,
|
||||
},
|
||||
"pipeline.sound_detection": {
|
||||
Section: "pipeline",
|
||||
Label: "Sound Detection Model",
|
||||
Description: "Model to use for sound-event classification (audio tagging, e.g. ced) in the pipeline. When set, committed realtime audio is also classified and the scored AudioSet tags are emitted as a conversation.item.sound_detection event.",
|
||||
Component: "model-select",
|
||||
AutocompleteProvider: ProviderModels,
|
||||
Order: 64,
|
||||
},
|
||||
"pipeline.sound_detection_window_ms": {
|
||||
Section: "pipeline",
|
||||
Label: "Sound Detection Window (ms)",
|
||||
Description: "Server-side windowing for a sound-only realtime session: length in ms of the audio window classified each hop. 0 = client-driven (the client commits windows).",
|
||||
Component: "number",
|
||||
Min: f64(0),
|
||||
Order: 65,
|
||||
},
|
||||
"pipeline.sound_detection_hop_ms": {
|
||||
Section: "pipeline",
|
||||
Label: "Sound Detection Hop (ms)",
|
||||
Description: "Server-side windowing hop in ms: how often the server classifies the last window. 0 = client-driven.",
|
||||
Component: "number",
|
||||
Min: f64(0),
|
||||
Order: 66,
|
||||
},
|
||||
"pipeline.reasoning_effort": {
|
||||
Section: "pipeline",
|
||||
Label: "Reasoning Effort",
|
||||
@@ -448,6 +481,55 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
Component: "json-editor",
|
||||
Order: 78,
|
||||
},
|
||||
"pipeline.voice_recognition.enforce": {
|
||||
Section: "pipeline",
|
||||
Label: "Voice Gate Enforce",
|
||||
Description: "Whether the gate rejects unauthorized speakers. Enabled (default) drops unauthorized utterances before the LLM. Disabled still resolves and surfaces the speaker (for the conversation.item.speaker event and personalization) but never drops a turn.",
|
||||
Component: "toggle",
|
||||
Order: 80,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.announce": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Announce",
|
||||
Description: "Emit a conversation.item.speaker event to the client naming the recognized speaker. When set, identity is resolved on every turn even if 'when' is 'first'.",
|
||||
Component: "toggle",
|
||||
Order: 81,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.announce_unknown": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Announce Unknown",
|
||||
Description: "Also emit the conversation.item.speaker event (with matched=false) when no confident match is found. Default only announces on a match.",
|
||||
Component: "toggle",
|
||||
Order: 82,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.personalize": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Personalize",
|
||||
Description: "Inform the LLM who is speaking so it can tailor replies. Enables the name and system-note injection below.",
|
||||
Component: "toggle",
|
||||
Order: 83,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.inject_name": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Inject Name",
|
||||
Description: "Personalization: set the per-message OpenAI 'name' field on each user turn to the recognized speaker.",
|
||||
Component: "toggle",
|
||||
Order: 84,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.inject_system_note": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Inject System Note",
|
||||
Description: "Personalization: append a 'The current speaker is <name>.' note to the system message reflecting the latest speaker.",
|
||||
Component: "toggle",
|
||||
Order: 85,
|
||||
},
|
||||
"pipeline.voice_recognition.identity.note_unknown": {
|
||||
Section: "pipeline",
|
||||
Label: "Speaker Identity Note Unknown",
|
||||
Description: "Personalization: when the speaker is unidentified, append 'The current speaker is unknown.' to the system message so the model can ask who it is talking to.",
|
||||
Component: "toggle",
|
||||
Order: 86,
|
||||
},
|
||||
"pipeline.max_history_items": {
|
||||
Section: "pipeline",
|
||||
Label: "Max History Items",
|
||||
@@ -455,6 +537,36 @@ func DefaultRegistry() map[string]FieldMetaOverride {
|
||||
Component: "number",
|
||||
Order: 79,
|
||||
},
|
||||
"pipeline.compaction.enabled": {
|
||||
Section: "pipeline",
|
||||
Label: "Compaction Enabled",
|
||||
Description: "Fold conversation items that age out of the live window (Max History Items) into a rolling summary instead of dropping them, so long realtime sessions stay cheap without losing earlier context. Off by default.",
|
||||
Component: "toggle",
|
||||
Order: 80,
|
||||
},
|
||||
"pipeline.compaction.trigger_items": {
|
||||
Section: "pipeline",
|
||||
Label: "Compaction Trigger Items",
|
||||
Description: "High-water mark: once the live conversation exceeds this many items, the overflow above Max History Items is summarized and evicted. Must be greater than Max History Items; defaults to twice it. The gap controls how often summarization runs.",
|
||||
Component: "number",
|
||||
Order: 81,
|
||||
},
|
||||
"pipeline.compaction.summary_model": {
|
||||
Section: "pipeline",
|
||||
Label: "Compaction Summary Model",
|
||||
Description: "Optional smaller/cheaper model used to produce the rolling summary. Empty reuses the pipeline's own LLM. On CPU, a tiny model here keeps compaction from competing with the conversation LLM.",
|
||||
Component: "input",
|
||||
Advanced: true,
|
||||
Order: 82,
|
||||
},
|
||||
"pipeline.compaction.max_summary_tokens": {
|
||||
Section: "pipeline",
|
||||
Label: "Compaction Max Summary Tokens",
|
||||
Description: "Advisory cap on the rolling summary length (fed to the summarizer prompt). Defaults to 512.",
|
||||
Component: "number",
|
||||
Advanced: true,
|
||||
Order: 83,
|
||||
},
|
||||
|
||||
// --- Functions ---
|
||||
"function.grammar.parallel_calls": {
|
||||
|
||||
28
core/config/meta/registry_test.go
Normal file
28
core/config/meta/registry_test.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package meta_test
|
||||
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/config/meta"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("alias field metadata", func() {
|
||||
It("registers the alias field as a model-select in the alias section", func() {
|
||||
reg := meta.DefaultRegistry()
|
||||
f, ok := reg["alias"]
|
||||
Expect(ok).To(BeTrue(), "alias field should have a registry override")
|
||||
Expect(f.Section).To(Equal("alias"))
|
||||
Expect(f.Component).To(Equal("model-select"))
|
||||
})
|
||||
|
||||
It("defines an alias section", func() {
|
||||
var found bool
|
||||
for _, s := range meta.DefaultSections() {
|
||||
if s.ID == "alias" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
Expect(found).To(BeTrue(), "DefaultSections should include an alias section")
|
||||
})
|
||||
})
|
||||
@@ -69,6 +69,7 @@ type FieldMetaOverride struct {
|
||||
func DefaultSections() []Section {
|
||||
return []Section{
|
||||
{ID: "general", Label: "General", Icon: "settings", Order: 0},
|
||||
{ID: "alias", Label: "Alias", Icon: "git-merge", Order: 5},
|
||||
{ID: "llm", Label: "LLM", Icon: "cpu", Order: 10},
|
||||
{ID: "parameters", Label: "Parameters", Icon: "sliders", Order: 20},
|
||||
{ID: "templates", Label: "Templates", Icon: "file-text", Order: 30},
|
||||
|
||||
@@ -37,6 +37,12 @@ type ModelConfig struct {
|
||||
schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"`
|
||||
Name string `yaml:"name,omitempty" json:"name,omitempty"`
|
||||
|
||||
// Alias, when set, makes this config a pure redirect: every request for
|
||||
// Name is served by the model named here. All other fields are ignored.
|
||||
// The target must be an existing, non-alias model (enforced at load and
|
||||
// at create/swap time). See docs/content for Model Aliases.
|
||||
Alias string `yaml:"alias,omitempty" json:"alias,omitempty"`
|
||||
|
||||
F16 *bool `yaml:"f16,omitempty" json:"f16,omitempty"`
|
||||
Threads *int `yaml:"threads,omitempty" json:"threads,omitempty"`
|
||||
Debug *bool `yaml:"debug,omitempty" json:"debug,omitempty"`
|
||||
@@ -391,6 +397,10 @@ func (c *ModelConfig) HasRouter() bool {
|
||||
return len(c.Router.Candidates) > 0
|
||||
}
|
||||
|
||||
// IsAlias reports whether this config is a pure redirect to another model.
|
||||
// Value receiver so it is callable on non-addressable config values too.
|
||||
func (c ModelConfig) IsAlias() bool { return c.Alias != "" }
|
||||
|
||||
// @Description PII filtering configuration. PII redaction is per-model so
|
||||
// that local models don't pay the latency or behaviour change of regex
|
||||
// scanning, while cloud-bound traffic (cloud-proxy backend) can default to
|
||||
@@ -594,6 +604,20 @@ type Pipeline struct {
|
||||
LLM string `yaml:"llm,omitempty" json:"llm,omitempty"`
|
||||
Transcription string `yaml:"transcription,omitempty" json:"transcription,omitempty"`
|
||||
VAD string `yaml:"vad,omitempty" json:"vad,omitempty"`
|
||||
// SoundDetection names a sound-event-classification model (e.g. ced). When
|
||||
// set, each VAD-committed realtime utterance is also run through it and the
|
||||
// scored AudioSet tags are emitted as a conversation.item.sound_detection
|
||||
// server event, alongside (and independent of) transcription.
|
||||
SoundDetection string `yaml:"sound_detection,omitempty" json:"sound_detection,omitempty"`
|
||||
|
||||
// SoundDetectionWindowMs / SoundDetectionHopMs enable server-side windowing
|
||||
// for a sound-detection-only realtime session: instead of the client
|
||||
// committing audio buffers, the server classifies the last WindowMs of
|
||||
// streamed audio every HopMs and emits a sound_detection event per hop. Both
|
||||
// must be > 0 to activate; otherwise the session stays client-driven (the
|
||||
// client commits windows via input_audio_buffer.commit).
|
||||
SoundDetectionWindowMs int `yaml:"sound_detection_window_ms,omitempty" json:"sound_detection_window_ms,omitempty"`
|
||||
SoundDetectionHopMs int `yaml:"sound_detection_hop_ms,omitempty" json:"sound_detection_hop_ms,omitempty"`
|
||||
|
||||
// ReasoningEffort sets the reasoning effort (none|minimal|low|medium|high) for
|
||||
// the pipeline's LLM without editing the LLM model config. Overrides the LLM's
|
||||
@@ -617,11 +641,32 @@ type Pipeline struct {
|
||||
// context fills.
|
||||
MaxHistoryItems *int `yaml:"max_history_items,omitempty" json:"max_history_items,omitempty"`
|
||||
|
||||
// Compaction folds conversation items that age out of the live window
|
||||
// (max_history_items) into a rolling summary instead of dropping them, so
|
||||
// long realtime sessions stay cheap without losing earlier context. Nil
|
||||
// (block absent) means disabled, preserving existing behavior.
|
||||
Compaction *PipelineCompaction `yaml:"compaction,omitempty" json:"compaction,omitempty"`
|
||||
|
||||
// VoiceRecognition gates the pipeline behind speaker verification. Nil
|
||||
// (block absent) means no gate, preserving existing behavior.
|
||||
VoiceRecognition *PipelineVoiceRecognition `yaml:"voice_recognition,omitempty" json:"voice_recognition,omitempty"`
|
||||
}
|
||||
|
||||
// PipelineCompaction holds the summarize-then-drop settings of a realtime
// pipeline.
type PipelineCompaction struct {
	// Enabled switches summarize-then-drop on. Default false.
	Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
	// TriggerItems is the high-water mark: once the live items exceed it, the
	// overflow above max_history_items is summarized and evicted. It must
	// exceed max_history_items and is clamped up if it does not. Default: 2x
	// max_history_items.
	TriggerItems int `yaml:"trigger_items,omitempty" json:"trigger_items,omitempty"`
	// SummaryModel optionally names a smaller/cheaper model for the summary
	// call. Empty uses the pipeline's own LLM.
	SummaryModel string `yaml:"summary_model,omitempty" json:"summary_model,omitempty"`
	// MaxSummaryTokens is an advisory summary length (fed to the prompt).
	// Default 512.
	MaxSummaryTokens int `yaml:"max_summary_tokens,omitempty" json:"max_summary_tokens,omitempty"`
}
|
||||
|
||||
// ApplyReasoningEffort resolves the effective reasoning effort — a per-request
|
||||
// value (requestEffort) overrides the config's own ReasoningEffort default —
|
||||
// stores it on the config so gRPCPredictOpts forwards it to the backend as the
|
||||
@@ -759,6 +804,13 @@ type PipelineVoiceRecognition struct {
|
||||
Allow VoiceRecognitionAllow `yaml:"allow,omitempty" json:"allow,omitempty"`
|
||||
// References are the authorized reference speakers (verify mode).
|
||||
References []VoiceReference `yaml:"references,omitempty" json:"references,omitempty"`
|
||||
// Enforce controls the authorization gate. A nil value or true rejects
|
||||
// unauthorized speakers (the historical behavior). false resolves the
|
||||
// speaker's identity for surfacing/personalization but never drops a turn.
|
||||
Enforce *bool `yaml:"enforce,omitempty" json:"enforce,omitempty"`
|
||||
// Identity surfaces the recognized speaker to the client and the LLM. It is
|
||||
// independent of Enforce: identity can be surfaced without gating.
|
||||
Identity *VoiceIdentityConfig `yaml:"identity,omitempty" json:"identity,omitempty"`
|
||||
}
|
||||
|
||||
// @Description VoiceRecognitionAllow filters authorized registry identities.
|
||||
@@ -775,6 +827,25 @@ type VoiceReference struct {
|
||||
Audio string `yaml:"audio,omitempty" json:"audio,omitempty"`
|
||||
}
|
||||
|
||||
// @Description VoiceIdentityConfig surfaces the recognized speaker to the realtime
// client and the LLM. When set, identity is resolved on every turn even if the
// gate's When is "first" (the gate still authorizes only once).
type VoiceIdentityConfig struct {
	// Announce makes the server emit a conversation.item.speaker event to the
	// client.
	Announce bool `yaml:"announce,omitempty" json:"announce,omitempty"`
	// AnnounceUnknown extends Announce to turns with no confident match.
	AnnounceUnknown bool `yaml:"announce_unknown,omitempty" json:"announce_unknown,omitempty"`
	// Personalize tells the LLM who is speaking.
	Personalize bool `yaml:"personalize,omitempty" json:"personalize,omitempty"`
	// InjectName sets the per-message name field on each user turn.
	InjectName bool `yaml:"inject_name,omitempty" json:"inject_name,omitempty"`
	// InjectSystemNote keeps a "current speaker" note in the system message.
	InjectSystemNote bool `yaml:"inject_system_note,omitempty" json:"inject_system_note,omitempty"`
	// NoteUnknown adds a "the current speaker is unknown" note, which lets the
	// model ask who it is talking to.
	NoteUnknown bool `yaml:"note_unknown,omitempty" json:"note_unknown,omitempty"`
}
|
||||
|
||||
// VoiceGateEnabled reports whether a voice-recognition gate is configured. The
|
||||
// mere presence of the block is the intent signal: a present-but-incomplete
|
||||
// block (e.g. missing model) must fail closed at construction, not be silently
|
||||
@@ -783,6 +854,28 @@ func (p Pipeline) VoiceGateEnabled() bool {
|
||||
return p.VoiceRecognition != nil
|
||||
}
|
||||
|
||||
// EnforceGate reports whether the gate rejects unauthorized speakers. A nil
|
||||
// Enforce means "enforce" so existing configs keep gating.
|
||||
func (p PipelineVoiceRecognition) EnforceGate() bool {
|
||||
return p.Enforce == nil || *p.Enforce
|
||||
}
|
||||
|
||||
// IdentityEnabled reports whether the speaker's identity must be resolved for
|
||||
// surfacing or personalization.
|
||||
func (p PipelineVoiceRecognition) IdentityEnabled() bool {
|
||||
return p.Identity != nil && (p.Identity.Announce || p.Identity.Personalize)
|
||||
}
|
||||
|
||||
// AnnounceEnabled reports whether to emit the conversation.item.speaker event.
|
||||
func (p PipelineVoiceRecognition) AnnounceEnabled() bool {
|
||||
return p.Identity != nil && p.Identity.Announce
|
||||
}
|
||||
|
||||
// PersonalizeEnabled reports whether to inform the LLM of the speaker.
|
||||
func (p PipelineVoiceRecognition) PersonalizeEnabled() bool {
|
||||
return p.Identity != nil && p.Identity.Personalize
|
||||
}
|
||||
|
||||
// Normalize fills in defaults in place for omitted fields.
|
||||
func (v *PipelineVoiceRecognition) Normalize() {
|
||||
if v.Mode == "" {
|
||||
@@ -1111,107 +1204,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
|
||||
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
|
||||
|
||||
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
||||
defaultTopP := 0.95
|
||||
defaultTopK := 40
|
||||
defaultMinP := 0.0
|
||||
defaultTemp := 0.9
|
||||
// https://github.com/mudler/LocalAI/issues/2780
|
||||
defaultMirostat := 0
|
||||
defaultMirostatTAU := 5.0
|
||||
defaultMirostatETA := 0.1
|
||||
defaultTypicalP := 1.0
|
||||
defaultTFZ := 1.0
|
||||
defaultZero := 0
|
||||
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
|
||||
// Uses the local GPU here; in distributed mode the router re-applies the same
|
||||
// heuristics for the selected node's GPU before loading. Explicit config wins.
|
||||
ApplyHardwareDefaults(cfg, localGPU())
|
||||
|
||||
// Apply serving-policy defaults (device-independent): cross-request prefix
|
||||
// caching. Propagates to distributed nodes via the model options.
|
||||
ApplyServingDefaults(cfg)
|
||||
|
||||
// Generic fallback defaults (sampling params + runtime flags), applied after
|
||||
// the model-family / hardware / serving tiers above. Only fills unset values.
|
||||
ApplyGenericDefaults(cfg)
|
||||
|
||||
trueV := true
|
||||
falseV := false
|
||||
|
||||
if cfg.Seed == nil {
|
||||
// random number generator seed
|
||||
defaultSeed := RAND_SEED
|
||||
cfg.Seed = &defaultSeed
|
||||
}
|
||||
|
||||
// top_k=40 is llama.cpp's sampling default and is wrong for backends whose
|
||||
// native default differs (issue #6632). Only inject it for the llama.cpp
|
||||
// family and the empty/auto backend; leave TopK nil for known non-llama
|
||||
// backends (e.g. mlx, whose intended default is top_k=0) so the wire value
|
||||
// is 0 rather than a silently-changed 40.
|
||||
if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) {
|
||||
cfg.TopK = &defaultTopK
|
||||
}
|
||||
|
||||
if cfg.MinP == nil {
|
||||
cfg.MinP = &defaultMinP
|
||||
}
|
||||
|
||||
if cfg.TypicalP == nil {
|
||||
cfg.TypicalP = &defaultTypicalP
|
||||
}
|
||||
|
||||
if cfg.TFZ == nil {
|
||||
cfg.TFZ = &defaultTFZ
|
||||
}
|
||||
|
||||
if cfg.MMap == nil {
|
||||
// MMap is enabled by default
|
||||
|
||||
// Only exception is for Intel GPUs
|
||||
if os.Getenv("XPU") != "" {
|
||||
cfg.MMap = &falseV
|
||||
} else {
|
||||
cfg.MMap = &trueV
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.MMlock == nil {
|
||||
// MMlock is disabled by default
|
||||
cfg.MMlock = &falseV
|
||||
}
|
||||
|
||||
if cfg.TopP == nil {
|
||||
cfg.TopP = &defaultTopP
|
||||
}
|
||||
if cfg.Temperature == nil {
|
||||
cfg.Temperature = &defaultTemp
|
||||
}
|
||||
|
||||
if cfg.Maxtokens == nil {
|
||||
cfg.Maxtokens = &defaultZero
|
||||
}
|
||||
|
||||
if cfg.Mirostat == nil {
|
||||
cfg.Mirostat = &defaultMirostat
|
||||
}
|
||||
|
||||
if cfg.MirostatETA == nil {
|
||||
cfg.MirostatETA = &defaultMirostatETA
|
||||
}
|
||||
|
||||
if cfg.MirostatTAU == nil {
|
||||
cfg.MirostatTAU = &defaultMirostatTAU
|
||||
}
|
||||
|
||||
if cfg.LowVRAM == nil {
|
||||
cfg.LowVRAM = &falseV
|
||||
}
|
||||
|
||||
if cfg.Embeddings == nil {
|
||||
cfg.Embeddings = &falseV
|
||||
}
|
||||
|
||||
if cfg.Reranking == nil {
|
||||
cfg.Reranking = &falseV
|
||||
}
|
||||
|
||||
if cfg.PromptCacheAll == nil {
|
||||
// Match upstream llama.cpp's default (common/common.h: cache_prompt = true)
|
||||
// and let cache_idle_slots / kv_unified actually do useful work; users can
|
||||
// opt out with an explicit `prompt_cache_all: false` in the model YAML.
|
||||
cfg.PromptCacheAll = &trueV
|
||||
}
|
||||
|
||||
if threads == 0 {
|
||||
// Threads can't be 0
|
||||
threads = 4
|
||||
@@ -1243,6 +1251,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
}
|
||||
|
||||
func (c *ModelConfig) Validate() (bool, error) {
|
||||
// An alias is a pure redirect: validate only its own shape here. Target
|
||||
// existence and the no-chain rule need the full config set, so the loader
|
||||
// (load-time) and the create/swap endpoints enforce those.
|
||||
if c.IsAlias() {
|
||||
if c.Name == "" {
|
||||
return false, fmt.Errorf("alias config requires a name")
|
||||
}
|
||||
if c.Alias == c.Name {
|
||||
return false, fmt.Errorf("alias %q cannot point to itself", c.Name)
|
||||
}
|
||||
if c.Backend != "" || c.Model != "" {
|
||||
return false, fmt.Errorf("alias config %q must not set backend or parameters.model: an alias is a pure redirect", c.Name)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
downloadedFileNames := []string{}
|
||||
for _, f := range c.DownloadFiles {
|
||||
downloadedFileNames = append(downloadedFileNames, f.Filename)
|
||||
@@ -1463,6 +1487,11 @@ const (
|
||||
// so it may combine freely with other usecases.
|
||||
FLAG_TOKEN_CLASSIFY ModelConfigUsecase = 0b1000000000000000000000
|
||||
|
||||
// Marks a model as wired for the SoundDetection gRPC primitive
|
||||
// (audio tagging / sound-event classification — scored AudioSet
|
||||
// labels via the SoundDetection RPC, e.g. ced).
|
||||
FLAG_SOUND_CLASSIFICATION ModelConfigUsecase = 0b10000000000000000000000
|
||||
|
||||
// Common Subsets
|
||||
FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
|
||||
)
|
||||
@@ -1471,12 +1500,12 @@ const (
|
||||
// Flags within the same group are NOT orthogonal (e.g., chat and completion are
|
||||
// both text/language). A model is multimodal when its usecases span 2+ groups.
|
||||
var ModalityGroups = []ModelConfigUsecase{
|
||||
FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT, // text/language
|
||||
FLAG_VISION | FLAG_DETECTION, // visual understanding
|
||||
FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO, // speech input — realtime_audio is any-to-any, so it counts here too
|
||||
FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
|
||||
FLAG_AUDIO_TRANSFORM, // audio in/out transforms
|
||||
FLAG_IMAGE | FLAG_VIDEO, // visual generation
|
||||
FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT, // text/language
|
||||
FLAG_VISION | FLAG_DETECTION, // visual understanding
|
||||
FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO | FLAG_SOUND_CLASSIFICATION, // audio input — realtime_audio is any-to-any, so it counts here too
|
||||
FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
|
||||
FLAG_AUDIO_TRANSFORM, // audio in/out transforms
|
||||
FLAG_IMAGE | FLAG_VIDEO, // visual generation
|
||||
}
|
||||
|
||||
// IsMultimodal returns true if the given usecases span two or more orthogonal
|
||||
@@ -1499,29 +1528,30 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
|
||||
return map[string]ModelConfigUsecase{
|
||||
// Note: FLAG_ANY is intentionally excluded from this map
|
||||
// because it's 0 and would always match in HasUsecases checks
|
||||
"FLAG_CHAT": FLAG_CHAT,
|
||||
"FLAG_COMPLETION": FLAG_COMPLETION,
|
||||
"FLAG_EDIT": FLAG_EDIT,
|
||||
"FLAG_EMBEDDINGS": FLAG_EMBEDDINGS,
|
||||
"FLAG_RERANK": FLAG_RERANK,
|
||||
"FLAG_IMAGE": FLAG_IMAGE,
|
||||
"FLAG_TRANSCRIPT": FLAG_TRANSCRIPT,
|
||||
"FLAG_TTS": FLAG_TTS,
|
||||
"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
|
||||
"FLAG_TOKENIZE": FLAG_TOKENIZE,
|
||||
"FLAG_VAD": FLAG_VAD,
|
||||
"FLAG_LLM": FLAG_LLM,
|
||||
"FLAG_VIDEO": FLAG_VIDEO,
|
||||
"FLAG_DETECTION": FLAG_DETECTION,
|
||||
"FLAG_VISION": FLAG_VISION,
|
||||
"FLAG_FACE_RECOGNITION": FLAG_FACE_RECOGNITION,
|
||||
"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
|
||||
"FLAG_AUDIO_TRANSFORM": FLAG_AUDIO_TRANSFORM,
|
||||
"FLAG_DIARIZATION": FLAG_DIARIZATION,
|
||||
"FLAG_REALTIME_AUDIO": FLAG_REALTIME_AUDIO,
|
||||
"FLAG_SCORE": FLAG_SCORE,
|
||||
"FLAG_DEPTH": FLAG_DEPTH,
|
||||
"FLAG_TOKEN_CLASSIFY": FLAG_TOKEN_CLASSIFY,
|
||||
"FLAG_CHAT": FLAG_CHAT,
|
||||
"FLAG_COMPLETION": FLAG_COMPLETION,
|
||||
"FLAG_EDIT": FLAG_EDIT,
|
||||
"FLAG_EMBEDDINGS": FLAG_EMBEDDINGS,
|
||||
"FLAG_RERANK": FLAG_RERANK,
|
||||
"FLAG_IMAGE": FLAG_IMAGE,
|
||||
"FLAG_TRANSCRIPT": FLAG_TRANSCRIPT,
|
||||
"FLAG_TTS": FLAG_TTS,
|
||||
"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
|
||||
"FLAG_TOKENIZE": FLAG_TOKENIZE,
|
||||
"FLAG_VAD": FLAG_VAD,
|
||||
"FLAG_LLM": FLAG_LLM,
|
||||
"FLAG_VIDEO": FLAG_VIDEO,
|
||||
"FLAG_DETECTION": FLAG_DETECTION,
|
||||
"FLAG_VISION": FLAG_VISION,
|
||||
"FLAG_FACE_RECOGNITION": FLAG_FACE_RECOGNITION,
|
||||
"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
|
||||
"FLAG_AUDIO_TRANSFORM": FLAG_AUDIO_TRANSFORM,
|
||||
"FLAG_DIARIZATION": FLAG_DIARIZATION,
|
||||
"FLAG_SOUND_CLASSIFICATION": FLAG_SOUND_CLASSIFICATION,
|
||||
"FLAG_REALTIME_AUDIO": FLAG_REALTIME_AUDIO,
|
||||
"FLAG_SCORE": FLAG_SCORE,
|
||||
"FLAG_DEPTH": FLAG_DEPTH,
|
||||
"FLAG_TOKEN_CLASSIFY": FLAG_TOKEN_CLASSIFY,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1724,6 +1754,16 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
|
||||
}
|
||||
}
|
||||
|
||||
if (u & FLAG_SOUND_CLASSIFICATION) == FLAG_SOUND_CLASSIFICATION {
|
||||
// ced is a sound-event tagger (AudioSet labels) surfaced via the
|
||||
// SoundDetection gRPC. Models without an explicit known_usecases
|
||||
// still surface when they run on one of these backends.
|
||||
soundClassificationBackends := []string{"ced"}
|
||||
if !slices.Contains(soundClassificationBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if (u & FLAG_REALTIME_AUDIO) == FLAG_REALTIME_AUDIO {
|
||||
// Backends that own a single any-to-any loop and implement
|
||||
// AudioToAudioStream — listed here so models without an explicit
|
||||
|
||||
@@ -294,6 +294,44 @@ func (bcl *ModelConfigLoader) UpdateModelConfig(m string, updater func(*ModelCon
|
||||
}
|
||||
}
|
||||
|
||||
// ResolveAlias follows a one-hop alias to its target config. Returns
|
||||
// (resolved, wasAlias, err). Non-alias configs return (cfg, false, nil)
|
||||
// unchanged. Strict: the target must exist and must not itself be an alias
|
||||
// (chains are rejected). The returned config is a copy of the target.
|
||||
func (bcl *ModelConfigLoader) ResolveAlias(cfg *ModelConfig) (*ModelConfig, bool, error) {
|
||||
if cfg == nil || !cfg.IsAlias() {
|
||||
return cfg, false, nil
|
||||
}
|
||||
target, exists := bcl.GetModelConfig(cfg.Alias)
|
||||
if !exists {
|
||||
return nil, true, fmt.Errorf("alias %q points to unknown model %q", cfg.Name, cfg.Alias)
|
||||
}
|
||||
if target.IsAlias() {
|
||||
return nil, true, fmt.Errorf("alias %q points to another alias %q (chains are not allowed)", cfg.Name, cfg.Alias)
|
||||
}
|
||||
return &target, true, nil
|
||||
}
|
||||
|
||||
// ValidateAliasTarget checks an alias config's target at create/swap time:
|
||||
// the target must exist, must not be an alias, and must not be disabled.
|
||||
// Returns nil for non-alias configs.
|
||||
func (bcl *ModelConfigLoader) ValidateAliasTarget(cfg *ModelConfig) error {
|
||||
if cfg == nil || !cfg.IsAlias() {
|
||||
return nil
|
||||
}
|
||||
target, exists := bcl.GetModelConfig(cfg.Alias)
|
||||
if !exists {
|
||||
return fmt.Errorf("alias target %q does not exist", cfg.Alias)
|
||||
}
|
||||
if target.IsAlias() {
|
||||
return fmt.Errorf("alias target %q is itself an alias (chains are not allowed)", cfg.Alias)
|
||||
}
|
||||
if target.IsDisabled() {
|
||||
return fmt.Errorf("alias target %q is disabled", cfg.Alias)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Preload prepares models that are not local, i.e. those referenced by URL or Hugging Face repository
|
||||
func (bcl *ModelConfigLoader) Preload(modelPath string) error {
|
||||
bcl.Lock()
|
||||
@@ -475,5 +513,21 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
|
||||
}
|
||||
}
|
||||
|
||||
// Surface aliases whose targets are missing or themselves aliases. These
|
||||
// resolve to a clear request-time error; warning here gives operators
|
||||
// visibility without failing startup.
|
||||
for name, c := range bcl.configs {
|
||||
if !c.IsAlias() {
|
||||
continue
|
||||
}
|
||||
target, ok := bcl.configs[c.Alias]
|
||||
switch {
|
||||
case !ok:
|
||||
xlog.Warn("alias points to unknown model", "alias", name, "target", c.Alias)
|
||||
case target.IsAlias():
|
||||
xlog.Warn("alias points to another alias (chains are not allowed)", "alias", name, "target", c.Alias)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -61,3 +61,51 @@ var _ = Describe("ModelConfigLoader.GetModelsConflictingWith", func() {
|
||||
Expect(bcl.GetModelsConflictingWith("a")).To(ConsistOf("b"))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("ModelConfigLoader alias resolution", func() {
|
||||
var loader *ModelConfigLoader
|
||||
|
||||
BeforeEach(func() {
|
||||
loader = NewModelConfigLoader("")
|
||||
loader.configs["real"] = ModelConfig{Name: "real", Backend: "llama-cpp"}
|
||||
loader.configs["gpt-4"] = ModelConfig{Name: "gpt-4", Alias: "real"}
|
||||
loader.configs["chain"] = ModelConfig{Name: "chain", Alias: "gpt-4"}
|
||||
loader.configs["dangling"] = ModelConfig{Name: "dangling", Alias: "nope"}
|
||||
})
|
||||
|
||||
It("returns non-alias configs unchanged", func() {
|
||||
cfg := loader.configs["real"]
|
||||
got, was, err := loader.ResolveAlias(&cfg)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(was).To(BeFalse())
|
||||
Expect(got.Name).To(Equal("real"))
|
||||
})
|
||||
|
||||
It("resolves an alias to its target", func() {
|
||||
cfg := loader.configs["gpt-4"]
|
||||
got, was, err := loader.ResolveAlias(&cfg)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(was).To(BeTrue())
|
||||
Expect(got.Name).To(Equal("real"))
|
||||
})
|
||||
|
||||
It("rejects an alias chain", func() {
|
||||
cfg := loader.configs["chain"]
|
||||
_, was, err := loader.ResolveAlias(&cfg)
|
||||
Expect(was).To(BeTrue())
|
||||
Expect(err).To(MatchError(ContainSubstring("chains are not allowed")))
|
||||
})
|
||||
|
||||
It("rejects a dangling alias", func() {
|
||||
cfg := loader.configs["dangling"]
|
||||
_, _, err := loader.ResolveAlias(&cfg)
|
||||
Expect(err).To(MatchError(ContainSubstring("unknown model")))
|
||||
})
|
||||
|
||||
It("ValidateAliasTarget passes for a real target and fails for a chain", func() {
|
||||
good := loader.configs["gpt-4"]
|
||||
Expect(loader.ValidateAliasTarget(&good)).ToNot(HaveOccurred())
|
||||
bad := loader.configs["chain"]
|
||||
Expect(loader.ValidateAliasTarget(&bad)).To(MatchError(ContainSubstring("itself an alias")))
|
||||
})
|
||||
})
|
||||
|
||||
@@ -787,3 +787,32 @@ var _ = Describe("pattern detector config", func() {
|
||||
Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\"")))
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("ModelConfig alias", func() {
|
||||
It("reports IsAlias when alias is set", func() {
|
||||
c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
|
||||
Expect(c.IsAlias()).To(BeTrue())
|
||||
Expect(ModelConfig{Name: "real"}.IsAlias()).To(BeFalse())
|
||||
})
|
||||
|
||||
It("validates a minimal alias config", func() {
|
||||
c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"}
|
||||
ok, err := c.Validate()
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ok).To(BeTrue())
|
||||
})
|
||||
|
||||
It("rejects an alias pointing to itself", func() {
|
||||
c := ModelConfig{Name: "loop", Alias: "loop"}
|
||||
ok, err := c.Validate()
|
||||
Expect(ok).To(BeFalse())
|
||||
Expect(err).To(MatchError(ContainSubstring("itself")))
|
||||
})
|
||||
|
||||
It("rejects an alias that also sets a backend", func() {
|
||||
c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3", Backend: "llama-cpp"}
|
||||
ok, err := c.Validate()
|
||||
Expect(ok).To(BeFalse())
|
||||
Expect(err).To(MatchError(ContainSubstring("pure redirect")))
|
||||
})
|
||||
})
|
||||
|
||||
@@ -28,6 +28,7 @@ type RuntimeSettings struct {
|
||||
|
||||
// Eviction settings
|
||||
ForceEvictionWhenBusy *bool `json:"force_eviction_when_busy,omitempty"` // Force eviction even when models have active API calls (default: false for safety)
|
||||
SizeAwareEviction *bool `json:"size_aware_eviction,omitempty"` // Evict largest models first rather than least-recently-used (default: false)
|
||||
LRUEvictionMaxRetries *int `json:"lru_eviction_max_retries,omitempty"` // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
||||
LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
|
||||
|
||||
|
||||
56
core/config/serving_defaults.go
Normal file
56
core/config/serving_defaults.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// Serving-policy model-config defaults.
|
||||
//
|
||||
// Sibling to hardware_defaults.go: those fill values driven by the target
|
||||
// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values
|
||||
// that improve multi-request / multi-user *serving* regardless of the GPU. They
|
||||
// run together from SetDefaults and only ever fill values the user left unset.
|
||||
|
||||
// DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend
|
||||
// reuses across requests via KV-cache shifting. The llama.cpp backend ships this
|
||||
// disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system
|
||||
// prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed.
|
||||
// This is the universally-useful part of "paged attention" (cross-request prefix
|
||||
// sharing) and needs none of the block-KV machinery.
|
||||
const DefaultCacheReuse = 256
|
||||
|
||||
// ApplyServingDefaults fills serving-policy ModelConfig values the user left
|
||||
// unset. Currently: enable cross-request prefix caching. Explicit
|
||||
// cache_reuse/n_cache_reuse in the model options always wins.
|
||||
func ApplyServingDefaults(cfg *ModelConfig) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if !backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") {
|
||||
cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse))
|
||||
xlog.Debug("[serving_defaults] enabling cross-request prefix cache",
|
||||
"cache_reuse", DefaultCacheReuse)
|
||||
}
|
||||
}
|
||||
|
||||
// backendOptionSet reports whether opts already sets any of the given names.
//
// Each option is a "name:value" string or a bare "name"; only the part before
// the first ':' is considered. Matching ignores case and surrounding
// whitespace on both the option key and the candidate names, so callers do
// not have to pre-normalize either side. It is used so that defaults never
// override an explicit value. Shared with hardware_defaults.go.
//
// Returns false when opts or names is empty.
func backendOptionSet(opts []string, names ...string) bool {
	if len(opts) == 0 || len(names) == 0 {
		return false
	}
	for _, opt := range opts {
		// Cut yields the whole string as key when there is no ':' (bare option).
		key, _, _ := strings.Cut(opt, ":")
		key = strings.TrimSpace(strings.ToLower(key))
		for _, candidate := range names {
			// Normalize the candidate the same way as the key so the
			// comparison is symmetric.
			if key == strings.TrimSpace(strings.ToLower(candidate)) {
				return true
			}
		}
	}
	return false
}
|
||||
30
core/config/serving_defaults_test.go
Normal file
30
core/config/serving_defaults_test.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("Serving-policy config defaults", func() {
|
||||
Describe("ApplyServingDefaults (cross-request prefix cache)", func() {
|
||||
It("enables cache_reuse when unset", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(ContainElement("cache_reuse:256"))
|
||||
})
|
||||
It("never overrides an explicit cache_reuse", func() {
|
||||
cfg := &ModelConfig{Options: []string{"cache_reuse:0"}}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"}))
|
||||
})
|
||||
It("recognizes the n_cache_reuse alias", func() {
|
||||
cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}}
|
||||
ApplyServingDefaults(cfg)
|
||||
Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"}))
|
||||
})
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -70,4 +70,32 @@ var _ = Describe("PipelineVoiceRecognition", func() {
|
||||
Expect((Pipeline{VoiceRecognition: &PipelineVoiceRecognition{}}).VoiceGateEnabled()).To(BeTrue())
|
||||
})
|
||||
})
|
||||
|
||||
// Checks the EnforceGate default/override and the three identity predicates
// for configs with and without an identity block.
Describe("Enforce / Identity helpers", func() {
	It("treats a nil Enforce as enforcing (backward compatible)", func() {
		unset := PipelineVoiceRecognition{Model: "spk"}
		Expect(unset.EnforceGate()).To(BeTrue())
	})

	It("honors an explicit enforce:false", func() {
		disabled := false
		optedOut := PipelineVoiceRecognition{Model: "spk", Enforce: &disabled}
		Expect(optedOut.EnforceGate()).To(BeFalse())
	})

	It("reports identity disabled when no identity block is set", func() {
		noIdentity := PipelineVoiceRecognition{Model: "spk"}
		Expect(noIdentity.IdentityEnabled()).To(BeFalse())
		Expect(noIdentity.AnnounceEnabled()).To(BeFalse())
		Expect(noIdentity.PersonalizeEnabled()).To(BeFalse())
	})

	It("reports identity enabled when announce or personalize is on", func() {
		// Announce only.
		announcing := PipelineVoiceRecognition{
			Model:    "spk",
			Identity: &VoiceIdentityConfig{Announce: true},
		}
		Expect(announcing.IdentityEnabled()).To(BeTrue())
		Expect(announcing.AnnounceEnabled()).To(BeTrue())
		Expect(announcing.PersonalizeEnabled()).To(BeFalse())

		// Personalize only.
		personalizing := PipelineVoiceRecognition{
			Model:    "spk",
			Identity: &VoiceIdentityConfig{Personalize: true},
		}
		Expect(personalizing.IdentityEnabled()).To(BeTrue())
		Expect(personalizing.PersonalizeEnabled()).To(BeTrue())
	})
})
|
||||
})
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user