Compare commits

..

1 Commits

Author SHA1 Message Date
dependabot[bot]
0de0f8fcbb chore(deps): bump the pip group across 4 directories with 1 update
Bumps the pip group with 1 update in the /backend/python/ace-step directory: torch.
Bumps the pip group with 1 update in the /backend/python/llama-cpp-quantization directory: torch.
Bumps the pip group with 1 update in the /backend/python/sglang directory: torch.
Bumps the pip group with 1 update in the /backend/python/vllm-omni directory: torch.


Updates `torch` from 2.10.0+rocm7.0 to 2.12.0+cpu

Updates `torch` from 2.10.0+rocm7.0 to 2.12.0+cpu

Updates `torch` from 2.10.0+rocm7.0 to 2.12.0+cpu

Updates `torch` from 2.10.0+rocm7.0 to 2.12.0+cpu

Updates `torch` from 2.10.0 to 2.12.0+cpu

Updates `torch` from 2.10.0 to 2.12.0+cpu

Updates `torch` from 2.10.0 to 2.12.0+cpu

Updates `torch` from 2.10.0 to 2.12.0+cpu

Updates `torch` from 2.9.0 to 2.12.0+cpu

Updates `torch` from 2.9.0 to 2.12.0+cpu

Updates `torch` from 2.9.0 to 2.12.0+cpu

Updates `torch` from 2.9.0 to 2.12.0+cpu

Updates `torch` from 2.7.0 to 2.12.0+cu130

Updates `torch` from 2.7.0 to 2.12.0+cu130

Updates `torch` from 2.7.0 to 2.12.0+cu130

Updates `torch` from 2.7.0 to 2.12.0+cu130

---
updated-dependencies:
- dependency-name: torch
  dependency-version: 2.12.0+cpu
  dependency-type: direct:production
- dependency-name: torch
  dependency-version: 2.12.0+cpu
  dependency-type: direct:production
- dependency-name: torch
  dependency-version: 2.12.0+cpu
  dependency-type: direct:production
- dependency-name: torch
  dependency-version: 2.12.0+cu130
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-06-27 08:10:29 +00:00
275 changed files with 1572 additions and 18897 deletions

View File

@@ -7,11 +7,8 @@
# Runs only the checks relevant to what's staged:
# - Go files -> make lint + make test-coverage-check
# - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate)
# - realtime state machines / specs -> make test-realtime-conformance
# (respcoord/**, turncoord/**, or formal-verification/** -- a pure .fizz
# spec edit must still re-verify the design, detected separately from Go)
# A commit touching none of these is skipped entirely (other docs/YAML can't
# change lint findings, Go coverage, the UI, or the realtime conformance gate).
# A commit touching neither is skipped entirely (docs/YAML/etc. can't change
# lint findings, Go coverage, or the UI).
#
# To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify
set -eu
@@ -23,13 +20,11 @@ staged="$(git diff --cached --name-only --diff-filter=ACMRD)"
go_changed=0
ui_changed=0
rt_changed=0
if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi
if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi
if echo "$staged" | grep -qE '^(core/http/endpoints/openai/(coordinator|respcoord|turncoord|conncoord|compactcoord|ttscoord)/|formal-verification/)'; then rt_changed=1; fi
if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ] && [ "$rt_changed" -eq 0 ]; then
echo "pre-commit: no Go, React UI, or realtime-spec changes staged — skipping."
if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ]; then
echo "pre-commit: no Go or React UI changes staged — skipping."
exit 0
fi
@@ -62,11 +57,4 @@ if [ "$ui_changed" -eq 1 ]; then
make test-ui-coverage-check
fi
if [ "$rt_changed" -eq 1 ]; then
echo "pre-commit ▶ realtime state-machine conformance (make test-realtime-conformance) —"
echo " Go transition/rapid tests under -race + FizzBee model check of the"
echo " authoritative specs. Fail-closed: needs FizzBee (make install-fizzbee)."
make test-realtime-conformance
fi
echo "pre-commit ✓ all relevant checks passed"

View File

@@ -3745,302 +3745,6 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# voice-detect
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-voice-detect'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-voice-detect'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect'
base-image: "ubuntu:24.04"
ubuntu-version: '2404'
runs-on: 'ubuntu-24.04-arm'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-voice-detect'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-voice-detect'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sycl-f32-voice-detect'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sycl-f16-voice-detect'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-voice-detect'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-voice-detect'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-voice-detect'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2204'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-voice-detect'
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
runs-on: 'ubuntu-latest'
skip-drivers: 'false'
backend: "voice-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# face-detect
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-face-detect'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-face-detect'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-face-detect'
base-image: "ubuntu:24.04"
ubuntu-version: '2404'
runs-on: 'ubuntu-24.04-arm'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-face-detect'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-face-detect'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sycl-f32-face-detect'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f16'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-sycl-f16-face-detect'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-face-detect'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-face-detect'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-face-detect'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2204'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-face-detect'
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
runs-on: 'ubuntu-latest'
skip-drivers: 'false'
backend: "face-detect"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# acestep-cpp
- build-type: ''
cuda-major-version: ""
@@ -5224,14 +4928,6 @@ includeDarwin:
tag-suffix: "-metal-darwin-arm64-ced"
build-type: "metal"
lang: "go"
- backend: "voice-detect"
tag-suffix: "-metal-darwin-arm64-voice-detect"
build-type: "metal"
lang: "go"
- backend: "face-detect"
tag-suffix: "-metal-darwin-arm64-face-detect"
build-type: "metal"
lang: "go"
- backend: "acestep-cpp"
tag-suffix: "-metal-darwin-arm64-acestep-cpp"
build-type: "metal"
@@ -5295,6 +4991,9 @@ includeDarwin:
- backend: "qwen-tts"
tag-suffix: "-metal-darwin-arm64-qwen-tts"
build-type: "mps"
- backend: "fish-speech"
tag-suffix: "-metal-darwin-arm64-fish-speech"
build-type: "mps"
- backend: "voxcpm"
tag-suffix: "-metal-darwin-arm64-voxcpm"
build-type: "mps"

View File

@@ -82,7 +82,7 @@ jobs:
# as the Linux registry cache.
- name: Restore Homebrew cache
id: brew-cache
uses: actions/cache/restore@v6
uses: actions/cache/restore@v4
with:
path: |
~/Library/Caches/Homebrew/downloads
@@ -142,7 +142,7 @@ jobs:
- name: Save Homebrew cache
if: github.event_name != 'pull_request' && steps.brew-cache.outputs.cache-hit != 'true'
uses: actions/cache/save@v6
uses: actions/cache/save@v4
with:
path: |
~/Library/Caches/Homebrew/downloads
@@ -178,7 +178,7 @@ jobs:
- name: Restore ccache
if: inputs.backend == 'llama-cpp'
id: ccache-cache
uses: actions/cache/restore@v6
uses: actions/cache/restore@v4
with:
path: ~/Library/Caches/ccache
key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}
@@ -211,7 +211,7 @@ jobs:
- name: Restore Python wheel cache
if: inputs.lang == 'python'
id: pyenv-cache
uses: actions/cache/restore@v6
uses: actions/cache/restore@v4
with:
path: |
~/Library/Caches/pip
@@ -256,14 +256,14 @@ jobs:
- name: Save ccache
if: inputs.backend == 'llama-cpp' && github.event_name != 'pull_request'
uses: actions/cache/save@v6
uses: actions/cache/save@v4
with:
path: ~/Library/Caches/ccache
key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}
- name: Save Python wheel cache
if: inputs.lang == 'python' && github.event_name != 'pull_request' && steps.pyenv-cache.outputs.cache-hit != 'true'
uses: actions/cache/save@v6
uses: actions/cache/save@v4
with:
path: |
~/Library/Caches/pip

View File

@@ -46,14 +46,6 @@ jobs:
variable: "CED_VERSION"
branch: "master"
file: "backend/go/ced/Makefile"
- repository: "mudler/voice-detect.cpp"
variable: "VOICEDETECT_VERSION"
branch: "master"
file: "backend/go/voice-detect/Makefile"
- repository: "mudler/face-detect.cpp"
variable: "FACEDETECT_VERSION"
branch: "master"
file: "backend/go/face-detect/Makefile"
- repository: "mudler/depth-anything.cpp"
variable: "DEPTHANYTHING_VERSION"
branch: "master"

View File

@@ -1,69 +0,0 @@
---
name: 'realtime-conformance'
# Verifies the realtime state-machine implementations conform to their formal
# designs (docs/design/realtime-state-machines.md, formal-verification/). BOTH
# layers are enforced and the gate is fail-closed: the Go conformance layer
# (respcoord + turncoord transition/rapid tests under -race) AND the FizzBee model check of
# the authoritative specs. FizzBee is pinned + checksum-verified
# (formal-verification/fizzbee.sha256), so a failed install fails the job rather
# than silently skipping verification.
on:
pull_request:
paths:
- 'core/http/endpoints/openai/coordinator/**'
- 'core/http/endpoints/openai/respcoord/**'
- 'core/http/endpoints/openai/turncoord/**'
- 'core/http/endpoints/openai/conncoord/**'
- 'core/http/endpoints/openai/compactcoord/**'
- 'core/http/endpoints/openai/ttscoord/**'
- 'formal-verification/**'
- 'scripts/realtime-conformance.sh'
- 'scripts/install-fizzbee.sh'
- '.github/workflows/realtime-conformance.yml'
push:
branches:
- master
paths:
- 'core/http/endpoints/openai/coordinator/**'
- 'core/http/endpoints/openai/respcoord/**'
- 'core/http/endpoints/openai/turncoord/**'
- 'core/http/endpoints/openai/conncoord/**'
- 'core/http/endpoints/openai/compactcoord/**'
- 'core/http/endpoints/openai/ttscoord/**'
- 'formal-verification/**'
- 'scripts/realtime-conformance.sh'
concurrency:
group: realtime-conformance-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
jobs:
conformance:
runs-on: ubuntu-latest
strategy:
matrix:
go-version: ['1.26.x']
steps:
- name: Clone
uses: actions/checkout@v7
- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go-version }}
cache: false
- name: Cache FizzBee
uses: actions/cache@v4
with:
path: .tools/fizzbee
key: fizzbee-v0.5.2-${{ runner.os }}-${{ hashFiles('formal-verification/fizzbee.sha256') }}
- name: Install FizzBee (pinned, checksum-verified)
# No `|| true`: a failed/forged download must fail the job, not silently
# drop the design verification. install-fizzbee.sh is a no-op if the
# cached binary is already present and valid.
run: ./scripts/install-fizzbee.sh
- name: Run conformance gate (fail-closed)
# No skip env: both the Go conformance and the FizzBee model check are
# required. The gate auto-detects .tools/fizzbee/fizz.
run: make test-realtime-conformance

View File

@@ -1008,11 +1008,7 @@ jobs:
# image + working dir.
tests-vibevoice-cpp-grpc-transcription:
needs: detect-changes
# Skip on release tag pushes: the ASR Q4_K model is ~10 GB and cannot be
# pulled from HF within the inner `go test -timeout 30m` budget on a CI
# runner, so every tag build hung and timed out. Still runs on PRs/branch
# pushes that touch vibevoice-cpp so regressions are caught off the release path.
if: (needs.detect-changes.outputs.vibevoice-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true') && !startsWith(github.ref, 'refs/tags/')
if: needs.detect-changes.outputs.vibevoice-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
runs-on: bigger-runner
timeout-minutes: 150
steps:

9
.gitignore vendored
View File

@@ -97,12 +97,3 @@ core/http/react-ui/test-results/
# Local Apple signing material (never commit)
.certs/
# Pinned dev tools (e.g. FizzBee for the realtime-conformance gate)
.tools/
# FizzBee model-check artifacts: the parser emits <spec>.json next to each
# .fizz and the checker writes run dirs under out/. Both are regenerated by
# the realtime-conformance gate; only the .fizz sources are authoritative.
formal-verification/*.json
formal-verification/out/

View File

@@ -171,17 +171,6 @@ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
ln -s /opt/rocm-**/lib/llvm/lib/libomp.so /usr/lib/libomp.so \
; fi
# ROCm's bundled libdrm_amdgpu is built with a hardcoded fallback lookup path
# for the ASIC ID table (/opt/amdgpu/share/libdrm/amdgpu.ids), which only exists
# if AMD's full amdgpu graphics/DKMS stack is installed. This compute-only image
# doesn't have it, so hipblas/rocBLAS log "No such file or directory" on every
# model load and can fail to identify the GPU. Point it at the equivalent file
# Ubuntu's libdrm-common package already ships.
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ -f /usr/share/libdrm/amdgpu.ids ] && [ ! -e /opt/amdgpu/share/libdrm/amdgpu.ids ]; then \
mkdir -p /opt/amdgpu/share/libdrm && \
ln -s /usr/share/libdrm/amdgpu.ids /opt/amdgpu/share/libdrm/amdgpu.ids \
; fi
RUN expr "${BUILD_TYPE}" = intel && echo "intel" > /run/localai/capability || echo "not intel"
# Cuda

View File

@@ -405,18 +405,6 @@ test-realtime: build-mock-backend
@echo 'Running realtime e2e tests (mock backend)'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
# Verify the realtime state-machine implementations conform to their formal
# designs (Go transition/rapid tests under -race + FizzBee model check of the
# authoritative specs). See docs/design/realtime-state-machines.md (Part 6) and
# docs/design/specs/README.md.
test-realtime-conformance:
GOCMD=$(GOCMD) ./scripts/realtime-conformance.sh
# Install the pinned, checksum-verified FizzBee model checker (into .tools/,
# gitignored) used by test-realtime-conformance. Idempotent; no-op if present.
install-fizzbee:
./scripts/install-fizzbee.sh
# Container-based real-model realtime testing. Build env vars / pipeline
# definition kept here so test-realtime-models-docker can drive a fully wired
# pipeline (VAD + STT + LLM + TTS) from inside a containerised runner.
@@ -1039,7 +1027,7 @@ test-extra-backend-whisper-transcription: docker-build-whisper
## is reachable.
test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/realtime_eou_120m-v1-f16.gguf \
BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \
BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
BACKEND_TEST_CAPS=health,load,transcription \
$(MAKE) test-extra-backend
@@ -1482,13 +1470,8 @@ build-launcher-darwin:
mv cmd/launcher/LocalAI.app dist/LocalAI.app
bash contrib/macos/sign-and-notarize.sh sign dist/LocalAI.app
# Notarize + staple the .app itself, then wrap it into a drag-to-Applications
# DMG via hdiutil and sign the DMG. The app is stapled BEFORE packaging so the
# bundle carries its own ticket and verifies offline (a dmg-only staple leaves
# the app relying on an online Gatekeeper check, which fails offline / once the
# app is copied out of the dmg). No-op without notary secrets.
# Wrap the (signed) app into a drag-to-Applications DMG via hdiutil, then sign the DMG.
dmg-launcher-darwin: build-launcher-darwin
bash contrib/macos/sign-and-notarize.sh notarize-app dist/LocalAI.app
rm -rf dist/dmg dist/LocalAI.dmg
mkdir -p dist/dmg
cp -R dist/LocalAI.app dist/dmg/LocalAI.app
@@ -1500,7 +1483,7 @@ dmg-launcher-darwin: build-launcher-darwin
notarize-launcher-darwin: dmg-launcher-darwin
bash contrib/macos/sign-and-notarize.sh notarize dist/LocalAI.dmg
# Single entrypoint for CI: build -> sign app -> notarize+staple app -> dmg -> sign dmg -> notarize+staple dmg.
# Single entrypoint for CI: build -> sign app -> dmg -> sign dmg -> notarize -> staple.
release-launcher-darwin: notarize-launcher-darwin
@echo "dist/LocalAI.dmg is ready"

View File

@@ -177,7 +177,6 @@ For more details, see the [Getting Started guide](https://localai.io/basics/gett
## Latest News
- **June 2026**: New native biometric backends from the LocalAI team: [voice-detect.cpp](https://github.com/mudler/voice-detect.cpp) for speaker recognition and voice analysis (ECAPA-TDNN, WeSpeaker, ERes2Net, CAM++, wav2vec2 age/gender/emotion) and [face-detect.cpp](https://github.com/mudler/face-detect.cpp) for face detection, recognition, demographics and anti-spoofing (SCRFD/ArcFace, YuNet/SFace). Both are from-scratch C++/ggml engines with no Python or onnxruntime at inference, self-contained GGUF weights, bit-exact parity with the reference, and GPU cuDNN parity, replacing the heavier Python `insightface` and `speaker-recognition` backends ([PR #10441](https://github.com/mudler/LocalAI/pull/10441)).
- **June 2026**: New [realtime voice assistant demo](https://github.com/localai-org/localai-realtime-demo) (a tiny Go client for the Realtime API with a full talk-back voice loop and tool calling), plus [streaming of the realtime LLM / TTS / transcription pipeline stages](https://github.com/mudler/LocalAI/pull/10176) and [configurable WebRTC ICE candidates](https://github.com/mudler/LocalAI/pull/10231).
- **June 2026**: Big speech push: the [parakeet.cpp](https://github.com/mudler/parakeet.cpp) ASR engine gains [NeMo-faithful segment timestamps](https://github.com/mudler/LocalAI/pull/10207), a [multilingual streaming Nemotron-3.5 model](https://github.com/mudler/LocalAI/pull/10199), [dynamic batching for concurrent transcription](https://github.com/mudler/LocalAI/pull/10112) and [CUDA graphs](https://github.com/mudler/LocalAI/pull/10273); the new [CrispASR backend](https://github.com/mudler/LocalAI/pull/10099) adds multi-architecture ASR + TTS, and [60 Piper TTS voices across 42 languages](https://github.com/mudler/LocalAI/pull/10296) land in the gallery (plus [per-request TTS instructions and params](https://github.com/mudler/LocalAI/pull/10172)).
- **June 2026**: New backends and models: [locate-anything.cpp](https://github.com/mudler/LocalAI/pull/10264) for open-vocabulary object detection via ggml, [Ideogram4 image generation](https://github.com/mudler/LocalAI/pull/10201) in stablediffusion-ggml, [llama.cpp video input](https://github.com/mudler/LocalAI/pull/10216), and the [Gemma 4 QAT family with MTP speculative-decoding pairs](https://github.com/mudler/LocalAI/pull/10215). Plus an [interactive CLI chat mode](https://github.com/mudler/LocalAI/pull/10226) and [RAG source citations in agent responses](https://github.com/mudler/LocalAI/pull/10228).

View File

@@ -137,7 +137,7 @@ RUN <<EOT bash
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi
apt-get clean && \
rm -rf /var/lib/apt/lists/*

View File

@@ -18,18 +18,6 @@ service Backend {
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
// AudioTranscriptionLive is the bidirectional live-microphone ASR RPC. The
// first message MUST carry a Config; subsequent messages carry Audio frames
// (mono float PCM at config.sample_rate, 16 kHz default). After a
// successful open the backend replies with a single ready ack
// (TranscriptLiveResponse{ready:true}); backends or models without
// cache-aware streaming support return UNIMPLEMENTED instead. Newly
// finalized text streams back as deltas; eou=true marks the model's
// end-of-utterance token. One stream spans many utterances (the decoder
// resets itself after each EOU). Closing the send side finalizes: the
// backend flushes the decoder tail and emits a terminal message carrying
// final_result. A second Config mid-stream resets the decode session.
rpc AudioTranscriptionLive(stream TranscriptLiveRequest) returns (stream TranscriptLiveResponse) {}
rpc TTS(TTSRequest) returns (Result) {}
rpc TTSStream(TTSRequest) returns (stream Reply) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -491,10 +479,6 @@ message TranscriptResult {
string text = 2;
string language = 3;
float duration = 4;
// True when the decode ended on the model's end-of-utterance special token
// (<EOU>/<EOB>, emitted by cache-aware streaming models such as
// parakeet_realtime_eou_120m-v1). The marker itself is stripped from text.
bool eou = 5;
}
message TranscriptStreamResponse {
@@ -502,34 +486,6 @@ message TranscriptStreamResponse {
TranscriptResult final_result = 2;
}
// === AudioTranscriptionLive messages =====================================
message TranscriptLiveRequest {
oneof payload {
TranscriptLiveConfig config = 1;
TranscriptLiveAudio audio = 2;
}
}
message TranscriptLiveConfig {
string language = 1; // "" => model default
int32 sample_rate = 2; // 0 => 16000; backends may reject others
map<string, string> params = 3; // backend-specific tuning
}
message TranscriptLiveAudio {
repeated float pcm = 1; // mono PCM in [-1,1] at config.sample_rate
}
message TranscriptLiveResponse {
bool ready = 1; // open ack: sent once, before any delta
string delta = 2; // newly-finalized text since previous response
bool eou = 3; // <EOU> fired during this feed (the user yielded the turn)
repeated TranscriptWord words = 4; // words finalized by this feed (stream-relative ns)
TranscriptResult final_result = 5; // terminal message only, after the send side closes
bool eob = 6; // <EOB> fired: a backchannel ("uh-huh") ended — NOT a turn boundary
}
message TranscriptWord {
int64 start = 1;
int64 end = 2;

View File

@@ -1,6 +1,15 @@
## Multimodal support is provided by the in-tree `mtmd` library target
## (examples/mtmd/), which the grpc-server links and includes below. clip/llava
## were pruned upstream; the high-level mtmd_* / mtmd_helper_* API is used instead.
## Clip/LLaVA library for multimodal support — built locally from copied sources
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)
target_include_directories(myclip PUBLIC ../../common)
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual)
endif()
set(TARGET grpc-server)
set(CMAKE_CXX_STANDARD 17)
@@ -58,16 +67,12 @@ add_library(hw_grpc_proto
${hw_proto_hdrs} )
add_executable(${TARGET} grpc-server.cpp json.hpp)
# mtmd public headers (mtmd.h / mtmd-helper.h) live in examples/mtmd/.
# Linking the mtmd target also propagates this include dir, but we add it
# explicitly for clarity.
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
absl::flags_parse
gRPC::${_REFLECTION}
gRPC::${_GRPC_GRPCPP}
protobuf::${_PROTOBUF_LIBPROTOBUF})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,5 +1,5 @@
IK_LLAMA_VERSION?=87fc8701ff4da81a7d2a91ec0695f95eb3066a47
IK_LLAMA_VERSION?=b84902d2ad27c34f989f23947200c4b91b1568fd
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
CMAKE_ARGS?=

View File

@@ -11,8 +11,8 @@
#include <memory>
#include <string>
#include <getopt.h>
#include "mtmd.h"
#include "mtmd-helper.h"
#include "clip.h"
#include "llava.h"
#include "log.h"
#include "common.h"
#include "json.hpp"
@@ -45,9 +45,7 @@ using backend::HealthMessage;
///// LLAMA.CPP server code below
// Match mtmd.h and ik_llama's server/common headers, which all use
// nlohmann::ordered_json; a plain nlohmann::json alias collides at global scope.
using json = nlohmann::ordered_json;
using json = nlohmann::json;
struct server_params
{
@@ -221,11 +219,6 @@ struct llama_client_slot
// multimodal
std::vector<slot_image> images;
// Full prompt with mtmd media markers (mtmd_default_marker()) substituted in
// place of the legacy [img-N] tags, covering the text up to and including the
// last image. The text after the last image is kept in params.input_suffix and
// decoded through the normal token path so the sampling loop is unchanged.
std::string mtmd_prompt;
// stats
size_t sent_count = 0;
@@ -259,14 +252,14 @@ struct llama_client_slot
for (slot_image & img : images)
{
if (img.bitmap) {
mtmd_bitmap_free(img.bitmap);
img.bitmap = nullptr;
free(img.image_embedding);
if (img.img_data) {
clip_image_u8_free(img.img_data);
}
img.prefix_prompt = "";
}
images.clear();
mtmd_prompt = "";
}
bool has_budget(gpt_params &global_params) {
@@ -403,13 +396,46 @@ struct llama_metrics {
}
};
struct llava_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
struct llama_server_context
{
llama_model *model = nullptr;
llama_context *ctx = nullptr;
const llama_vocab * vocab = nullptr;
mtmd_context *mctx = nullptr;
clip_ctx *clp_ctx = nullptr;
gpt_params params;
@@ -465,6 +491,11 @@ struct llama_server_context
if (!params.mmproj.path.empty()) {
multimodal = true;
LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.path.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) {
LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
return false;
}
if (params.n_ctx < 2048) { // request larger context for the image embedding
params.n_ctx = 2048;
@@ -481,24 +512,10 @@ struct llama_server_context
}
if (multimodal) {
// mtmd_init_from_file requires the already-loaded text model, so it must
// run AFTER llama_init_from_gpt_params. It validates the projector
// against the model internally and returns nullptr on dim mismatch, so
// the explicit clip_n_mmproj_embd check is no longer needed.
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params.mmproj_use_gpu;
mparams.print_timings = false;
mparams.n_threads = params.n_threads_mtmd != -1 ? params.n_threads_mtmd
: params.n_threads_batch != -1 ? params.n_threads_batch
: params.n_threads;
mparams.verbosity = GGML_LOG_LEVEL_INFO;
mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED
: LLAMA_FLASH_ATTN_TYPE_DISABLED;
mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens;
mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
if (mctx == nullptr) {
LOG_ERR("unable to load multimodal projector: %s", params.mmproj.path.c_str());
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
const int n_embd_llm = llama_model_n_embd(model);
if (n_embd_clip != n_embd_llm) {
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
llama_free(ctx);
llama_free_model(model);
return false;
@@ -848,8 +865,8 @@ struct llama_server_context
slot_image img_sl;
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
img_sl.bitmap = mtmd_helper_bitmap_init_from_buf(mctx, image_buffer.data(), image_buffer.size());
if (img_sl.bitmap == nullptr)
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
__func__,
@@ -862,74 +879,50 @@ struct llama_server_context
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
// Translate the legacy [img-N] tags into mtmd media markers, in
// order, and collect the matching bitmaps in marker order so they
// line up with the markers passed to mtmd_tokenize(). The text after
// the last image stays in input_suffix and is decoded through the
// normal token path, so the sampling loop is unchanged.
// example: system prompt [img-102] user [img-103] describe [img-134]
// process prompt
// example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
if (slot->images.size() > 0 && !slot->prompt.is_array())
{
const std::string marker = mtmd_default_marker();
std::string prompt = slot->prompt.get<std::string>();
std::string built_prompt;
std::vector<slot_image> ordered;
size_t pos = 0, copy_from = 0;
size_t pos = 0, begin_prefix = 0;
std::string pattern = "[img-";
auto free_images = [&]() {
for (slot_image &img : slot->images) {
if (img.bitmap) {
mtmd_bitmap_free(img.bitmap);
img.bitmap = nullptr;
}
}
slot->images.clear();
};
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
size_t tag_begin = pos;
size_t end_prefix = pos;
pos += pattern.length();
size_t end_pos = prompt.find(']', pos);
if (end_pos == std::string::npos) {
break;
}
std::string image_id = prompt.substr(pos, end_pos - pos);
try
if (end_pos != std::string::npos)
{
int img_id = std::stoi(image_id);
bool found = false;
for (slot_image &img : slot->images)
std::string image_id = prompt.substr(pos, end_pos - pos);
try
{
if (img.id == img_id) {
found = true;
// text before this tag, then the media marker
built_prompt += prompt.substr(copy_from, tag_begin - copy_from);
built_prompt += marker;
copy_from = end_pos + 1;
ordered.push_back(img);
break;
int img_id = std::stoi(image_id);
bool found = false;
for (slot_image &img : slot->images)
{
if (img.id == img_id) {
found = true;
img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
begin_prefix = end_pos + 1;
break;
}
}
}
if (!found) {
LOG("ERROR: Image with id: %i, not found.\n", img_id);
free_images();
if (!found) {
LOG("ERROR: Image with id: %i, not found.\n", img_id);
slot->images.clear();
return false;
}
} catch (const std::invalid_argument& e) {
LOG("Invalid image number id in prompt\n");
slot->images.clear();
return false;
}
} catch (const std::invalid_argument& e) {
LOG("Invalid image number id in prompt\n");
free_images();
return false;
}
pos = end_pos + 1;
}
// bitmaps are consumed in marker order by mtmd_tokenize()
slot->images = ordered;
slot->mtmd_prompt = built_prompt;
slot->prompt = "";
slot->params.input_suffix = prompt.substr(copy_from);
slot->params.input_suffix = prompt.substr(begin_prefix);
slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
}
}
@@ -1183,10 +1176,21 @@ struct llama_server_context
bool process_images(llama_client_slot &slot) const
{
// With the mtmd pipeline, image encoding is no longer eager: the bitmaps
// are tokenized and encoded together with the surrounding text inside
// ingest_images() via mtmd_tokenize() + mtmd_helper_eval_chunks(). This
// just reports whether the slot carries any images to process.
for (slot_image &img : slot.images)
{
if (!img.request_encode_image)
{
continue;
}
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG("Error processing the given image");
return false;
}
img.request_encode_image = false;
}
return slot.images.size() > 0;
}
@@ -1431,70 +1435,69 @@ struct llama_server_context
}
}
// Tokenize the multimodal prompt (text interleaved with media markers) together
// with the slot's bitmaps, then decode the resulting chunks into the llama
// context via the high-level mtmd helper. The helper runs llama_decode() on the
// text chunks and mtmd_encode() + llama_decode() on the image chunks, handling
// batching and any pre/post decode setup (e.g. non-causal attention for gemma3).
// Advances slot.n_past by the number of positions consumed, then leaves the
// post-image suffix tokens in `batch` so the normal decode + sampling loop
// produces the first generated token.
// for multiple images processing
bool ingest_images(llama_client_slot &slot, int n_batch)
{
if (mctx == nullptr)
{
LOG("%s : multimodal context is not initialized\n", __func__);
return false;
}
int image_idx = 0;
// bitmaps stay owned by slot.images (freed on reset()); pass non-owning ptrs
std::vector<const mtmd_bitmap *> bitmaps;
bitmaps.reserve(slot.images.size());
for (const slot_image &img : slot.images)
while (image_idx < (int) slot.images.size())
{
bitmaps.push_back(img.bitmap);
}
slot_image &img = slot.images[image_idx];
mtmd_input_text inp_txt;
inp_txt.text = slot.mtmd_prompt.c_str();
inp_txt.add_special = add_bos_token;
inp_txt.parse_special = true;
// process prefix prompt
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
};
if (llama_decode(ctx, batch_view))
{
LOG("%s : failed to eval\n", __func__);
return false;
}
}
mtmd::input_chunks chunks(mtmd_input_chunks_init());
int32_t res = mtmd_tokenize(mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps.data(),
bitmaps.size());
if (res != 0)
{
LOG("%s : failed to tokenize multimodal prompt, res = %d\n", __func__, res);
return false;
}
// process image with llm
for (int i = 0; i < img.image_tokens; i += n_batch)
{
int n_eval = img.image_tokens - i;
if (n_eval > n_batch)
{
n_eval = n_batch;
}
const llama_pos start_pos = (llama_pos) system_tokens.size() + slot.n_past;
llama_pos new_n_past = start_pos;
if (mtmd_helper_eval_chunks(mctx,
ctx,
chunks.ptr.get(),
start_pos,
slot.id,
n_batch,
/*logits_last=*/ false,
&new_n_past) != 0)
{
LOG("%s : failed to eval multimodal chunks\n", __func__);
return false;
}
slot.n_past += (int32_t) (new_n_past - start_pos);
const int n_embd = llama_model_n_embd(model);
float * embd = img.image_embedding + i * n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
if (llama_decode(ctx, llava_batch.batch))
{
LOG("%s : failed to eval image\n", __func__);
return false;
}
slot.n_past += n_eval;
}
image_idx++;
// queue the post-image suffix text for the normal decode + sampling path
common_batch_clear(batch);
std::vector<llama_token> suffix_tokens = tokenize(slot.params.input_suffix, false);
for (llama_token tok : suffix_tokens)
{
common_batch_add(batch, tok, system_tokens.size() + slot.n_past, { slot.id }, false);
slot.n_past += 1;
common_batch_clear(batch);
// append prefix of next image
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
slot.params.input_suffix : // no more images, then process suffix prompt
(json)(slot.images[image_idx].prefix_prompt);
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
for (int i = 0; i < (int) append_tokens.size(); ++i)
{
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
slot.n_past += 1;
}
}
return true;
@@ -1881,11 +1884,8 @@ struct llama_server_context
const bool has_images = process_images(slot);
// For the multimodal path the whole pre-image / inter-image text is
// tokenized and decoded inside ingest_images() via mtmd, so no prefix
// tokens are queued here; the post-image suffix is appended by
// ingest_images() for the normal decode + sampling loop.
std::vector<llama_token> prefix_tokens = has_images ? std::vector<llama_token>() : prompt_tokens;
// process the prefix of first image
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;

View File

@@ -0,0 +1,11 @@
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2494,7 +2494,7 @@
}
new_data = work.data();
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr, nullptr);
} else {
new_type = cur->type;
new_data = cur->data;

View File

@@ -17,9 +17,28 @@ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -r utils.hpp llama.cpp/examples/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/
## Multimodal support is provided by the `mtmd` library target (examples/mtmd/),
## which the grpc-server links and includes directly. No source copy is needed:
## clip/llava were pruned upstream and the high-level mtmd_* API is used instead.
## Copy clip/llava files for multimodal support (built as myclip library)
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
# Prepend llama.h include to llava.h
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
# Copy clip-impl.h if it exists
if [ -f llama.cpp/examples/llava/clip-impl.h ]; then
cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
fi
# Copy stb_image.h
if [ -f llama.cpp/vendor/stb/stb_image.h ]; then
cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
elif [ -f llama.cpp/common/stb_image.h ]; then
cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
fi
## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd)
if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then
sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp
fi
set +e
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then

View File

@@ -11,12 +11,9 @@
#include "json.hpp"
#include "mtmd.h"
#include "clip.h"
// mtmd.h and ik_llama's entire server/common stack (chat.h, server-common.h,
// server-task.h, ...) declare `using json = nlohmann::ordered_json`, so match it
// here: a plain `nlohmann::json` alias collides with mtmd.h's at global scope.
using json = nlohmann::ordered_json;
using json = nlohmann::json;
extern bool server_verbose;
@@ -114,12 +111,13 @@ struct slot_image
{
int32_t id;
// mtmd bitmap (image/audio) decoded from the request buffer. Owned by the
// slot; freed via mtmd_bitmap_free() on reset. The high-level mtmd pipeline
// (mtmd_tokenize + mtmd_helper_eval_chunks) consumes these directly, so the
// legacy eager-encode fields (embedding/tokens) and per-image prefix prompt
// are no longer needed.
mtmd_bitmap * bitmap = nullptr;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 * img_data;
std::string prefix_prompt; // before of this image
};
// completion token output with probabilities

View File

@@ -101,13 +101,4 @@ if(LLAMA_GRPC_BUILD_TESTS)
target_link_libraries(message_content_test PRIVATE ${_LLAMA_COMMON_TARGET})
target_compile_features(message_content_test PRIVATE cxx_std_17)
add_test(NAME message_content_test COMMAND message_content_test)
# Parent-death watcher test (parent_watch.h) — standard library only, but
# needs a threading runtime for std::thread.
find_package(Threads REQUIRED)
add_executable(parent_watch_test parent_watch_test.cpp parent_watch.h)
target_include_directories(parent_watch_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(parent_watch_test PRIVATE Threads::Threads)
target_compile_features(parent_watch_test PRIVATE cxx_std_17)
add_test(NAME parent_watch_test COMMAND parent_watch_test)
endif()

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=fdb1db877c526ec90f668eca1b858da5dba85560
LLAMA_VERSION?=9d5d882d8cd0f0a9283d87ed5e6fe3ee0d925fb1
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=
@@ -156,11 +156,11 @@ llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target ggml-rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/ggml-rpc-server llama-cpp-rpc-server
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
llama.cpp:
mkdir -p llama.cpp

View File

@@ -30,19 +30,6 @@
#define LOCALAI_HAS_SERVER_SCHEMA 1
#include "server-schema.cpp"
#endif
// server-stream.cpp exists only in llama.cpp after the upstream refactor that
// added the SSE stream-resumption layer (stream_session/stream_pipe_producer).
// server-context.cpp calls into it (spipe->cleanup(), stream_aware_should_stop,
// stream_session_attach_pipe), so its definitions must be part of this
// translation unit or the link fails with "undefined reference to
// stream_pipe_producer::cleanup()". The file is self-contained (its only
// external symbols come from server-common, already pulled in above) and the
// http route-handler factories it also defines are unused here but harmless.
// __has_include keeps the source compatible with older pins/forks that predate
// the split.
#if __has_include("server-stream.cpp")
#include "server-stream.cpp"
#endif
#include "server-context.cpp"
// LocalAI
@@ -75,8 +62,6 @@
#include <windows.h>
#endif
#include "parent_watch.h" // best-effort parent-death backstop (see header)
using grpc::Server;
using grpc::ServerBuilder;
@@ -3444,10 +3429,6 @@ int main(int argc, char** argv) {
}
}
// Best-effort backstop: self-terminate if the LocalAI process that spawned
// us dies without cleaning us up (see parent_watch.h).
llama_grpc::start_parent_death_watcher();
server_context ctx_server;
BackendServiceImpl service(ctx_server);

View File

@@ -1,179 +0,0 @@
// Parent-death watcher (best-effort backstop) for the llama.cpp gRPC backend.
//
// LocalAI spawns this backend as a child process and, on a clean shutdown,
// tears it down itself (SIGTERM -> grace -> SIGKILL). That graceful path only
// runs when LocalAI receives a catchable signal and lives long enough to run
// its handlers. If LocalAI is SIGKILLed (e.g. a supervising process's grace
// period elapses first), that teardown never runs and this backend would be
// reparented to init and linger, holding VRAM and its listen port.
//
// The watcher here is a best-effort backstop for exactly that case: it does
// NOT replace the graceful teardown, it only covers the "parent vanished
// without cleaning up" path. It detects reparenting: when the process that
// spawned this backend dies, the kernel reparents us to the nearest sub-reaper
// or to init (PID 1), so getppid() stops matching the value captured at
// startup. This getppid() approach is portable across Linux/macOS (unlike the
// Linux-only PR_SET_PDEATHSIG), which is why it is used here, mirroring the Go
// backends' pkg/grpc/parentwatch.go. It is disabled on Windows, which has no
// equivalent orphan-reparenting semantics.
//
// This header is intentionally dependency-free (C++ standard library only) so
// it can be exercised by a standalone unit test (parent_watch_test.cpp) without
// building the full llama.cpp + gRPC backend.
#ifndef LLAMA_GRPC_PARENT_WATCH_H
#define LLAMA_GRPC_PARENT_WATCH_H
#include <algorithm>
#include <cctype>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <string>
#include <thread>
#if !defined(_WIN32)
#include <unistd.h> // getppid(2), _exit(2)
#endif
namespace llama_grpc {
// Env var names are shared verbatim with the Go and Python backends for
// consistency across languages.
inline const char *kEnvParentWatch() { return "LOCALAI_BACKEND_PARENT_WATCH"; }
inline const char *kEnvParentWatchInterval() { return "LOCALAI_BACKEND_PARENT_WATCH_INTERVAL"; }
// Default poll interval in milliseconds. Matches the Go side's 2 * time.Second.
inline long parent_watch_default_interval_ms() { return 2000; }
namespace detail {
inline std::string trim_lower(const std::string &in, bool lower) {
size_t a = in.find_first_not_of(" \t\r\n");
size_t b = in.find_last_not_of(" \t\r\n");
if (a == std::string::npos) {
return "";
}
std::string s = in.substr(a, b - a + 1);
if (lower) {
std::transform(s.begin(), s.end(), s.begin(),
[](unsigned char c) { return std::tolower(c); });
}
return s;
}
} // namespace detail
// parent_watch_enabled reports whether the watcher should run. Enabled by
// default; a falsey value ("false"/"0"/"no"/"off", case-insensitive) disables
// it, matching the Go implementation's exact semantics.
inline bool parent_watch_enabled() {
#if defined(_WIN32)
return false;
#else
const char *v = std::getenv(kEnvParentWatch());
if (v == nullptr || v[0] == '\0') {
return true;
}
const std::string s = detail::trim_lower(v, true);
return !(s == "false" || s == "0" || s == "no" || s == "off");
#endif
}
// parent_watch_interval_ms returns the poll interval in milliseconds. Accepts
// Go-style duration strings ("500ms", "2s", "1m") for cross-language parity, or
// a bare number interpreted as seconds. Defaults to
// parent_watch_default_interval_ms().
inline long parent_watch_interval_ms() {
const long def = parent_watch_default_interval_ms();
const char *v = std::getenv(kEnvParentWatchInterval());
if (v == nullptr || v[0] == '\0') {
return def;
}
const std::string s = detail::trim_lower(v, false);
if (s.empty()) {
return def;
}
size_t i = 0;
while (i < s.size() && (std::isdigit((unsigned char)s[i]) || s[i] == '.')) {
i++;
}
if (i == 0) {
return def;
}
double num = 0.0;
try {
num = std::stod(s.substr(0, i));
} catch (...) {
return def;
}
const std::string unit = s.substr(i);
long ms;
if (unit == "ms") {
ms = (long)num;
} else if (unit == "s" || unit.empty()) {
ms = (long)(num * 1000.0);
} else if (unit == "m") {
ms = (long)(num * 60000.0);
} else {
return def; // unrecognized unit
}
return ms > 0 ? ms : def;
}
#if !defined(_WIN32)
// parent_died reports whether this process has been reparented away from the
// parent it had when the watcher started. Reparenting is the standard POSIX
// signal that the original parent (here, the LocalAI process that spawned this
// backend) has exited: the orphan is handed to the nearest sub-reaper or to
// init (PID 1), so getppid() no longer matches the value captured at startup.
inline bool parent_died(pid_t orig_ppid) {
const pid_t ppid = getppid();
return ppid != orig_ppid || ppid == 1;
}
// watch_parent_death polls until parent_died reports the original parent is
// gone, then invokes on_death. It blocks, so run it on its own thread.
inline void watch_parent_death(pid_t orig_ppid, long interval_ms,
const std::function<void()> &on_death) {
for (;;) {
std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms));
if (parent_died(orig_ppid)) {
on_death();
return;
}
}
}
#endif
// start_parent_death_watcher installs the best-effort safety net described in
// the file header on the calling backend process. It is a no-op when disabled,
// on Windows, or when the process is already orphaned at startup
// (getppid() <= 1). This is a backstop alongside — never a replacement for —
// LocalAI's graceful teardown.
inline void start_parent_death_watcher() {
#if !defined(_WIN32)
if (!parent_watch_enabled()) {
return;
}
const pid_t orig_ppid = getppid();
// A parent of 1 (or less) at startup means we were already orphaned (or
// launched directly under init) — there is no original parent to watch for.
if (orig_ppid <= 1) {
return;
}
const long interval_ms = parent_watch_interval_ms();
std::thread([orig_ppid, interval_ms]() {
watch_parent_death(orig_ppid, interval_ms, [orig_ppid]() {
fprintf(stderr,
"backend parent process (pid %d) exited without stopping "
"this backend; self-terminating to avoid orphaning\n",
(int)orig_ppid);
fflush(stderr);
_exit(1);
});
}).detach();
#endif
}
} // namespace llama_grpc
#endif // LLAMA_GRPC_PARENT_WATCH_H

View File

@@ -1,197 +0,0 @@
// Unit tests for the parent-death watcher (parent_watch.h).
//
// Build & run standalone (C++ standard library only, no nlohmann/json needed):
// g++ -std=c++17 -pthread parent_watch_test.cpp -o t && ./t
//
// The core test (TestDetectsReparent) builds a genuine two-level process tree
// (test -> middle -> grandchild), lets the middle process die, and asserts the
// grandchild's watch_parent_death detects the reparenting and self-terminates —
// mirroring the Go test in pkg/grpc/parentwatch_test.go, but with fork(2).
//
// On Windows this file compiles to a no-op success (the watcher is unsupported
// there), matching parent_watch.h's platform gating.
#include <cstdio>
#include <cstdlib>
#include <string>
#include "parent_watch.h"
static int failures = 0;
static void check(bool ok, const std::string &name) {
if (!ok) {
failures++;
fprintf(stderr, "FAIL: %s\n", name.c_str());
} else {
fprintf(stderr, "ok: %s\n", name.c_str());
}
}
// Env-parsing tests are platform-independent and always run.
static void test_env_parsing() {
using namespace llama_grpc;
// Interval: default when unset.
unsetenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL");
check(parent_watch_interval_ms() == 2000, "interval default 2000ms");
setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "500ms", 1);
check(parent_watch_interval_ms() == 500, "interval 500ms");
setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "2s", 1);
check(parent_watch_interval_ms() == 2000, "interval 2s");
setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "1m", 1);
check(parent_watch_interval_ms() == 60000, "interval 1m");
setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "3", 1); // bare number -> seconds
check(parent_watch_interval_ms() == 3000, "interval bare 3 -> 3000ms");
setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "garbage", 1);
check(parent_watch_interval_ms() == 2000, "interval garbage -> default");
unsetenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL");
#if !defined(_WIN32)
// Enabled semantics (POSIX only; always false on Windows).
unsetenv("LOCALAI_BACKEND_PARENT_WATCH");
check(parent_watch_enabled(), "enabled by default");
for (const char *falsey : {"false", "0", "no", "off", "OFF", " False "}) {
setenv("LOCALAI_BACKEND_PARENT_WATCH", falsey, 1);
check(!parent_watch_enabled(), std::string("disabled by '") + falsey + "'");
}
setenv("LOCALAI_BACKEND_PARENT_WATCH", "true", 1);
check(parent_watch_enabled(), "enabled by 'true'");
setenv("LOCALAI_BACKEND_PARENT_WATCH", "1", 1);
check(parent_watch_enabled(), "enabled by '1'");
unsetenv("LOCALAI_BACKEND_PARENT_WATCH");
#endif
}
#if !defined(_WIN32)
#include <atomic>
#include <ctime>
#include <sys/stat.h>
#include <sys/wait.h>
#include <unistd.h>
static bool file_exists(const std::string &p) {
struct stat st;
return ::stat(p.c_str(), &st) == 0;
}
static bool wait_for_file(const std::string &p, int timeout_ms) {
int waited = 0;
while (waited < timeout_ms) {
if (file_exists(p)) {
return true;
}
usleep(20 * 1000);
waited += 20;
}
return false;
}
static void write_file(const std::string &p, const std::string &content) {
FILE *f = fopen(p.c_str(), "w");
if (f) {
fwrite(content.data(), 1, content.size(), f);
fclose(f);
}
}
// Builds test -> middle -> grandchild via fork(2). The grandchild arms the REAL
// watch_parent_death against middle; middle exits, orphaning the grandchild;
// the watcher must detect the reparenting and self-terminate.
static void test_detects_reparent() {
char tmpl[] = "/tmp/parentwatch_test_XXXXXX";
char *dir = mkdtemp(tmpl);
if (dir == nullptr) {
check(false, "mkdtemp");
return;
}
const std::string ready_file = std::string(dir) + "/ready";
const std::string exited_file = std::string(dir) + "/exited";
pid_t middle = fork();
if (middle < 0) {
check(false, "fork middle");
return;
}
if (middle == 0) {
// ---- middle process ----
pid_t grandchild = fork();
if (grandchild < 0) {
_exit(4);
}
if (grandchild == 0) {
// ---- grandchild process ----
pid_t orig_ppid = getppid(); // == middle
std::thread([&]() {
llama_grpc::watch_parent_death(orig_ppid, 50 /*ms*/, [&]() {
write_file(exited_file, "1");
_exit(7);
});
}).detach();
// Safety valve: never linger if something goes wrong.
std::thread([]() {
usleep(30 * 1000 * 1000);
_exit(2);
}).detach();
// Signal readiness only after the watcher captured orig_ppid.
write_file(ready_file, std::to_string(getpid()));
for (;;) {
pause();
}
}
// middle: wait until grandchild is ready, then exit to orphan it.
if (!wait_for_file(ready_file, 10000)) {
_exit(5);
}
_exit(0);
}
// ---- test (top) process ----
int status = 0;
waitpid(middle, &status, 0); // reap middle only; grandchild is orphaned
check(file_exists(ready_file), "grandchild signaled readiness");
bool detected = wait_for_file(exited_file, 10000);
check(detected, "watcher detected parent death and self-terminated");
// Best-effort cleanup: kill the grandchild if it somehow survived.
if (file_exists(ready_file)) {
FILE *f = fopen(ready_file.c_str(), "r");
if (f) {
int pid = 0;
if (fscanf(f, "%d", &pid) == 1 && pid > 1) {
kill(pid, SIGKILL);
}
fclose(f);
}
}
unlink(ready_file.c_str());
unlink(exited_file.c_str());
rmdir(dir);
}
#endif // !_WIN32
int main() {
test_env_parsing();
#if !defined(_WIN32)
test_detects_reparent();
#endif
if (failures == 0) {
fprintf(stderr, "\nAll parent_watch tests passed.\n");
return 0;
}
fprintf(stderr, "\n%d parent_watch test(s) failed.\n", failures);
return 1;
}

View File

@@ -22,10 +22,6 @@ cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
# unit test (compiled only when -DLLAMA_GRPC_BUILD_TESTS=ON).
cp -r message_content.h llama.cpp/tools/grpc-server/
cp -r message_content_test.cpp llama.cpp/tools/grpc-server/
# Parent-death watcher (included by grpc-server.cpp) and its standalone unit
# test (run via backend/cpp/run-unit-tests.sh; also buildable under ctest).
cp -r parent_watch.h llama.cpp/tools/grpc-server/
cp -r parent_watch_test.cpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/

View File

@@ -8,7 +8,7 @@
# Local development: point at a working checkout instead of cloning, e.g.
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
PRIVACY_FILTER_VERSION?=735a6c28607ee82afc3a670383f41b55266a3b9a
PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
PRIVACY_FILTER_SRC?=

View File

@@ -54,7 +54,7 @@ for test_src in "${tests[@]}"; do
name="$(basename "$test_src" .cpp)"
bin="$(mktemp -d)/$name"
echo "==> $test_src"
if ! "$CXX" -std=c++17 -Wall -Wextra -pthread \
if ! "$CXX" -std=c++17 -Wall -Wextra \
-I"$JSON_INC" -I"$(dirname "$test_src")" \
"$test_src" -o "$bin"; then
echo "COMPILE FAILED: $test_src" >&2

View File

@@ -142,12 +142,19 @@ func buildAnthropicRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream boo
if req.MaxTokens <= 0 {
req.MaxTokens = anthropicDefaultMaxTokens
}
// Do not forward temperature/top_p. Newer Anthropic reasoning models reject
// requests that carry temperature ("`temperature` is deprecated for this
// model"), and the OpenAI-compatible clients typically send only the
// server-side DEFAULT sampling values rather than user intent — dropping
// them loses nothing and lets the upstream apply its own defaults.
_ = opts
// Newer Anthropic models 400 when both temperature and top_p are
// set ("`temperature` and `top_p` cannot both be specified for
// this model. Please use only one.") even though their docs only
// "recommend" picking one. The OpenAI-compatible chat UI almost
// always sends both with default values, so prefer temperature
// and drop top_p when both are present.
if t := opts.GetTemperature(); t != 0 {
v := float64(t)
req.Temperature = &v
} else if t := opts.GetTopP(); t != 0 {
v := float64(t)
req.TopP = &v
}
req.Tools = convertOpenAITools(opts.GetTools())
req.ToolChoice = convertOpenAIToolChoice(opts.GetToolChoice())

View File

@@ -3,6 +3,7 @@ package main
import (
"encoding/json"
"io"
"math"
"net/http"
"net/http/httptest"
"strings"
@@ -74,16 +75,15 @@ func TestPredict_Anthropic_BasicMessages(t *testing.T) {
g.Expect(captured.Messages).To(HaveLen(1))
g.Expect(captured.Messages[0].Role).To(Equal("user"))
g.Expect(captured.MaxTokens).To(Equal(int32(32)))
// Newer Anthropic reasoning models reject requests carrying temperature
// ("`temperature` is deprecated for this model"); clients typically send
// only default sampling values, so the translator forwards neither.
g.Expect(captured.Temperature).To(BeNil())
g.Expect(captured.Temperature).NotTo(BeNil())
g.Expect(*captured.Temperature).To(Equal(0.5))
// Anthropic 400s when both temperature and top_p are set; the
// translator must prefer temperature and drop top_p.
g.Expect(captured.TopP).To(BeNil())
g.Expect(captured.Stream).To(BeFalse())
}
// Sampling parameters are not forwarded at all — the upstream applies its
// own defaults (newest models reject explicit temperature/top_p).
// When only top_p is set, it should be forwarded.
func TestPredict_Anthropic_TopPOnly(t *testing.T) {
g := NewWithT(t)
srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
@@ -99,7 +99,11 @@ func TestPredict_Anthropic_TopPOnly(t *testing.T) {
})
g.Expect(err).NotTo(HaveOccurred())
g.Expect(captured.Temperature).To(BeNil())
g.Expect(captured.TopP).To(BeNil())
// PredictOptions.TopP is float32 on the wire; the translator widens
// to float64 so 0.9 round-trips as 0.8999999761581421… — compare
// with a small tolerance rather than exact equality.
g.Expect(captured.TopP).NotTo(BeNil())
g.Expect(math.Abs(*captured.TopP - 0.9)).To(BeNumerically("<=", 1e-6))
}
func TestPredict_Anthropic_DefaultsMaxTokens(t *testing.T) {

View File

@@ -30,7 +30,7 @@ type openAIRequest struct {
Stream bool `json:"stream,omitempty"`
Temperature *float64 `json:"temperature,omitempty"`
TopP *float64 `json:"top_p,omitempty"`
MaxTokens *int32 `json:"max_completion_tokens,omitempty"` // newer OpenAI models reject max_tokens ("use max_completion_tokens instead")
MaxTokens *int32 `json:"max_tokens,omitempty"`
Stop []string `json:"stop,omitempty"`
FrequencyPenalty *float64 `json:"frequency_penalty,omitempty"`
PresencePenalty *float64 `json:"presence_penalty,omitempty"`
@@ -107,10 +107,14 @@ func buildOpenAIRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream bool)
Tools: parseRawJSON(opts.GetTools()),
ToolChoice: parseRawJSON(opts.GetToolChoice()),
}
// Do not forward temperature/top_p. Newer OpenAI reasoning models reject
// temperature as deprecated, and clients typically send only default
// sampling values rather than user intent — let the upstream apply its
// own defaults.
if t := opts.GetTemperature(); t != 0 {
v := float64(t)
req.Temperature = &v
}
if t := opts.GetTopP(); t != 0 {
v := float64(t)
req.TopP = &v
}
if n := opts.GetTokens(); n > 0 {
req.MaxTokens = &n
}

View File

@@ -74,9 +74,8 @@ func TestPredict_OpenAI_BasicChat(t *testing.T) {
g.Expect(captured.Messages).To(HaveLen(2))
g.Expect(captured.Messages[0].Role).To(Equal("system"))
g.Expect(captured.Messages[1].Role).To(Equal("user"))
// Sampling parameters are not forwarded (newest models reject explicit
// temperature); token limit is serialized as max_completion_tokens.
g.Expect(captured.Temperature).To(BeNil())
g.Expect(captured.Temperature).NotTo(BeNil())
g.Expect(*captured.Temperature).To(Equal(0.5))
g.Expect(captured.MaxTokens).NotTo(BeNil())
g.Expect(*captured.MaxTokens).To(Equal(int32(32)))
g.Expect(captured.Stream).To(BeFalse())

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# CrispASR version (release tag)
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
CRISPASR_VERSION?=9a26976a8c8cf5af0afcdd04463cf8ba91e96a54
CRISPASR_VERSION?=8f1218141b792b8868861c1af17ba1e361b05dc0
SO_TARGET?=libgocrispasr.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

View File

@@ -1,18 +0,0 @@
# Fetched upstream sources
sources/
# CMake build directories
build*/
# build artifacts staged in-tree by the Makefile (cp from sources/) or
# symlinked for local dev; the real sources live in face-detect.cpp upstream.
*.so
*.so.*
facedetect_capi.h
compile_commands.json
# Compiled backend binary
face-detect-grpc
# Packaging output
package/

View File

@@ -1,110 +0,0 @@
# face-detect backend Makefile.
#
# Upstream pin lives below as FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
# can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
# convention).
#
# Local dev shortcut: if you already have an out-of-tree face-detect.cpp build,
# symlink the .so + header into this directory and skip the clone/cmake steps:
#
# ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so .
# ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h .
# go build -o face-detect-grpc .
#
# The default target below does the proper clone-at-pin + cmake build so CI does
# not need a side-checkout.
FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
BUILD_TYPE?=
NATIVE?=false
# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
# only system libs (libstdc++/libgomp/libc) the runtime image already provides.
# The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++
# side, so only the facedetect_capi_* surface is exported.
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
# face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
# does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
# -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, SCRFD 2.3x
# vs torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
ifeq ($(CUDA_MAJOR_VERSION),13)
ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
endif
endif
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
else ifeq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
endif
.PHONY: face-detect-grpc package build clean purge test all
all: face-detect-grpc
# Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
# as the target so make only re-clones when missing. After a FACEDETECT_VERSION
# bump, run 'make purge && make' to refetch.
sources/face-detect.cpp:
mkdir -p sources/face-detect.cpp
cd sources/face-detect.cpp && \
git init -q && \
git remote add origin $(FACEDETECT_REPO) && \
git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
git checkout FETCH_HEAD && \
git submodule update --init --recursive --depth 1 --single-branch
# Build the shared lib + header out-of-tree, then stage them next to the Go
# sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
# them up.
libfacedetect.so: sources/face-detect.cpp
cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
package: face-detect-grpc
bash package.sh
build: package
# Test target. The embed/detect/verify/analyze smoke specs are gated on
# FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
# heavy specs auto-skip and only the pure-Go parsing specs run.
test:
LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
clean: purge
rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
purge:
rm -rf sources/face-detect.cpp

View File

@@ -1,431 +0,0 @@
package main
import (
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/xlog"
)
// purego-bound entry points from libfacedetect.so. Names match
// facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
// is enough to spot drift.
//
// The opaque ctx and the malloc'd char*/float* return values are declared as
// uintptr so we get the raw pointer back and can release it via the matching
// capi free function. purego's native string/[]float32 returns would copy and
// forget the original pointer, leaking the C-owned buffer on every call.
var (
CppAbiVersion func() int32
CppLoad func(ggufPath string) uintptr
CppFree func(ctx uintptr)
CppLastError func(ctx uintptr) string
CppFreeString func(s uintptr)
CppFreeVec func(v uintptr)
CppEmbedPath func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
CppEmbedRGB func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
CppDetectJSON func(ctx uintptr, imagePath string) uintptr
CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
)
// FaceDetect implements the face-recognition (biometric) subset of the Backend
// gRPC service over libfacedetect.so. The C side keeps a single loaded model
// pack plus a per-ctx last-error buffer and is not reentrant, so
// base.SingleThread serializes every call.
type FaceDetect struct {
base.SingleThread
opts loadOptions
ctxPtr uintptr
}
func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
model := opts.ModelFile
if model == "" {
model = opts.ModelPath
}
if !filepath.IsAbs(model) && opts.ModelPath != "" {
model = filepath.Join(opts.ModelPath, model)
}
if model == "" {
return errors.New("face-detect: ModelFile is required")
}
f.opts = parseOptions(opts.Options)
if f.opts.modelName == "" {
f.opts.modelName = filepath.Base(model)
}
// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
// one backend process per model and serves requests concurrently, so the
// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
// FACEDETECT_THREADS is read by the engine at backend construction, so it
// must be set before the capi load. A non-positive Threads means "unset":
// leave the env alone so the engine keeps its sane default.
threads := opts.Threads
if threads > 0 {
if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
}
xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
}
xlog.Info("face-detect: loading model", "model", model,
"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
ctx := CppLoad(model)
if ctx == 0 {
// The last-error buffer lives on the ctx that was never returned, so
// surface the path the operator tried to load instead.
return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
}
f.ctxPtr = ctx
return nil
}
// Embeddings returns the L2-normalized ArcFace embedding of the primary face in
// the supplied image. Mirroring the Python face backend, the image is read from
// Images[0] as a base64 payload; materializeImage decodes it to a temp file so
// the path-based C-API can run its own decode (cv2.imread parity). The gRPC
// server wraps the returned slice in an EmbeddingResult.
func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
if f.ctxPtr == 0 {
return nil, errors.New("face-detect: model not loaded")
}
if len(req.Images) == 0 || req.Images[0] == "" {
return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
}
path, cleanup, err := materializeImage(req.Images[0])
if err != nil {
return nil, err
}
defer cleanup()
return f.embedPath(path)
}
func (f *FaceDetect) embedPath(path string) ([]float32, error) {
var vec uintptr
var dim int32
rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
if rc != 0 || vec == 0 || dim <= 0 {
return nil, f.lastErr("embed", path)
}
defer CppFreeVec(vec)
// Copy out of the C-owned malloc'd buffer before freeing it. The
// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
// nor moves this buffer and we copy immediately.
src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
out := make([]float32, int(dim))
copy(out, src)
return out, nil
}
// Detect runs SCRFD over the image and returns one Detection per face. The
// C-API emits a box as [x1,y1,x2,y2] in pixels; the proto carries x/y plus
// width/height, so the corners are converted. The 5 facial landmarks the engine
// also returns are dropped: the Detection message has no field for them.
func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
if f.ctxPtr == 0 {
return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
}
if req.Src == "" {
return pb.DetectResponse{}, errors.New("face-detect: src image is required")
}
path, cleanup, err := materializeImage(req.Src)
if err != nil {
return pb.DetectResponse{}, err
}
defer cleanup()
faces, err := f.detectFaces(path)
if err != nil {
return pb.DetectResponse{}, err
}
dets := make([]*pb.Detection, 0, len(faces))
for _, fc := range faces {
if req.Threshold > 0 && fc.Score < req.Threshold {
continue
}
x, y, w, h := fc.xywh()
dets = append(dets, &pb.Detection{
X: x,
Y: y,
Width: w,
Height: h,
Confidence: fc.Score,
ClassName: "face",
})
}
return pb.DetectResponse{Detections: dets}, nil
}
// FaceVerify embeds the primary face in each image and reports whether they are
// the same identity by cosine distance against a threshold. A request threshold
// <= 0 falls back to the model-configured default (verify_threshold option,
// 0.35 if unset). When anti_spoofing is set, the C-API applies a MiniFASNet
// veto internally (verified forced false on a spoof); the per-image liveness
// scores are not exposed by the verify entry point, so img*_is_real /
// img*_antispoof_score stay at their zero values.
func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
if f.ctxPtr == 0 {
return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
}
if req.Img1 == "" || req.Img2 == "" {
return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
}
path1, cleanup1, err := materializeImage(req.Img1)
if err != nil {
return pb.FaceVerifyResponse{}, err
}
defer cleanup1()
path2, cleanup2, err := materializeImage(req.Img2)
if err != nil {
return pb.FaceVerifyResponse{}, err
}
defer cleanup2()
threshold := req.Threshold
if threshold <= 0 {
threshold = f.opts.verifyThreshold
}
antiSpoof := int32(0)
if req.AntiSpoofing {
antiSpoof = 1
}
started := time.Now()
var distance float32
var verified int32
rc := CppVerifyPaths(f.ctxPtr, path1, path2, threshold, antiSpoof,
unsafe.Pointer(&distance), unsafe.Pointer(&verified))
if rc != 0 {
return pb.FaceVerifyResponse{}, f.lastErr("verify", req.Img1[:min(8, len(req.Img1))]+"...")
}
elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
// matching the Python face backend's reporting.
confidence := float32(0)
if threshold > 0 {
confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
}
return pb.FaceVerifyResponse{
Verified: verified != 0,
Distance: distance,
Threshold: threshold,
Confidence: confidence,
Model: f.opts.modelName,
Img1Area: f.bestArea(path1),
Img2Area: f.bestArea(path2),
ProcessingTimeMs: elapsedMs,
}, nil
}
// FaceAnalyze runs the genderage head on every detected face. The C-API returns
// "M"/"F" gender labels and a rounded age; the labels are normalized to the
// "Man"/"Woman" values the proto documents.
func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
if f.ctxPtr == 0 {
return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
}
if req.Img == "" {
return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
}
path, cleanup, err := materializeImage(req.Img)
if err != nil {
return pb.FaceAnalyzeResponse{}, err
}
defer cleanup()
ptr := CppAnalyzeJSON(f.ctxPtr, path)
if ptr == 0 {
return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", path)
}
defer CppFreeString(ptr)
faces, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
if err != nil {
return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", err)
}
return pb.FaceAnalyzeResponse{Faces: faces}, nil
}
// faceBox is one entry of the detect/analyze JSON documents the engine emits.
type faceBox struct {
Score float32 `json:"score"`
Box []float32 `json:"box"`
Age float32 `json:"age"`
Gender string `json:"gender"`
}
// xywh converts the engine's [x1,y1,x2,y2] box into the x/y/width/height the
// proto carries. A short or missing box yields zeros.
func (b faceBox) xywh() (x, y, w, h float32) {
if len(b.Box) < 4 {
return 0, 0, 0, 0
}
return b.Box[0], b.Box[1], b.Box[2] - b.Box[0], b.Box[3] - b.Box[1]
}
type facesJSON struct {
Faces []faceBox `json:"faces"`
}
func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
ptr := CppDetectJSON(f.ctxPtr, path)
if ptr == 0 {
return nil, f.lastErr("detect", path)
}
defer CppFreeString(ptr)
var doc facesJSON
if err := json.Unmarshal([]byte(goStringFromCPtr(ptr)), &doc); err != nil {
return nil, fmt.Errorf("face-detect: detect JSON: %w", err)
}
return doc.Faces, nil
}
// bestArea returns the FacialArea of the highest-scoring face in an image, or an
// empty area when detection fails or finds nothing. Best-effort: verify already
// succeeded, so a missing region must not turn a valid match into an error.
func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
faces, err := f.detectFaces(path)
if err != nil || len(faces) == 0 {
return &pb.FacialArea{}
}
best := faces[0]
for _, fc := range faces[1:] {
if fc.Score > best.Score {
best = fc
}
}
x, y, w, h := best.xywh()
return &pb.FacialArea{X: x, Y: y, W: w, H: h}
}
// parseAnalyzeJSON maps the engine's analyze document onto FaceAnalysis entries.
// The engine reports gender as "M"/"F"; both the dominant label and the score
// map are filled with the "Man"/"Woman" form the proto documents.
func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
var parsed facesJSON
if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
return nil, err
}
out := make([]*pb.FaceAnalysis, 0, len(parsed.Faces))
for _, fc := range parsed.Faces {
x, y, w, h := fc.xywh()
fa := &pb.FaceAnalysis{
Region: &pb.FacialArea{X: x, Y: y, W: w, H: h},
FaceConfidence: fc.Score,
Age: fc.Age,
}
if label := normalizeGender(fc.Gender); label != "" {
fa.DominantGender = label
fa.Gender = map[string]float32{label: 1.0}
}
out = append(out, fa)
}
return out, nil
}
// normalizeGender maps the engine's "M"/"F" code to the "Man"/"Woman" labels the
// proto documents. Unknown codes pass through unchanged.
func normalizeGender(g string) string {
switch strings.ToUpper(strings.TrimSpace(g)) {
case "M":
return "Man"
case "F":
return "Woman"
case "":
return ""
default:
return g
}
}
// materializeImage decodes a base64 image payload into a temp file and returns
// its path plus a cleanup func. As a convenience for callers that already pass a
// filesystem path (e.g. a test fixture), an existing path is used as-is with a
// no-op cleanup. data: URI prefixes are stripped before decoding.
func materializeImage(src string) (path string, cleanup func(), err error) {
noop := func() {}
if src == "" {
return "", noop, errors.New("face-detect: empty image input")
}
if _, statErr := os.Stat(src); statErr == nil {
return src, noop, nil
}
payload := src
if i := strings.Index(payload, ","); strings.HasPrefix(payload, "data:") && i >= 0 {
payload = payload[i+1:]
}
data, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(payload))
if decErr != nil || len(data) == 0 {
return "", noop, errors.New("face-detect: image is neither an existing path nor valid base64")
}
tmp, createErr := os.CreateTemp("", "face-detect-*.img")
if createErr != nil {
return "", noop, fmt.Errorf("face-detect: create temp image: %w", createErr)
}
cleanup = func() { _ = os.Remove(tmp.Name()) }
if _, wErr := tmp.Write(data); wErr != nil {
_ = tmp.Close()
cleanup()
return "", noop, fmt.Errorf("face-detect: write temp image: %w", wErr)
}
if cErr := tmp.Close(); cErr != nil {
cleanup()
return "", noop, fmt.Errorf("face-detect: close temp image: %w", cErr)
}
return tmp.Name(), cleanup, nil
}
// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
func (f *FaceDetect) lastErr(op, subject string) error {
msg := strings.TrimSpace(CppLastError(f.ctxPtr))
if msg == "" {
msg = "no error detail"
}
return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, msg)
}
// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
//
// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
// moves the buffer and we dereference it immediately to copy the bytes out.
func goStringFromCPtr(cptr uintptr) string {
if cptr == 0 {
return ""
}
p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
n := 0
for *(*byte)(unsafe.Add(p, n)) != 0 {
n++
}
return string(unsafe.Slice((*byte)(p), n))
}

View File

@@ -1,230 +0,0 @@
package main
import (
"encoding/base64"
"os"
"sync"
"testing"
"github.com/ebitengine/purego"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
func TestFaceDetect(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "face-detect Backend Suite")
}
var (
libLoadOnce sync.Once
libLoadErr error
)
// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
// bridge without spinning up the gRPC server. Records the error (the smoke
// specs skip themselves) when libfacedetect.so is not loadable from cwd
// (LD_LIBRARY_PATH or a symlink in ./).
func ensureLibLoaded() error {
libLoadOnce.Do(func() {
libName := os.Getenv("FACEDETECT_LIBRARY")
if libName == "" {
libName = "libfacedetect.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
libLoadErr = err
return
}
purego.RegisterLibFunc(&CppAbiVersion, lib, "facedetect_capi_abi_version")
purego.RegisterLibFunc(&CppLoad, lib, "facedetect_capi_load")
purego.RegisterLibFunc(&CppFree, lib, "facedetect_capi_free")
purego.RegisterLibFunc(&CppLastError, lib, "facedetect_capi_last_error")
purego.RegisterLibFunc(&CppFreeString, lib, "facedetect_capi_free_string")
purego.RegisterLibFunc(&CppFreeVec, lib, "facedetect_capi_free_vec")
purego.RegisterLibFunc(&CppEmbedPath, lib, "facedetect_capi_embed_path")
purego.RegisterLibFunc(&CppEmbedRGB, lib, "facedetect_capi_embed_rgb")
purego.RegisterLibFunc(&CppDetectJSON, lib, "facedetect_capi_detect_path_json")
purego.RegisterLibFunc(&CppVerifyPaths, lib, "facedetect_capi_verify_paths")
purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "facedetect_capi_analyze_path_json")
})
return libLoadErr
}
var _ = Describe("parseOptions", func() {
It("defaults verify_threshold to 0.35", func() {
o := parseOptions(nil)
Expect(o.verifyThreshold).To(Equal(float32(0.35)))
Expect(o.modelName).To(Equal(""))
})
It("parses verify_threshold, threshold alias and model_name", func() {
o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
Expect(o.verifyThreshold).To(Equal(float32(0.4)))
Expect(o.modelName).To(Equal("buffalo_l"))
o2 := parseOptions([]string{"threshold:0.3"})
Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
})
It("ignores non-positive thresholds and keeps the default", func() {
o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
Expect(o.verifyThreshold).To(Equal(float32(0.35)))
})
})
var _ = Describe("normalizeGender", func() {
It("maps M/F codes to Man/Woman", func() {
Expect(normalizeGender("M")).To(Equal("Man"))
Expect(normalizeGender("f")).To(Equal("Woman"))
Expect(normalizeGender(" m ")).To(Equal("Man"))
})
It("passes empty and unknown codes through", func() {
Expect(normalizeGender("")).To(Equal(""))
Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
})
})
var _ = Describe("faceBox.xywh", func() {
It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
b := faceBox{Box: []float32{10, 20, 50, 80}}
x, y, w, h := b.xywh()
Expect(x).To(Equal(float32(10)))
Expect(y).To(Equal(float32(20)))
Expect(w).To(Equal(float32(40)))
Expect(h).To(Equal(float32(60)))
})
It("returns zeros for a short box", func() {
x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
})
})
var _ = Describe("parseAnalyzeJSON", func() {
It("maps region, age and gender for each face", func() {
doc := `{"faces":[
{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
faces, err := parseAnalyzeJSON(doc)
Expect(err).ToNot(HaveOccurred())
Expect(faces).To(HaveLen(2))
Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
Expect(faces[0].DominantGender).To(Equal("Man"))
Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
Expect(faces[0].Region.W).To(Equal(float32(40)))
Expect(faces[0].Region.H).To(Equal(float32(60)))
Expect(faces[1].DominantGender).To(Equal("Woman"))
})
It("tolerates a missing gender field", func() {
faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
Expect(err).ToNot(HaveOccurred())
Expect(faces).To(HaveLen(1))
Expect(faces[0].DominantGender).To(Equal(""))
Expect(faces[0].Gender).To(BeEmpty())
})
It("returns no faces for an empty document", func() {
faces, err := parseAnalyzeJSON(`{"faces":[]}`)
Expect(err).ToNot(HaveOccurred())
Expect(faces).To(BeEmpty())
})
It("returns an error on malformed JSON", func() {
_, err := parseAnalyzeJSON(`{not-json`)
Expect(err).To(HaveOccurred())
})
})
var _ = Describe("materializeImage", func() {
It("decodes a base64 payload to a temp file", func() {
payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
path, cleanup, err := materializeImage(payload)
Expect(err).ToNot(HaveOccurred())
defer cleanup()
data, rerr := os.ReadFile(path)
Expect(rerr).ToNot(HaveOccurred())
Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
})
It("strips a data: URI prefix before decoding", func() {
payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
path, cleanup, err := materializeImage(payload)
Expect(err).ToNot(HaveOccurred())
defer cleanup()
data, rerr := os.ReadFile(path)
Expect(rerr).ToNot(HaveOccurred())
Expect(data).To(Equal([]byte("hello")))
})
It("uses an existing path as-is", func() {
tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
Expect(err).ToNot(HaveOccurred())
defer func() { _ = os.Remove(tmp.Name()) }()
Expect(tmp.Close()).To(Succeed())
path, cleanup, err := materializeImage(tmp.Name())
Expect(err).ToNot(HaveOccurred())
defer cleanup()
Expect(path).To(Equal(tmp.Name()))
})
It("errors on input that is neither a path nor base64", func() {
_, _, err := materializeImage("not base64!!!")
Expect(err).To(HaveOccurred())
})
})
// The specs below exercise the real C-API end to end. They run only when both a
// model GGUF and a test image are provided, and skip cleanly otherwise so the
// suite stays green without large assets.
var _ = Describe("FaceDetect end-to-end", Ordered, func() {
var (
f *FaceDetect
modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL")
imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE")
)
BeforeAll(func() {
if modelPath == "" || imagePath == "" {
Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs")
}
if err := ensureLibLoaded(); err != nil {
Skip("libfacedetect.so not loadable: " + err.Error())
}
f = &FaceDetect{}
Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
})
It("embeds the primary face in an image", func() {
emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}})
Expect(err).ToNot(HaveOccurred())
Expect(emb).ToNot(BeEmpty())
})
It("detects at least one face", func() {
resp, err := f.Detect(&pb.DetectOptions{Src: imagePath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Detections).ToNot(BeEmpty())
Expect(resp.Detections[0].ClassName).To(Equal("face"))
})
It("verifies an image against itself as the same identity", func() {
resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Verified).To(BeTrue())
Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
})
It("analyzes age/gender for each face", func() {
resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Faces).ToNot(BeEmpty())
})
})

View File

@@ -1,65 +0,0 @@
package main
// Started internally by LocalAI - one gRPC server per loaded model.
//
// Loads libfacedetect.so via purego and registers the flat C-API entry points
// declared in facedetect_capi.h. The library name can be overridden with
// FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY
// convention in the sibling backends); the default looks for the .so next to
// this binary (resolved via LD_LIBRARY_PATH by run.sh).
import (
"flag"
"fmt"
"os"
"github.com/ebitengine/purego"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
type LibFuncs struct {
FuncPtr any
Name string
}
func main() {
libName := os.Getenv("FACEDETECT_LIBRARY")
if libName == "" {
libName = "libfacedetect.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
panic(fmt.Errorf("face-detect: dlopen %q: %w", libName, err))
}
// Bound 1:1 to facedetect_capi.h. char*/float* returns are registered as
// uintptr so the raw pointer can be freed via the matching capi free fn.
libFuncs := []LibFuncs{
{&CppAbiVersion, "facedetect_capi_abi_version"},
{&CppLoad, "facedetect_capi_load"},
{&CppFree, "facedetect_capi_free"},
{&CppLastError, "facedetect_capi_last_error"},
{&CppFreeString, "facedetect_capi_free_string"},
{&CppFreeVec, "facedetect_capi_free_vec"},
{&CppEmbedPath, "facedetect_capi_embed_path"},
{&CppEmbedRGB, "facedetect_capi_embed_rgb"},
{&CppDetectJSON, "facedetect_capi_detect_path_json"},
{&CppVerifyPaths, "facedetect_capi_verify_paths"},
{&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"},
}
for _, lf := range libFuncs {
purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
}
fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion())
flag.Parse()
if err := grpc.StartServer(*addr, &FaceDetect{}); err != nil {
panic(err)
}
}

View File

@@ -1,47 +0,0 @@
package main
import (
"strconv"
"strings"
)
// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
// not set one. Matches the insightface buffalo_l ArcFace R50 default the Python
// face backend ships with so the two implementations agree on verdicts out of
// the box.
const defaultVerifyThreshold float32 = 0.35
// loadOptions holds the parsed model-level options for face-detect.
type loadOptions struct {
verifyThreshold float32
modelName string
}
func splitOption(o string) (key, value string, ok bool) {
i := strings.Index(o, ":")
if i < 0 {
return "", "", false
}
return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
}
// parseOptions reads the backend "key:value" option slice. Unknown keys are
// ignored. Defaults: verify_threshold 0.35, model_name derived from the file.
func parseOptions(opts []string) loadOptions {
o := loadOptions{verifyThreshold: defaultVerifyThreshold}
for _, oo := range opts {
key, value, ok := splitOption(oo)
if !ok {
continue
}
switch key {
case "verify_threshold", "threshold":
if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
o.verifyThreshold = float32(f)
}
case "model_name":
o.modelName = value
}
}
return o
}

View File

@@ -1,68 +0,0 @@
#!/bin/bash
#
# Bundle the face-detect-grpc binary, libfacedetect.so, the core runtime libs
# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
# so the package is self-contained. Mirrors backend/go/voice-detect/package.sh;
# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
# is used instead of the host's.
set -e
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."
mkdir -p "$CURDIR/package/lib"
cp -avf "$CURDIR/face-detect-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
# libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via
# LD_LIBRARY_PATH, which run.sh points at lib/.
cp -avf "$CURDIR"/libfacedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2
exit 1
}
# Detect architecture and copy the core runtime libs libfacedetect.so links
# against, plus the matching dynamic loader as lib/ld.so.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
echo "Detected x86_64 architecture, copying x86_64 libraries..."
cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
echo "Detected ARM64 architecture, copying ARM64 libraries..."
cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
elif [ "$(uname -s)" = "Darwin" ]; then
echo "Detected Darwin"
else
echo "Error: Could not detect architecture"
exit 1
fi
# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
# BUILD_TYPE so the backend can reach the GPU without the runtime base image
# shipping those drivers.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
package_gpu_libs
fi
echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"

View File

@@ -1,16 +0,0 @@
#!/bin/bash
set -e
CURDIR=$(dirname "$(realpath "$0")")
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
# If a self-contained ld.so was packaged, route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the voice-detect /
# whisper / parakeet backends' runtime layout).
if [ -f "$CURDIR/lib/ld.so" ]; then
echo "Using lib/ld.so"
exec "$CURDIR/lib/ld.so" "$CURDIR/face-detect-grpc" "$@"
fi
exec "$CURDIR/face-detect-grpc" "$@"

View File

@@ -1,15 +0,0 @@
#!/bin/bash
set -e
CURDIR=$(dirname "$(realpath "$0")")
cd "$CURDIR"
echo "Running face-detect backend tests..."
# The pure-Go parsing specs always run. The embed/detect/verify/analyze smoke
# specs run only when a model + image are provided via
# FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE; otherwise they
# auto-skip.
LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
echo "face-detect tests completed."

View File

@@ -1,6 +1,6 @@
# parakeet-cpp backend Makefile.
#
# Upstream pin lives below as PARAKEET_VERSION?=e8acc6172a94e20a952cf1843decace5d771a94b
# Upstream pin lives below as PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9
# (.github/bump_deps.sh) can find and update it - matches the
# whisper.cpp / ds4 / vibevoice-cpp convention.
#
@@ -15,7 +15,7 @@
# That's what the L0 smoke test uses. The default target below does the
# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
PARAKEET_VERSION?=e8acc6172a94e20a952cf1843decace5d771a94b
PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9
PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
GOCMD?=go

View File

@@ -1,81 +0,0 @@
package main
// utteranceBoundary is the single definition of a small state machine that was
// previously open-coded three times — as a bare `finalEou` bool with an ad-hoc
// toggle — in the live feed (live.go), the file-stream text path, and the
// file-stream JSON path (goparakeetcpp.go).
//
// It answers one running question: does the decode currently rest on an
// end-of-utterance boundary? That is the value a closing FinalResult reports as
// .Eou and the realtime turn detector treats as a commit point.
//
// parakeet auto-resets its decoder after every <EOU>/<EOB>, so one streaming
// session is a sequence of utterances and this is a LATCH, not a monotonic
// flag: it closes on an <EOU> and reopens as soon as the next utterance starts.
// (Contrast the realtime API's per-turn `eouSeen`, which only ever goes
// false->true because each turn gets a fresh stream. Here the stream outlives
// the turn, so the boundary status must be able to reopen.)
//
// The only transitions, over the events one streamFeedResult carries — an
// <EOU>, an <EOB> (backchannel), or plain speech output (text and/or words):
//
// <EOU>
// open ───────────► closed
// ▲ ▲ │ │ │
// │ └─┘ <EOB>|speech │ │ <EOU>
// │ (stay open) │ └─┘ (stay closed)
// └──────────────────┘
// <EOB>|speech
//
// open = NOT on an utterance boundary: mid-utterance, the last boundary was
// a backchannel <EOB>, or the stream just began (the initial state).
// closed = the last meaningful event was an <EOU> with no later speech: a real
// turn boundary.
//
// A feed that carries nothing (no eou/eob/text/words — e.g. a finalize flush
// that produced no tail) is a no-op and leaves the state unchanged, matching
// the legacy "leave finalEou as it was" behaviour.
//
// The state carries no data, so it is modelled as a two-valued type (a named
// bool) rather than an int enum: every inhabitant is legal, so illegal states
// are unrepresentable — the payload-free analog of the sealed sum types the
// realtime machines use (those need interfaces because their states carry data,
// e.g. Active{ID}, where "Active with no ID" is the illegal combination a scalar
// cannot even express).
type utteranceBoundary bool
const (
// boundaryOpen is the zero value (false), so a fresh decode starts open —
// exactly the legacy `var finalEou bool` (false) initial condition.
boundaryOpen utteranceBoundary = false
boundaryClosed utteranceBoundary = true
)
// observe folds one decode increment into the latch and returns the new state.
//
// <EOU> takes priority when a single feed carries both an <EOU> and speech
// (e.g. {"text":"hello","eou":1}): the utterance both produced that text AND
// ended, so the decode rests on the boundary. This matches the legacy
// eou-checked-first ordering at every call site.
func (b utteranceBoundary) observe(r streamFeedResult) utteranceBoundary {
switch {
case r.Eou:
return boundaryClosed
case r.Eob || r.Delta != "" || len(r.Words) > 0:
return boundaryOpen
default:
return b
}
}
// ended reports whether the decode currently rests on an end-of-utterance
// boundary (a real <EOU>, not a backchannel <EOB>). This is what a closing
// FinalResult carries as .Eou.
func (b utteranceBoundary) ended() bool { return b == boundaryClosed }
func (b utteranceBoundary) String() string {
if b == boundaryClosed {
return "closed"
}
return "open"
}

View File

@@ -1,92 +0,0 @@
package main
import (
"math/rand/v2"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
var _ = Describe("utteranceBoundary (decode end-of-utterance latch)", func() {
It("starts open: a fresh decode is not on a boundary", func() {
var b utteranceBoundary
Expect(b).To(Equal(boundaryOpen))
Expect(b.ended()).To(BeFalse())
})
DescribeTable("single feed transition from the open state",
func(r streamFeedResult, wantEnded bool) {
Expect(boundaryOpen.observe(r).ended()).To(Equal(wantEnded))
},
Entry("<EOU> closes it", streamFeedResult{Eou: true}, true),
Entry("<EOU> with text closes it (eou wins)", streamFeedResult{Delta: "hi", Eou: true}, true),
Entry("<EOB> stays open (backchannel is not a turn boundary)", streamFeedResult{Eob: true}, false),
Entry("plain text stays open", streamFeedResult{Delta: "hello"}, false),
Entry("words-only stays open", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
Entry("empty feed is a no-op (stays open)", streamFeedResult{}, false),
)
DescribeTable("single feed transition from the closed state",
func(r streamFeedResult, wantEnded bool) {
Expect(boundaryClosed.observe(r).ended()).To(Equal(wantEnded))
},
Entry("another <EOU> stays closed", streamFeedResult{Eou: true}, true),
Entry("trailing speech reopens it", streamFeedResult{Delta: "and more"}, false),
Entry("words reopen it", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
Entry("a backchannel <EOB> reopens it", streamFeedResult{Eob: true}, false),
Entry("empty feed is a no-op (stays closed)", streamFeedResult{}, true),
)
It("is a latch: <EOU> then trailing speech reopens, then <EOU> closes again", func() {
b := boundaryOpen
b = b.observe(streamFeedResult{Delta: "turn one", Eou: true})
Expect(b.ended()).To(BeTrue())
b = b.observe(streamFeedResult{Delta: " and more"})
Expect(b.ended()).To(BeFalse(), "trailing speech without an EOU is an open utterance")
b = b.observe(streamFeedResult{Eou: true})
Expect(b.ended()).To(BeTrue())
})
It("treats a backchannel before a real EOU correctly", func() {
b := boundaryOpen
b = b.observe(streamFeedResult{Delta: "uh huh", Eob: true})
Expect(b.ended()).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
b = b.observe(streamFeedResult{Delta: "done", Eou: true})
Expect(b.ended()).To(BeTrue())
})
It("matches the reference fold over seeded random feed sequences", func() {
// The invariant: after any sequence of feeds, ended() is true iff the
// last feed that carried ANY event was an <EOU>. <EOU> takes priority
// when a feed carries both an EOU and speech; empty feeds are ignored.
for seed := uint64(1); seed <= 200; seed++ {
rng := rand.New(rand.NewPCG(seed, seed*2654435761))
b := boundaryOpen
lastWasEou := false // reference: did the last meaningful feed end on EOU?
steps := rng.IntN(30)
for i := 0; i < steps; i++ {
var r streamFeedResult
switch rng.IntN(5) {
case 0:
r = streamFeedResult{Eou: true}
case 1:
r = streamFeedResult{Eob: true}
case 2:
r = streamFeedResult{Delta: "w"}
case 3:
r = streamFeedResult{Delta: "w", Eou: true} // eou + speech, eou wins
case 4:
r = streamFeedResult{} // empty: no-op
}
b = b.observe(r)
if r.Eou {
lastWasEou = true
} else if r.Eob || r.Delta != "" || len(r.Words) > 0 {
lastWasEou = false
}
}
Expect(b.ended()).To(Equal(lastWasEou),
"seed %d: latch disagreed with the reference fold", seed)
}
})
})

View File

@@ -1,82 +0,0 @@
package main
import (
"context"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
// streamFeedResult is one decode increment from a cache-aware streaming session:
// the newly-finalized text plus the model's own per-feed boundary tokens
// (<EOU>/<EOB>) and word timings. It is the single event type both the live
// (bidi) and file (server-stream) paths fold over, hiding the ABI v4 JSON vs
// older text-only entry-point split behind one shape.
type streamFeedResult struct {
Delta string
Eou bool
Eob bool
Words []transcriptWord
}
// feedChunk feeds one PCM chunk to the streaming session (or finalizes it, when
// finalize is true) and returns the unified decode increment. It prefers the
// ABI v4 JSON entry points (which also carry per-word timestamps) and falls
// back to the older text-only entry points against an older libparakeet.so.
//
// This is the one place the JSON-vs-text choice is made; every consumer works
// in terms of streamFeedResult.
func (p *ParakeetCpp) feedChunk(stream uintptr, pcm []float32, finalize bool) (streamFeedResult, error) {
if CppStreamFeedJSON != nil {
doc, err := p.streamFeedDoc(stream, pcm, finalize)
if err != nil {
return streamFeedResult{}, err
}
return streamFeedResult{Delta: doc.Text, Eou: doc.Eou != 0, Eob: doc.Eob != 0, Words: doc.Words}, nil
}
delta, eou, eob, err := p.streamFeedText(stream, pcm, finalize)
if err != nil {
return streamFeedResult{}, err
}
return streamFeedResult{Delta: delta, Eou: eou, Eob: eob}, nil
}
// feedSlices feeds pcm through the session in streamChunkSamples slices,
// invoking onFeed for each decode increment. It does NOT finalize: callers
// decide when the send side is done. The file path finalizes after the whole
// file; the live path finalizes only when its request channel closes, never
// between audio messages. Slicing keeps each per-call engineMu hold short so
// concurrent unary transcription interleaves fairly (the C session buffers
// internally).
//
// If ctx is non-nil it is checked before each slice so a cancelled file
// transcription stops promptly; the live path passes nil (it is bounded by its
// request channel instead of a ctx).
func (p *ParakeetCpp) feedSlices(ctx context.Context, stream uintptr, pcm []float32, onFeed func(streamFeedResult) error) error {
for off := 0; off < len(pcm); off += streamChunkSamples {
if ctx != nil {
if err := ctx.Err(); err != nil {
return status.Error(codes.Canceled, "transcription cancelled")
}
}
end := min(off+streamChunkSamples, len(pcm))
res, err := p.feedChunk(stream, pcm[off:end], false)
if err != nil {
return err
}
if err := onFeed(res); err != nil {
return err
}
}
return nil
}
// flushTail finalizes the session once and folds the flushed tail (the last
// ~2 encoder frames of text, which only appear on finalize) through onFeed.
func (p *ParakeetCpp) flushTail(stream uintptr, onFeed func(streamFeedResult) error) error {
res, err := p.feedChunk(stream, nil, true)
if err != nil {
return err
}
return onFeed(res)
}

View File

@@ -103,13 +103,12 @@ type transcriptJSON struct {
// {"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
// "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
//
// "text" is the newly-finalized text since the last call. Under ABI v5 "eou"
// is 1 iff an <EOU> fired this feed (the user yielded the turn) and "eob" 1
// iff an <EOB> fired (a backchannel like "uh-huh" ended — NOT a turn
// boundary). A v4 library has no "eob" field and its "eou" conflates both
// tokens: Eob stays 0 and Eou keeps the old any-event meaning. "words" are
// the words finalized this call with absolute (stream-relative) start/end
// seconds.
// "text" is the newly-finalized text since the last call; "eou" is 1 when an
// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB>
// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so
// we read both and treat either as an utterance boundary for segmentation.
// "words" are the words finalized this call with absolute (stream-relative)
// start/end seconds.
type streamFeedJSON struct {
Text string `json:"text"`
Eou int `json:"eou"`
@@ -365,7 +364,7 @@ var segmentSeparators = []rune{'.', '?', '!'}
// the caller requested word granularity; token ids populate each segment's
// Tokens by time-window membership. Shared by the batched and direct paths.
func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
text, eou := stripEouMarker(strings.TrimSpace(doc.Text))
text := strings.TrimSpace(doc.Text)
// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
gapSeconds := 0.0
@@ -384,7 +383,6 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
return pb.TranscriptResult{
Text: text,
Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
Eou: eou,
}
}
@@ -411,25 +409,7 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
}
segments = append(segments, seg)
}
return pb.TranscriptResult{Text: text, Segments: segments, Eou: eou}
}
// stripEouMarker removes a trailing literal <EOU>/<EOB> from offline-decode
// text and reports whether the decode ended on an end-of-UTTERANCE token. The
// realtime EOU model's offline decode keeps the special token in the
// detokenized text (the streaming path strips it and surfaces it as flags
// instead); user-visible transcripts must never carry either marker, but only
// <EOU> may confirm the semantic_vad retranscribe cross-check — a decode
// ending on <EOB> means the last thing heard was a backchannel, not the user
// yielding the turn.
func stripEouMarker(text string) (string, bool) {
if strings.HasSuffix(text, "<EOU>") {
return strings.TrimSpace(strings.TrimSuffix(text, "<EOU>")), true
}
if strings.HasSuffix(text, "<EOB>") {
return strings.TrimSpace(strings.TrimSuffix(text, "<EOB>")), false
}
return text, false
return pb.TranscriptResult{Text: text, Segments: segments}
}
// splitWordsIntoSegments groups words into segments exactly as NeMo's
@@ -496,55 +476,41 @@ func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 {
return ids
}
// streamSegmenter accumulates streaming decode increments into per-utterance
// segments. <EOU>/<EOB> are the model's own utterance boundaries; each closes a
// segment. When the feed carries per-word timings (ABI v4 JSON), a closed
// segment takes its start/end from its first/last word; against an older
// text-only library (no words) it falls back to segmenting the delta text, so
// the same assembler serves both paths.
// streamSegmenter accumulates streaming words into per-utterance segments. EOU
// is the model's own utterance boundary; each closed segment takes its start/end
// from its first/last accumulated word.
type streamSegmenter struct {
segs []*pb.TranscriptSegment
cur []transcriptWord // words for the open segment (ABI v4 JSON path)
curText []string // delta text for the open segment (text-only path)
nextID int32
segs []*pb.TranscriptSegment
cur []transcriptWord
nextID int32
}
func (s *streamSegmenter) add(r streamFeedResult) {
s.cur = append(s.cur, r.Words...)
if len(r.Words) == 0 && r.Delta != "" {
// Older libparakeet.so with no per-word timing: segment from the text.
s.curText = append(s.curText, r.Delta)
}
// Both <EOU> and <EOB> reset the decoder, so both close a segment.
if r.Eou || r.Eob {
func (s *streamSegmenter) add(doc streamFeedJSON) {
s.cur = append(s.cur, doc.Words...)
// Close the segment on either turn signal: <EOU> (end of utterance) or
// <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we
// OR them here to keep the v4 segmentation boundaries.
if doc.Eou != 0 || doc.Eob != 0 {
s.flush()
}
}
func (s *streamSegmenter) flush() {
switch {
case len(s.cur) > 0:
parts := make([]string, len(s.cur))
for i, w := range s.cur {
parts[i] = w.W
}
s.segs = append(s.segs, &pb.TranscriptSegment{
Id: s.nextID,
Start: secondsToNanos(s.cur[0].Start),
End: secondsToNanos(s.cur[len(s.cur)-1].End),
Text: strings.TrimSpace(strings.Join(parts, " ")),
})
s.nextID++
case len(s.curText) > 0:
// No words this segment: emit a text-only segment (no timestamps),
// skipping a purely-whitespace one as the legacy text path did.
if t := strings.TrimSpace(strings.Join(s.curText, "")); t != "" {
s.segs = append(s.segs, &pb.TranscriptSegment{Id: s.nextID, Text: t})
s.nextID++
}
if len(s.cur) == 0 {
return
}
parts := make([]string, len(s.cur))
for i, w := range s.cur {
parts[i] = w.W
}
s.segs = append(s.segs, &pb.TranscriptSegment{
Id: s.nextID,
Start: secondsToNanos(s.cur[0].Start),
End: secondsToNanos(s.cur[len(s.cur)-1].End),
Text: strings.TrimSpace(strings.Join(parts, " ")),
})
s.nextID++
s.cur = nil
s.curText = nil
}
func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
@@ -569,119 +535,18 @@ func secondsToNanos(sec float64) int64 {
return int64(sec * 1e9)
}
// Per-C-call engine serialization for the streaming paths.
//
// Every individual C call (begin / feed / finalize / free) takes engineMu and
// re-checks ctxPtr under the lock; the lock is NEVER held across a stream's
// lifetime. This is safe because each parakeet.cpp call builds its own ggml
// graph and all streaming caches live in the session object, not the ctx —
// the only ctx-shared mutable state is last_error, which is why it is read
// under the same lock as the failing call. Holding the lock per call (rather
// than per stream, as this file previously did) keeps a long-lived live
// session from starving batched unary transcription and vice versa.
//
// A stream must not outlive its ctx (C-API contract). Free() takes engineMu
// and zeroes ctxPtr, so a racing per-call helper returns ModelNotLoaded
// instead of feeding a freed engine; streamFree of an orphaned session only
// runs the session destructor, which does not touch the ctx.
// streamBegin opens a cache-aware streaming session. A 0 stream with nil
// error means the loaded model is not a streaming model.
func (p *ParakeetCpp) streamBegin(lang string) (uintptr, error) {
p.engineMu.Lock()
defer p.engineMu.Unlock()
if p.ctxPtr == 0 {
return 0, grpcerrors.ModelNotLoaded("parakeet-cpp")
}
if CppStreamBeginLang != nil {
return CppStreamBeginLang(p.ctxPtr, lang), nil
}
return CppStreamBegin(p.ctxPtr), nil
}
func (p *ParakeetCpp) streamFree(stream uintptr) {
if stream == 0 {
return
}
p.engineMu.Lock()
defer p.engineMu.Unlock()
CppStreamFree(stream)
}
// streamFeedText runs one text-mode feed (or the finalize flush when
// finalize is true) under engineMu, returning the newly-finalized delta and
// whether an <EOU>/<EOB> fired during the call.
func (p *ParakeetCpp) streamFeedText(stream uintptr, pcm []float32, finalize bool) (delta string, eou, eob bool, err error) {
p.engineMu.Lock()
defer p.engineMu.Unlock()
if p.ctxPtr == 0 {
return "", false, false, grpcerrors.ModelNotLoaded("parakeet-cpp")
}
var ret uintptr
var events int32
if finalize {
ret = CppStreamFinalize(stream)
} else {
ret = CppStreamFeed(stream, pcm, int32(len(pcm)), unsafe.Pointer(&events))
}
if ret == 0 {
// last_error is ctx-shared: read it under the same lock as the call.
msg := CppLastError(p.ctxPtr)
if msg == "" {
msg = "unknown error"
}
return "", false, false, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
}
delta = goStringFromCPtr(ret)
CppFreeString(ret)
// ABI v5: eou_out is a bitmask (bit 0 = <EOU>, bit 1 = <EOB>). A v4
// library sets 0/1 for either token, which the bit-0 test reads as the
// old conflated eou — the EOB distinction simply isn't available there.
return delta, events&1 != 0, events&2 != 0, nil
}
// streamFeedDoc runs one ABI v4 JSON feed (or finalize) under engineMu and
// returns the parsed {text,eou,frame_sec,words} document.
func (p *ParakeetCpp) streamFeedDoc(stream uintptr, pcm []float32, finalize bool) (streamFeedJSON, error) {
p.engineMu.Lock()
defer p.engineMu.Unlock()
if p.ctxPtr == 0 {
return streamFeedJSON{}, grpcerrors.ModelNotLoaded("parakeet-cpp")
}
var ret uintptr
if finalize {
ret = CppStreamFinalizeJSON(stream)
} else {
ret = CppStreamFeedJSON(stream, pcm, int32(len(pcm)))
}
if ret == 0 {
msg := CppLastError(p.ctxPtr)
if msg == "" {
msg = "unknown error"
}
return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
}
raw := goStringFromCPtr(ret)
CppFreeString(ret)
var doc streamFeedJSON
if err := json.Unmarshal([]byte(raw), &doc); err != nil {
return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
}
return doc, nil
}
// AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it through
// the shared decode driver (feedSlices/flushTail), and emits each
// newly-finalized text run as a TranscriptStreamResponse delta. <EOU>/<EOB>
// events close the current segment; a closing FinalResult carries the full
// transcript, the per-utterance segments, and whether the file ended on an
// utterance boundary.
// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in
// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text
// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the
// current segment; a closing FinalResult carries the full transcript and the
// per-utterance segments.
//
// stream_begin returns 0 for models that are not cache-aware streaming models
// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those this
// returns codes.Unimplemented rather than faking a stream from an offline
// decode — see the stream==0 branch and grpcerrors.StreamTranscriptionUnsupported.
// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall
// back to a single offline transcription emitted as one delta plus a closing
// FinalResult, matching LocalAI's non-streaming streaming contract (and the
// whisper backend), so the streaming endpoint works for every model.
func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
defer close(results)
@@ -695,73 +560,185 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
return status.Error(codes.Canceled, "transcription cancelled")
}
stream, err := p.streamBegin(opts.GetLanguage())
if err != nil {
return err
var stream uintptr
if CppStreamBeginLang != nil {
stream = CppStreamBeginLang(p.ctxPtr, opts.GetLanguage())
} else {
stream = CppStreamBegin(p.ctxPtr)
}
if stream == 0 {
// Not a cache-aware streaming model. Report the missing capability
// honestly instead of decoding offline and emitting it as one "delta"
// + final: a client that asked for streaming must learn the model
// cannot stream, not receive a batch result dressed as a stream (which
// is indistinguishable except qualitatively, and silently breaks any
// feature that genuinely needs incremental output). Callers wanting a
// plain transcript use the unary AudioTranscription path. This mirrors
// AudioTranscriptionLive, which already returns Unimplemented here.
return grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp",
"loaded model is not a cache-aware streaming model")
// Not a cache-aware streaming model: run a normal offline
// transcription and emit it as one delta + a closing final result.
res, err := p.AudioTranscription(ctx, opts)
if err != nil {
return err
}
if t := strings.TrimSpace(res.Text); t != "" {
results <- &pb.TranscriptStreamResponse{Delta: t}
}
results <- &pb.TranscriptStreamResponse{FinalResult: &res}
return nil
}
defer p.streamFree(stream)
defer CppStreamFree(stream)
// The C engine is a single shared context: a streaming session and a batched
// unary dispatch must never touch it at once, so hold engineMu for the whole
// stream. This lock is intentionally taken AFTER the non-streaming fallback
// above returns: that fallback goes through AudioTranscription -> the batcher
// -> runBatch, which itself acquires engineMu, so locking here first would
// deadlock. Do not hoist this lock above the fallback.
p.engineMu.Lock()
defer p.engineMu.Unlock()
data, duration, err := decodeWavMono16k(opts.Dst)
if err != nil {
return err
}
// Fold the shared decode driver's per-feed increments into the streamed
// deltas and the closing batch result: words/text accumulate into
// per-utterance segments (streamSegmenter), and the utterance-boundary
// latch (boundary.go) records whether the file ended on an <EOU>. These
// are the offline path's concern — the live RPC carries none of them.
// ABI v4: when the streaming JSON entry points are present, drive them so the
// per-utterance segments carry per-word start/end timestamps. Falls through to
// the text-only loop below against an older libparakeet.so. Runs under the
// engineMu already held above.
if CppStreamFeedJSON != nil {
return p.streamJSON(ctx, stream, data, duration, results)
}
var (
full strings.Builder
seg streamSegmenter
boundary utteranceBoundary
segText strings.Builder
segments []*pb.TranscriptSegment
segID int32
)
emit := func(r streamFeedResult) error {
if r.Delta != "" {
full.WriteString(r.Delta)
results <- &pb.TranscriptStreamResponse{Delta: r.Delta}
flushSegment := func() {
t := strings.TrimSpace(segText.String())
segText.Reset()
if t == "" {
return
}
seg.add(r)
boundary = boundary.observe(r)
segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t})
segID++
}
// emitDelta consumes the malloc'd char* returned by feed/finalize: frees
// it, accumulates the text, and sends a delta when non-empty. A 0 return
// is an error (vs the "" empty-but-non-NULL no-new-text case).
emitDelta := func(ret uintptr) error {
if ret == 0 {
msg := CppLastError(p.ctxPtr)
if msg == "" {
msg = "unknown error"
}
return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
}
delta := goStringFromCPtr(ret)
CppFreeString(ret)
if delta == "" {
return nil
}
full.WriteString(delta)
segText.WriteString(delta)
results <- &pb.TranscriptStreamResponse{Delta: delta}
return nil
}
if err := p.feedSlices(ctx, stream, data, emit); err != nil {
return err
}
if err := p.flushTail(stream, emit); err != nil {
return err
}
seg.flush() // close a trailing utterance that never saw an <EOU>
for off := 0; off < len(data); off += streamChunkSamples {
if err := ctx.Err(); err != nil {
return status.Error(codes.Canceled, "transcription cancelled")
}
end := min(off+streamChunkSamples, len(data))
chunk := data[off:end]
// final.Text is the exact concatenation of the streamed deltas (full is
// their accumulation), so concat(deltas) == FinalResult.Text holds even
// when the model prepends a leading space to the first word (SentencePiece
// detokenization). This matches the whisper backend's streaming contract.
// The single-segment fallback stays trimmed.
fullText := full.String()
segments := seg.segments()
if trimmed := strings.TrimSpace(fullText); len(segments) == 0 && trimmed != "" {
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: trimmed})
var eou int32
ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou))
if err := emitDelta(ret); err != nil {
return err
}
if eou != 0 {
flushSegment()
}
}
// Flush the streaming tail (final encoder chunk).
if err := emitDelta(CppStreamFinalize(stream)); err != nil {
return err
}
flushSegment()
text := strings.TrimSpace(full.String())
if len(segments) == 0 && text != "" {
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
}
results <- &pb.TranscriptStreamResponse{
FinalResult: &pb.TranscriptResult{
Text: fullText,
Text: text,
Segments: segments,
Duration: duration,
},
}
return nil
}
// streamJSON drives the streaming JSON entry points (present since ABI v4): each
// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The
// newly-finalized text is emitted as a delta (unchanged streaming contract)
// while words are accumulated into per-utterance segments (closed on <EOU> or
// <EOB>) so the closing FinalResult carries timestamped segments. Runs under
// engineMu (already held by the caller).
func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
duration float32, results chan *pb.TranscriptStreamResponse) error {
var (
full strings.Builder
seg streamSegmenter
)
// consume frees the malloc'd char* (a 0 return is an error), parses the JSON,
// emits the delta, and routes words through the segmenter.
consume := func(ret uintptr) error {
if ret == 0 {
msg := CppLastError(p.ctxPtr)
if msg == "" {
msg = "unknown error"
}
return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
}
raw := goStringFromCPtr(ret)
CppFreeString(ret)
var doc streamFeedJSON
if err := json.Unmarshal([]byte(raw), &doc); err != nil {
return fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
}
if doc.Text != "" {
full.WriteString(doc.Text)
results <- &pb.TranscriptStreamResponse{Delta: doc.Text}
}
seg.add(doc)
return nil
}
for off := 0; off < len(data); off += streamChunkSamples {
if err := ctx.Err(); err != nil {
return status.Error(codes.Canceled, "transcription cancelled")
}
end := min(off+streamChunkSamples, len(data))
chunk := data[off:end]
if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil {
return err
}
}
if err := consume(CppStreamFinalizeJSON(stream)); err != nil {
return err
}
seg.flush() // close any trailing utterance that never saw an EOU
text := strings.TrimSpace(full.String())
segments := seg.segments()
if len(segments) == 0 && text != "" {
segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
}
results <- &pb.TranscriptStreamResponse{
FinalResult: &pb.TranscriptResult{
Text: text,
Segments: segments,
Duration: duration,
Eou: boundary.ended(),
},
}
return nil
@@ -826,10 +803,6 @@ func (p *ParakeetCpp) Free() error {
close(p.batStop)
p.batStop = nil
}
// engineMu so an in-flight streaming call (which locks per C call and
// re-checks ctxPtr under the lock) can never feed into a freed ctx.
p.engineMu.Lock()
defer p.engineMu.Unlock()
if p.ctxPtr != 0 {
CppFree(p.ctxPtr)
p.ctxPtr = 0

View File

@@ -14,8 +14,6 @@ import (
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
func TestParakeetCpp(t *testing.T) {
@@ -203,29 +201,6 @@ var _ = Describe("ParakeetCpp", func() {
})
Context("AudioTranscriptionStream", func() {
It("returns the typed Unimplemented signal for non-streaming models (no offline fallback)", func() {
// stream_begin == 0 means the loaded model is not a cache-aware
// streaming model. The backend must surface that, not silently
// decode offline and fake a one-shot "stream".
savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
defer func() { CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang }()
CppStreamBeginLang = nil
CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
p := &ParakeetCpp{ctxPtr: 1}
results := make(chan *pb.TranscriptStreamResponse, 8)
err := p.AudioTranscriptionStream(context.Background(),
&pb.TranscriptRequest{Dst: "ignored.wav"}, results)
Expect(status.Code(err)).To(Equal(codes.Unimplemented))
// Honest signal: nothing was emitted — no faked batch result.
var emitted []*pb.TranscriptStreamResponse
for r := range results {
emitted = append(emitted, r)
}
Expect(emitted).To(BeEmpty())
})
It("streams deltas and a closing FinalResult from a cache-aware model", func() {
// Streaming needs a cache-aware streaming model (e.g.
// realtime_eou); the offline test model would fail stream_begin.

View File

@@ -1,186 +0,0 @@
package main
import (
"strings"
"time"
"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/xlog"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
// liveSampleRate is the only PCM rate the parakeet C streaming API accepts.
const liveSampleRate = 16000
// AudioTranscriptionLive drives one cache-aware streaming session over audio
// fed incrementally by the caller (the realtime API's semantic_vad turn
// detection). Contract:
//
// - the first request must carry a Config; a Config mid-stream resets the
// decode session (free + begin) and drops accumulated transcript state;
// - a Ready ack is sent right after a successful stream_begin so callers
// can degrade synchronously when the model has no streaming support
// (LiveTranscriptionUnsupported, codes.Unimplemented);
// - every feed that produced output is forwarded as {delta, eou, words};
// the <EOU>/<EOB> flag is the model's own utterance boundary and the
// decoder auto-resets after it, so one session spans many utterances;
// - closing the send side finalizes: the held-back tail chunk is flushed
// (the last ~2 encoder frames of words only appear here) and a terminal
// FinalResult carries the full transcript Text only. Per-utterance
// segments, duration, and the terminal <EOU> flag are NOT produced here —
// the realtime core consumes the streamed per-feed tokens and the final
// Text; those batch fields are the file path's concern (see
// AudioTranscriptionStream).
//
// Engine access is serialized per C call (streamBegin/streamFeed*/streamFree
// take engineMu internally), never for the session lifetime — unary
// transcription keeps flowing between feeds.
func (p *ParakeetCpp) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
defer close(out)
if p.ctxPtr == 0 {
return grpcerrors.ModelNotLoaded("parakeet-cpp")
}
first, ok := <-in
if !ok {
return nil // caller closed without sending anything
}
cfg := first.GetConfig()
if cfg == nil {
return status.Error(codes.InvalidArgument, "parakeet-cpp: first live message must carry a config")
}
if err := validateLiveConfig(cfg); err != nil {
return err
}
stream, err := p.streamBegin(cfg.GetLanguage())
if err != nil {
return err
}
if stream == 0 {
return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
"loaded model is not a cache-aware streaming model")
}
// stream is reassigned on a mid-stream Config reset; free whatever is
// current when the RPC unwinds.
defer func() { p.streamFree(stream) }()
out <- &pb.TranscriptLiveResponse{Ready: true}
var (
full strings.Builder
fedSecs float64
// behindSec accumulates how far decode wall time has fallen behind
// the audio it was fed. A live caller feeds in real time, so a
// persistent positive backlog means every downstream signal —
// including the <EOU> the turn detector waits on — arrives that many
// seconds late. Warned once per session; reset by a Config reset.
behindSec float64
behindWarned bool
)
// emit forwards one decode increment: it streams the per-feed tokens the
// realtime turn detector consumes (delta/eou/eob/words) and accumulates the
// running transcript for the closing FinalResult. No segmentation or
// boundary latch here — the live consumer reads only the streamed tokens
// and the final Text; per-utterance segments and the terminal <EOU> flag
// are an offline-path concern (see AudioTranscriptionStream / boundary.go).
emit := func(r streamFeedResult) error {
if r.Delta != "" {
full.WriteString(r.Delta)
}
if r.Delta != "" || r.Eou || r.Eob || len(r.Words) > 0 {
out <- &pb.TranscriptLiveResponse{
Delta: r.Delta,
Eou: r.Eou,
Eob: r.Eob,
Words: liveWordsToProto(r.Words),
}
}
return nil
}
for req := range in {
switch payload := req.GetPayload().(type) {
case *pb.TranscriptLiveRequest_Config:
if err := validateLiveConfig(payload.Config); err != nil {
return err
}
// Reset: a fresh decode session, dropping accumulated state.
p.streamFree(stream)
stream, err = p.streamBegin(payload.Config.GetLanguage())
if err != nil {
return err
}
if stream == 0 {
return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
"loaded model is not a cache-aware streaming model")
}
full.Reset()
fedSecs = 0
case *pb.TranscriptLiveRequest_Audio:
pcm := payload.Audio.GetPcm()
audioSec := float64(len(pcm)) / liveSampleRate
fedSecs += audioSec
start := time.Now()
// nil ctx: a live session is bounded by this request channel, not a
// context — cancellation is the caller closing the stream.
if err := p.feedSlices(nil, stream, pcm, emit); err != nil {
return err
}
wallSec := time.Since(start).Seconds()
behindSec += wallSec - audioSec
if behindSec < 0 {
behindSec = 0
}
xlog.Debug("parakeet-cpp: live feed",
"audio_ms", int(audioSec*1000), "wall_ms", int(wallSec*1000),
"behind_ms", int(behindSec*1000), "fed_s", fedSecs)
if behindSec > 1 && !behindWarned {
behindWarned = true
xlog.Warn("parakeet-cpp: live decode is falling behind real time; "+
"end-of-utterance signals will arrive late",
"behind_s", behindSec, "fed_s", fedSecs)
}
}
}
// Send side closed: flush the streaming tail and emit the final transcript.
// The live FinalResult carries only Text — the authoritative full-turn
// transcript the realtime core commits. Per-utterance segments, duration,
// and the terminal <EOU> flag are not produced on the live path.
if err := p.flushTail(stream, emit); err != nil {
return err
}
out <- &pb.TranscriptLiveResponse{
FinalResult: &pb.TranscriptResult{Text: strings.TrimSpace(full.String())},
}
return nil
}
func validateLiveConfig(cfg *pb.TranscriptLiveConfig) error {
if sr := cfg.GetSampleRate(); sr != 0 && sr != liveSampleRate {
return status.Errorf(codes.InvalidArgument,
"parakeet-cpp: unsupported live sample_rate %d (only %d)", sr, liveSampleRate)
}
return nil
}
func liveWordsToProto(words []transcriptWord) []*pb.TranscriptWord {
if len(words) == 0 {
return nil
}
out := make([]*pb.TranscriptWord, len(words))
for i, w := range words {
out[i] = &pb.TranscriptWord{
Start: secondsToNanos(w.Start),
End: secondsToNanos(w.End),
Text: w.W,
}
}
return out
}

View File

@@ -1,417 +0,0 @@
package main
import (
"sync"
"time"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
// The live-RPC specs drive AudioTranscriptionLive entirely against stubbed
// Cpp* package vars (the same seam batcher_test.go uses), so they run
// without libparakeet.so.
// liveCstrPool hands out NUL-terminated C-style strings backed by Go memory
// and keeps them alive for the duration of a spec (goStringFromCPtr reads
// through the raw pointer; Go's GC must not collect the backing array while
// a stub's return value is in flight).
type liveCstrPool struct {
mu sync.Mutex
bufs [][]byte
}
func (p *liveCstrPool) cstr(s string) uintptr {
p.mu.Lock()
defer p.mu.Unlock()
b := append([]byte(s), 0)
p.bufs = append(p.bufs, b)
return uintptr(unsafe.Pointer(&b[0]))
}
// liveStubs swaps every C entry point the live path touches and returns a
// restore func for AfterEach.
func liveStubs() (restore func()) {
savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
savedFeed, savedFeedJSON := CppStreamFeed, CppStreamFeedJSON
savedFinalize, savedFinalizeJSON := CppStreamFinalize, CppStreamFinalizeJSON
savedFree, savedLastError := CppStreamFree, CppLastError
savedFreeString := CppFreeString
return func() {
CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang
CppStreamFeed, CppStreamFeedJSON = savedFeed, savedFeedJSON
CppStreamFinalize, CppStreamFinalizeJSON = savedFinalize, savedFinalizeJSON
CppStreamFree, CppLastError = savedFree, savedLastError
CppFreeString = savedFreeString
}
}
// runLive starts the RPC on its own goroutine and returns the request
// channel plus a collector for everything the backend emitted.
func runLive(p *ParakeetCpp) (chan *pb.TranscriptLiveRequest, chan *pb.TranscriptLiveResponse, chan error) {
in := make(chan *pb.TranscriptLiveRequest)
out := make(chan *pb.TranscriptLiveResponse, 32)
errCh := make(chan error, 1)
go func() { errCh <- p.AudioTranscriptionLive(in, out) }()
return in, out, errCh
}
func liveConfig(lang string) *pb.TranscriptLiveRequest {
return &pb.TranscriptLiveRequest{
Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: lang}},
}
}
func liveAudio(pcm []float32) *pb.TranscriptLiveRequest {
return &pb.TranscriptLiveRequest{
Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: pcm}},
}
}
func collectLive(out chan *pb.TranscriptLiveResponse) []*pb.TranscriptLiveResponse {
var got []*pb.TranscriptLiveResponse
for r := range out {
got = append(got, r)
}
return got
}
var _ = Describe("AudioTranscriptionLive (stubbed C API)", func() {
var (
pool *liveCstrPool
restore func()
p *ParakeetCpp
)
BeforeEach(func() {
pool = &liveCstrPool{}
restore = liveStubs()
p = &ParakeetCpp{ctxPtr: 1}
CppStreamBeginLang = nil
CppStreamBegin = func(ctx uintptr) uintptr { return 7 }
CppStreamFree = func(s uintptr) {}
CppFreeString = func(s uintptr) {}
CppLastError = func(ctx uintptr) string { return "stub error" }
CppStreamFeed = nil
CppStreamFeedJSON = nil
CppStreamFinalize = nil
CppStreamFinalizeJSON = nil
})
AfterEach(func() { restore() })
It("rejects a stream whose first message is not a config", func() {
in, out, errCh := runLive(p)
in <- liveAudio([]float32{0.1})
close(in)
err := <-errCh
Expect(status.Code(err)).To(Equal(codes.InvalidArgument))
Expect(collectLive(out)).To(BeEmpty())
})
It("rejects a non-16k sample rate", func() {
in, _, errCh := runLive(p)
in <- &pb.TranscriptLiveRequest{
Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{SampleRate: 8000}},
}
close(in)
Expect(status.Code(<-errCh)).To(Equal(codes.InvalidArgument))
})
It("returns the typed Unimplemented signal for non-streaming models, before any ack", func() {
CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
in, out, errCh := runLive(p)
in <- liveConfig("")
close(in)
err := <-errCh
Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(BeTrue())
Expect(collectLive(out)).To(BeEmpty())
})
It("streams deltas, eou flags and words on the JSON path and finalizes on close", func() {
var freed []uintptr
CppStreamFree = func(s uintptr) { freed = append(freed, s) }
feeds := 0
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
feeds++
switch feeds {
case 1:
return pool.cstr(`{"text":"hello ","eou":0,"frame_sec":0.08,` +
`"words":[{"w":"hello","start":0.1,"end":0.4,"conf":0.9}]}`)
default:
return pool.cstr(`{"text":"world","eou":1,"frame_sec":0.08,` +
`"words":[{"w":"world","start":0.5,"end":0.8,"conf":0.9}]}`)
}
}
CppStreamFinalizeJSON = func(s uintptr) uintptr {
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
}
in, out, errCh := runLive(p)
in <- liveConfig("en")
in <- liveAudio(make([]float32, 100))
in <- liveAudio(make([]float32, 200))
close(in)
Expect(<-errCh).NotTo(HaveOccurred())
got := collectLive(out)
Expect(got).To(HaveLen(4)) // ready, two deltas, final
Expect(got[0].Ready).To(BeTrue())
Expect(got[1].Delta).To(Equal("hello "))
Expect(got[1].Eou).To(BeFalse())
Expect(got[1].Words).To(HaveLen(1))
Expect(got[1].Words[0].Text).To(Equal("hello"))
Expect(got[2].Delta).To(Equal("world"))
Expect(got[2].Eou).To(BeTrue())
final := got[3].FinalResult
Expect(final).NotTo(BeNil())
Expect(final.Text).To(Equal("hello world"))
// The live FinalResult carries only Text. Per-utterance segments,
// duration and the terminal eou flag are an offline-path concern (see
// boundary.go / AudioTranscriptionStream); the realtime core reads the
// streamed per-feed tokens above plus this Text.
Expect(final.Eou).To(BeFalse())
Expect(final.Segments).To(BeEmpty())
Expect(final.Duration).To(BeZero())
Expect(freed).To(Equal([]uintptr{7}))
})
It("falls back to the text feed (eou out-param) when the JSON entry points are absent", func() {
feeds := 0
CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
feeds++
if feeds == 2 {
*(*int32)(eouOut) = 1
return pool.cstr("done")
}
return pool.cstr("first ")
}
CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
in, out, errCh := runLive(p)
in <- liveConfig("")
in <- liveAudio(make([]float32, 10))
in <- liveAudio(make([]float32, 10))
close(in)
Expect(<-errCh).NotTo(HaveOccurred())
got := collectLive(out)
Expect(got).To(HaveLen(4))
Expect(got[1].Delta).To(Equal("first "))
Expect(got[1].Eou).To(BeFalse())
Expect(got[2].Delta).To(Equal("done"))
Expect(got[2].Eou).To(BeTrue())
Expect(got[3].FinalResult.Text).To(Equal("first done"))
})
It("forwards <EOB> as eob — a backchannel, never an eou (ABI v5 JSON)", func() {
feeds := 0
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
feeds++
if feeds == 1 {
return pool.cstr(`{"text":"uh-huh","eou":0,"eob":1,"frame_sec":0.08,` +
`"words":[{"w":"uh-huh","start":0.1,"end":0.3,"conf":0.9}]}`)
}
return pool.cstr(`{"text":"the turn","eou":1,"eob":0,"frame_sec":0.08,` +
`"words":[{"w":"the","start":0.5,"end":0.6,"conf":0.9},{"w":"turn","start":0.6,"end":0.8,"conf":0.9}]}`)
}
CppStreamFinalizeJSON = func(s uintptr) uintptr {
return pool.cstr(`{"text":"","eou":0,"eob":0,"frame_sec":0.08,"words":[]}`)
}
in, out, errCh := runLive(p)
in <- liveConfig("")
in <- liveAudio(make([]float32, 10))
in <- liveAudio(make([]float32, 10))
close(in)
Expect(<-errCh).NotTo(HaveOccurred())
got := collectLive(out)
Expect(got).To(HaveLen(4))
Expect(got[1].Eob).To(BeTrue())
Expect(got[1].Eou).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
Expect(got[2].Eou).To(BeTrue())
})
It("maps the v5 eou_out bitmask on the text path (bit0 <EOU>, bit1 <EOB>)", func() {
feeds := 0
CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
feeds++
if feeds == 1 {
*(*int32)(eouOut) = 2 // <EOB> only
return pool.cstr("uh-huh")
}
*(*int32)(eouOut) = 1 // <EOU>
return pool.cstr(" done")
}
CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
in, out, errCh := runLive(p)
in <- liveConfig("")
in <- liveAudio(make([]float32, 10))
in <- liveAudio(make([]float32, 10))
close(in)
Expect(<-errCh).NotTo(HaveOccurred())
got := collectLive(out)
Expect(got).To(HaveLen(4))
Expect(got[1].Eob).To(BeTrue())
Expect(got[1].Eou).To(BeFalse())
Expect(got[2].Eou).To(BeTrue())
Expect(got[2].Eob).To(BeFalse())
})
It("accumulates trailing text after an EOU into the final transcript", func() {
feeds := 0
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
feeds++
if feeds == 1 {
return pool.cstr(`{"text":"turn one","eou":1,"frame_sec":0.08,"words":[]}`)
}
return pool.cstr(`{"text":" and more","eou":0,"frame_sec":0.08,"words":[]}`)
}
CppStreamFinalizeJSON = func(s uintptr) uintptr {
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
}
in, out, errCh := runLive(p)
in <- liveConfig("")
in <- liveAudio(make([]float32, 10))
in <- liveAudio(make([]float32, 10))
close(in)
Expect(<-errCh).NotTo(HaveOccurred())
got := collectLive(out)
final := got[len(got)-1].FinalResult
Expect(final.Text).To(Equal("turn one and more"))
})
It("resets the decode session on a mid-stream config", func() {
var begun, freed int
CppStreamBegin = func(ctx uintptr) uintptr { begun++; return uintptr(10 + begun) }
CppStreamFree = func(s uintptr) { freed++ }
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
return pool.cstr(`{"text":"x","eou":0,"frame_sec":0.08,"words":[]}`)
}
CppStreamFinalizeJSON = func(s uintptr) uintptr {
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
}
in, out, errCh := runLive(p)
in <- liveConfig("")
in <- liveAudio(make([]float32, 10))
in <- liveConfig("") // reset
in <- liveAudio(make([]float32, 10))
close(in)
Expect(<-errCh).NotTo(HaveOccurred())
got := collectLive(out)
final := got[len(got)-1].FinalResult
Expect(final.Text).To(Equal("x"), "pre-reset transcript dropped")
Expect(begun).To(Equal(2))
Expect(freed).To(Equal(2), "old session freed on reset, new one on unwind")
})
It("does not hold engineMu between feeds (unary work interleaves with a live session)", func() {
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
}
CppStreamFinalizeJSON = func(s uintptr) uintptr {
return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
}
in, out, errCh := runLive(p)
in <- liveConfig("")
in <- liveAudio(make([]float32, 10))
// The session is open and idle between feeds: the engine lock must be
// acquirable, which is what lets batched unary transcription proceed
// mid-session. Under stream-lifetime locking this probe would block
// until the stream ended and the Eventually would time out.
locked := make(chan struct{})
go func() {
p.engineMu.Lock()
p.engineMu.Unlock() //nolint:staticcheck // probe: acquire-release proves availability
close(locked)
}()
Eventually(locked, time.Second).Should(BeClosed())
close(in)
Expect(<-errCh).NotTo(HaveOccurred())
collectLive(out)
})
It("errors out and reads last_error under the lock when a feed fails", func() {
CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { return 0 }
in, out, errCh := runLive(p)
in <- liveConfig("")
in <- liveAudio(make([]float32, 10))
err := <-errCh
Expect(err).To(MatchError(ContainSubstring("stub error")))
got := collectLive(out)
Expect(got).To(HaveLen(1)) // just the ready ack
close(in)
})
})
var _ = Describe("stripEouMarker", func() {
It("strips a trailing <EOU> and reports it", func() {
text, eou := stripEouMarker("it is certainly very like the old portrait<EOU>")
Expect(text).To(Equal("it is certainly very like the old portrait"))
Expect(eou).To(BeTrue())
})
It("strips a trailing <EOB> WITHOUT reporting an utterance end", func() {
// A decode ending on a backchannel must not confirm the
// retranscribe gate — the user was acknowledging, not yielding.
text, eou := stripEouMarker("uh-huh<EOB>")
Expect(text).To(Equal("uh-huh"))
Expect(eou).To(BeFalse())
})
It("leaves marker-free text alone", func() {
text, eou := stripEouMarker("plain transcript")
Expect(text).To(Equal("plain transcript"))
Expect(eou).To(BeFalse())
})
It("does not strip a marker in the middle of the text", func() {
text, eou := stripEouMarker("a<EOU>b")
Expect(text).To(Equal("a<EOU>b"))
Expect(eou).To(BeFalse())
})
})
var _ = Describe("transcriptResultFromDoc EOU handling", func() {
It("strips the offline marker from text and sets the result flag", func() {
doc := transcriptJSON{Text: "the old portrait<EOU>"}
res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
Expect(res.Text).To(Equal("the old portrait"))
Expect(res.Eou).To(BeTrue())
Expect(res.Segments).To(HaveLen(1))
Expect(res.Segments[0].Text).To(Equal("the old portrait"))
})
It("reports eou=false for marker-free decodes", func() {
doc := transcriptJSON{Text: "no marker here"}
res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
Expect(res.Text).To(Equal("no marker here"))
Expect(res.Eou).To(BeFalse())
})
})

View File

@@ -106,7 +106,7 @@ var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
var _ = Describe("streaming segment assembly", func() {
It("closes a segment with start/end from its words on EOU", func() {
acc := &streamSegmenter{}
acc.add(streamFeedResult{Delta: "hello world", Eou: true, Words: []transcriptWord{
acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
}})
segs := acc.segments()
@@ -118,9 +118,9 @@ var _ = Describe("streaming segment assembly", func() {
It("buffers words across feeds until EOU", func() {
acc := &streamSegmenter{}
acc.add(streamFeedResult{Delta: "hi", Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
Expect(acc.segments()).To(BeEmpty())
acc.add(streamFeedResult{Delta: "there", Eou: true, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
Expect(acc.segments()).To(HaveLen(1))
Expect(acc.segments()[0].Text).To(Equal("hi there"))
})
@@ -129,7 +129,7 @@ var _ = Describe("streaming segment assembly", func() {
// field; a backchannel must still close the segment as it did in v4.
It("closes a segment on EOB (backchannel) too", func() {
acc := &streamSegmenter{}
acc.add(streamFeedResult{Delta: "uh huh", Eob: true, Words: []transcriptWord{
acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
}})
segs := acc.segments()
@@ -137,18 +137,4 @@ var _ = Describe("streaming segment assembly", func() {
Expect(segs[0].Text).To(Equal("uh huh"))
Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
})
// Older text-only libparakeet.so: no per-word timings, so a segment is cut
// from the delta text on each <EOU>/<EOB> (no timestamps), one per utterance.
It("falls back to text segments when the feed carries no words", func() {
acc := &streamSegmenter{}
acc.add(streamFeedResult{Delta: "first turn", Eou: true})
acc.add(streamFeedResult{Delta: "second turn", Eou: true})
segs := acc.segments()
Expect(segs).To(HaveLen(2))
Expect(segs[0].Text).To(Equal("first turn"))
Expect(segs[1].Text).To(Equal("second turn"))
Expect(segs[0].Start).To(Equal(int64(0)), "no per-word timing on the text path")
Expect(segs[0].End).To(Equal(int64(0)))
})
})

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=2574f5936571645f784b77623e1f09bad97d948a
STABLEDIFFUSION_GGML_VERSION?=8caa3f908ae6d4a4bef531e73b9a969f266a3d1f
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -798,7 +798,6 @@ void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed) {
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count) {
sd_image_t* results;
int num_results_out = 0;
std::vector<int> skip_layers = {7, 8, 9};
@@ -995,14 +994,10 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
sd_ctx_params_to_str(&ctx_params),
sd_img_gen_params_to_str(p));
bool gen_ok = generate_image(sd_c, p, &results, &num_results_out);
results = generate_image(sd_c, p);
std::free(p);
if (!gen_ok || num_results_out == 0) {
results = NULL;
}
if (results == NULL) {
fprintf (stderr, "NO results\n");
if (input_image_buffer) free(input_image_buffer);

View File

@@ -1,18 +0,0 @@
# Fetched upstream sources
sources/
# CMake build directories
build*/
# build artifacts staged in-tree by the Makefile (cp from sources/) or
# symlinked for local dev; the real sources live in voice-detect.cpp upstream.
*.so
*.so.*
voicedetect_capi.h
compile_commands.json
# Compiled backend binary
voice-detect-grpc
# Packaging output
package/

View File

@@ -1,107 +0,0 @@
# voice-detect backend Makefile.
#
# Upstream pin lives below as VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
#
# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
# symlink the .so + header into this directory and skip the clone/cmake steps:
#
# ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
# ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
# go build -o voice-detect-grpc .
#
# The default target below does the proper clone-at-pin + cmake build so CI does
# not need a side-checkout.
VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
BUILD_TYPE?=
NATIVE?=false
# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
# self-contained: dlopen needs no libggml*.so alongside it, only system libs
# (libstdc++/libgomp/libc) that the runtime image already provides.
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
# voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
# does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
# -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_*
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, reaches
# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
ifeq ($(CUDA_MAJOR_VERSION),13)
ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
endif
endif
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
else ifeq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DVOICEDETECT_GGML_METAL=ON
endif
.PHONY: voice-detect-grpc package build clean purge test all
all: voice-detect-grpc
# Clone the upstream voice-detect.cpp source at the pinned commit. Directory acts
# as the target so make only re-clones when missing. After a VOICEDETECT_VERSION
# bump, run 'make purge && make' to refetch.
sources/voice-detect.cpp:
mkdir -p sources/voice-detect.cpp
cd sources/voice-detect.cpp && \
git init -q && \
git remote add origin $(VOICEDETECT_REPO) && \
git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
git checkout FETCH_HEAD && \
git submodule update --init --recursive --depth 1 --single-branch
# Build the shared lib + header out-of-tree, then stage them next to the Go
# sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
# them up.
libvoicedetect.so: sources/voice-detect.cpp
cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./ 2>/dev/null || true
cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .
package: voice-detect-grpc
bash package.sh
build: package
# Test target. The embed/verify/analyze smoke specs are gated on
# VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
# heavy specs auto-skip and only the pure-Go parsing specs run.
test:
LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
clean: purge
rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
purge:
rm -rf sources/voice-detect.cpp

View File

@@ -1,273 +0,0 @@
package main
import (
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/xlog"
)
// purego-bound entry points from libvoicedetect.so. Names match
// voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi`
// is enough to spot drift.
//
// The opaque ctx and the malloc'd char*/float* return values are declared as
// uintptr so we get the raw pointer back and can release it via the matching
// capi free function. purego's native string/[]float32 returns would copy and
// forget the original pointer, leaking the C-owned buffer on every call.
var (
CppAbiVersion func() int32
CppLoad func(ggufPath string) uintptr
CppFree func(ctx uintptr)
CppLastError func(ctx uintptr) string
CppFreeString func(s uintptr)
CppFreeVec func(v uintptr)
CppEmbedPath func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
CppEmbedPCM func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
)
// VoiceDetect implements the speaker-recognition voice subset of the Backend
// gRPC service over libvoicedetect.so. The C side keeps a single loaded model
// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
// serializes every call.
type VoiceDetect struct {
base.SingleThread
opts loadOptions
ctxPtr uintptr
}
func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
model := opts.ModelFile
if model == "" {
model = opts.ModelPath
}
if !filepath.IsAbs(model) && opts.ModelPath != "" {
model = filepath.Join(opts.ModelPath, model)
}
if model == "" {
return errors.New("voice-detect: ModelFile is required")
}
v.opts = parseOptions(opts.Options)
if v.opts.modelName == "" {
v.opts.modelName = filepath.Base(model)
}
// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
// one backend process per model and serves requests concurrently, so the
// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
// VOICEDETECT_THREADS is read by the engine at backend construction, so it
// must be set before the capi load. A non-positive Threads means "unset":
// leave the env alone so the engine keeps its sane default.
threads := opts.Threads
if threads > 0 {
if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err)
}
xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads)
}
xlog.Info("voice-detect: loading model", "model", model,
"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
ctx := CppLoad(model)
if ctx == 0 {
// The last-error buffer lives on the ctx that was never returned, so
// surface the path the operator tried to load instead.
return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
}
v.ctxPtr = ctx
return nil
}
// VoiceEmbed returns the L2-normalized speaker embedding for an audio clip.
// The request carries a filesystem PATH; the HTTP layer materializes
// base64/URL/data-URI inputs to a temp file before the gRPC call.
func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
if v.ctxPtr == 0 {
return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
}
if req.Audio == "" {
return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
}
emb, err := v.embedPath(req.Audio)
if err != nil {
return pb.VoiceEmbedResponse{}, err
}
return pb.VoiceEmbedResponse{Embedding: emb, Model: v.opts.modelName}, nil
}
func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
var vec uintptr
var dim int32
rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
if rc != 0 || vec == 0 || dim <= 0 {
return nil, v.lastErr("embed", path)
}
defer CppFreeVec(vec)
// Copy out of the C-owned malloc'd buffer before freeing it. The
// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
// nor moves this buffer and we copy immediately.
src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
out := make([]float32, int(dim))
copy(out, src)
return out, nil
}
// VoiceVerify embeds two clips and reports whether they are the same speaker by
// cosine distance against a threshold. A request threshold <= 0 falls back to
// the model-configured default (verify_threshold option, 0.25 if unset).
func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
if v.ctxPtr == 0 {
return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
}
if req.Audio1 == "" || req.Audio2 == "" {
return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
}
threshold := req.Threshold
if threshold <= 0 {
threshold = v.opts.verifyThreshold
}
started := time.Now()
var distance float32
var verified int32
rc := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, threshold,
unsafe.Pointer(&distance), unsafe.Pointer(&verified))
if rc != 0 {
return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
}
elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
// matching the Python speaker-recognition backend's reporting.
confidence := float32(0)
if threshold > 0 {
confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
}
return pb.VoiceVerifyResponse{
Verified: verified != 0,
Distance: distance,
Threshold: threshold,
Confidence: confidence,
Model: v.opts.modelName,
ProcessingTimeMs: elapsedMs,
}, nil
}
// VoiceAnalyze runs the age/gender/emotion heads on a single clip. The C-API
// always evaluates every supported head, so the request's actions filter is
// advisory and the full analysis is returned as a single segment (the engine
// does not produce time-bounded segments).
func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
if v.ctxPtr == 0 {
return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
}
if req.Audio == "" {
return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
}
ptr := CppAnalyzeJSON(v.ctxPtr, req.Audio)
if ptr == 0 {
return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
}
defer CppFreeString(ptr)
seg, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
if err != nil {
return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, err)
}
return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{seg}}, nil
}
// analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json:
//
// {"age":42.0,
// "gender":{"label":"female","female":0.88,"male":0.12},
// "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
//
// gender is a mixed object (a "label" string plus per-class float scores), so
// it is decoded into raw messages and split in parseAnalyzeJSON.
type analyzeJSON struct {
Age float32 `json:"age"`
Gender map[string]json.RawMessage `json:"gender"`
Emotion struct {
Label string `json:"label"`
Scores map[string]float32 `json:"scores"`
} `json:"emotion"`
}
// parseAnalyzeJSON maps the engine's analyze document onto a VoiceAnalysis.
// start/end stay 0: the model emits a single whole-utterance result, not
// time-bounded segments.
func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
var a analyzeJSON
if err := json.Unmarshal([]byte(doc), &a); err != nil {
return nil, err
}
seg := &pb.VoiceAnalysis{
Age: a.Age,
DominantEmotion: a.Emotion.Label,
Emotion: a.Emotion.Scores,
}
if len(a.Gender) > 0 {
gender := make(map[string]float32, len(a.Gender))
for k, raw := range a.Gender {
if k == "label" {
_ = json.Unmarshal(raw, &seg.DominantGender)
continue
}
var score float32
if err := json.Unmarshal(raw, &score); err == nil {
gender[k] = score
}
}
seg.Gender = gender
}
return seg, nil
}
// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
func (v *VoiceDetect) lastErr(op, subject string) error {
msg := strings.TrimSpace(CppLastError(v.ctxPtr))
if msg == "" {
msg = "no error detail"
}
return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, msg)
}
// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
//
// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
// moves the buffer and we dereference it immediately to copy the bytes out.
func goStringFromCPtr(cptr uintptr) string {
if cptr == 0 {
return ""
}
p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
n := 0
for *(*byte)(unsafe.Add(p, n)) != 0 {
n++
}
return string(unsafe.Slice((*byte)(p), n))
}

View File

@@ -1,144 +0,0 @@
package main
import (
"os"
"sync"
"testing"
"github.com/ebitengine/purego"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
func TestVoiceDetect(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "voice-detect Backend Suite")
}
var (
libLoadOnce sync.Once
libLoadErr error
)
// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
// bridge without spinning up the gRPC server. Records the error (the smoke
// specs skip themselves) when libvoicedetect.so is not loadable from cwd
// (LD_LIBRARY_PATH or a symlink in ./).
func ensureLibLoaded() error {
libLoadOnce.Do(func() {
libName := os.Getenv("VOICEDETECT_LIBRARY")
if libName == "" {
libName = "libvoicedetect.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
libLoadErr = err
return
}
purego.RegisterLibFunc(&CppAbiVersion, lib, "voicedetect_capi_abi_version")
purego.RegisterLibFunc(&CppLoad, lib, "voicedetect_capi_load")
purego.RegisterLibFunc(&CppFree, lib, "voicedetect_capi_free")
purego.RegisterLibFunc(&CppLastError, lib, "voicedetect_capi_last_error")
purego.RegisterLibFunc(&CppFreeString, lib, "voicedetect_capi_free_string")
purego.RegisterLibFunc(&CppFreeVec, lib, "voicedetect_capi_free_vec")
purego.RegisterLibFunc(&CppEmbedPath, lib, "voicedetect_capi_embed_path")
purego.RegisterLibFunc(&CppEmbedPCM, lib, "voicedetect_capi_embed_pcm")
purego.RegisterLibFunc(&CppVerifyPaths, lib, "voicedetect_capi_verify_paths")
purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "voicedetect_capi_analyze_path_json")
})
return libLoadErr
}
var _ = Describe("parseOptions", func() {
It("defaults verify_threshold to 0.25", func() {
o := parseOptions(nil)
Expect(o.verifyThreshold).To(Equal(float32(0.25)))
Expect(o.modelName).To(Equal(""))
})
It("parses verify_threshold, threshold alias and model_name", func() {
o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
Expect(o.verifyThreshold).To(Equal(float32(0.4)))
Expect(o.modelName).To(Equal("ecapa"))
o2 := parseOptions([]string{"threshold:0.3"})
Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
})
It("ignores non-positive thresholds and keeps the default", func() {
o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
Expect(o.verifyThreshold).To(Equal(float32(0.25)))
})
})
var _ = Describe("parseAnalyzeJSON", func() {
It("maps age, gender label+scores and emotion label+scores", func() {
doc := `{"age":42.0,
"gender":{"label":"female","female":0.88,"male":0.12},
"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
seg, err := parseAnalyzeJSON(doc)
Expect(err).ToNot(HaveOccurred())
Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
Expect(seg.Start).To(Equal(float32(0)))
Expect(seg.End).To(Equal(float32(0)))
Expect(seg.DominantGender).To(Equal("female"))
Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
// The "label" entry is consumed into DominantGender, not the score map.
Expect(seg.Gender).ToNot(HaveKey("label"))
Expect(seg.DominantEmotion).To(Equal("neutral"))
Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
})
It("tolerates a missing gender block", func() {
seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
Expect(err).ToNot(HaveOccurred())
Expect(seg.DominantGender).To(Equal(""))
Expect(seg.DominantEmotion).To(Equal("happy"))
})
It("returns an error on malformed JSON", func() {
_, err := parseAnalyzeJSON(`{not-json`)
Expect(err).To(HaveOccurred())
})
})
// The specs below exercise the real C-API end to end. They run only when both a
// model GGUF and a test WAV are provided, and skip cleanly otherwise so the
// suite stays green without large assets.
var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
var (
v *VoiceDetect
modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
wavPath = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
)
BeforeAll(func() {
if modelPath == "" || wavPath == "" {
Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
}
if err := ensureLibLoaded(); err != nil {
Skip("libvoicedetect.so not loadable: " + err.Error())
}
v = &VoiceDetect{}
Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
})
It("embeds an audio clip", func() {
resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Embedding).ToNot(BeEmpty())
Expect(resp.Model).ToNot(BeEmpty())
})
It("verifies a clip against itself as the same speaker", func() {
resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Verified).To(BeTrue())
Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
})
})

View File

@@ -1,64 +0,0 @@
package main
// Started internally by LocalAI - one gRPC server per loaded model.
//
// Loads libvoicedetect.so via purego and registers the flat C-API entry points
// declared in voicedetect_capi.h. The library name can be overridden with
// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
// convention in the sibling backends); the default looks for the .so next to
// this binary (resolved via LD_LIBRARY_PATH by run.sh).
import (
"flag"
"fmt"
"os"
"github.com/ebitengine/purego"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
type LibFuncs struct {
FuncPtr any
Name string
}
func main() {
libName := os.Getenv("VOICEDETECT_LIBRARY")
if libName == "" {
libName = "libvoicedetect.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
panic(fmt.Errorf("voice-detect: dlopen %q: %w", libName, err))
}
// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
// uintptr so the raw pointer can be freed via the matching capi free fn.
libFuncs := []LibFuncs{
{&CppAbiVersion, "voicedetect_capi_abi_version"},
{&CppLoad, "voicedetect_capi_load"},
{&CppFree, "voicedetect_capi_free"},
{&CppLastError, "voicedetect_capi_last_error"},
{&CppFreeString, "voicedetect_capi_free_string"},
{&CppFreeVec, "voicedetect_capi_free_vec"},
{&CppEmbedPath, "voicedetect_capi_embed_path"},
{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
}
for _, lf := range libFuncs {
purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
}
fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
flag.Parse()
if err := grpc.StartServer(*addr, &VoiceDetect{}); err != nil {
panic(err)
}
}

View File

@@ -1,46 +0,0 @@
package main
import (
"strconv"
"strings"
)
// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
// not set one. Matches the Python speaker-recognition backend's default so the
// two implementations agree on verdicts out of the box.
const defaultVerifyThreshold float32 = 0.25
// loadOptions holds the parsed model-level options for voice-detect.
type loadOptions struct {
verifyThreshold float32
modelName string
}
func splitOption(o string) (key, value string, ok bool) {
i := strings.Index(o, ":")
if i < 0 {
return "", "", false
}
return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
}
// parseOptions reads the backend "key:value" option slice. Unknown keys are
// ignored. Defaults: verify_threshold 0.25, model_name derived from the file.
func parseOptions(opts []string) loadOptions {
o := loadOptions{verifyThreshold: defaultVerifyThreshold}
for _, oo := range opts {
key, value, ok := splitOption(oo)
if !ok {
continue
}
switch key {
case "verify_threshold", "threshold":
if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
o.verifyThreshold = float32(f)
}
case "model_name":
o.modelName = value
}
}
return o
}

View File

@@ -1,68 +0,0 @@
#!/bin/bash
#
# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs
# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh;
# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
# is used instead of the host's.
set -e
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."
mkdir -p "$CURDIR/package/lib"
cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
# LD_LIBRARY_PATH, which run.sh points at lib/.
cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
exit 1
}
# Detect architecture and copy the core runtime libs libvoicedetect.so links
# against, plus the matching dynamic loader as lib/ld.so.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
echo "Detected x86_64 architecture, copying x86_64 libraries..."
cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
echo "Detected ARM64 architecture, copying ARM64 libraries..."
cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
elif [ "$(uname -s)" = "Darwin" ]; then
echo "Detected Darwin"
else
echo "Error: Could not detect architecture"
exit 1
fi
# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
# BUILD_TYPE so the backend can reach the GPU without the runtime base image
# shipping those drivers.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
package_gpu_libs
fi
echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"

View File

@@ -1,16 +0,0 @@
#!/bin/bash
set -e
CURDIR=$(dirname "$(realpath "$0")")
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
# If a self-contained ld.so was packaged, route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the whisper /
# parakeet backends' runtime layout).
if [ -f "$CURDIR/lib/ld.so" ]; then
echo "Using lib/ld.so"
exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
fi
exec "$CURDIR/voice-detect-grpc" "$@"

View File

@@ -1,14 +0,0 @@
#!/bin/bash
set -e
CURDIR=$(dirname "$(realpath "$0")")
cd "$CURDIR"
echo "Running voice-detect backend tests..."
# The pure-Go parsing specs always run. The embed/verify/analyze smoke specs run
# only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
# VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip.
LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
echo "voice-detect tests completed."

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
WHISPER_CPP_VERSION?=6fc7c33b4c3a2cec83e4b65abd5e96a890480375
WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
SO_TARGET?=libgowhisper.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

View File

@@ -13,14 +13,8 @@ if [ "$(uname)" != "Darwin" ]; then
fi
if [ "$(uname)" = "Darwin" ]; then
# macOS: single fallback variant (Metal/Accelerate). The cmake build emits a
# Mach-O named .so, but tolerate .dylib too — pick whichever exists so the Go
# loader doesn't panic on a hardcoded name that isn't on disk.
if [ -e "$CURDIR/libgowhisper-fallback.dylib" ]; then
LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
else
LIBRARY="$CURDIR/libgowhisper-fallback.so"
fi
# macOS: single dylib variant (Metal or Accelerate)
LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
export DYLD_LIBRARY_PATH="$CURDIR"/lib:$DYLD_LIBRARY_PATH
else
LIBRARY="$CURDIR/libgowhisper-fallback.so"

View File

@@ -209,78 +209,6 @@
nvidia-cuda-12: "cuda12-ced"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
- &voicedetect
name: "voice-detect"
alias: "voice-detect"
license: mit
icon: https://avatars.githubusercontent.com/u/95302084
description: |
voice-detect speaker recognition and voice analysis.
voice-detect.cpp is a C++/ggml engine that produces L2-normalised
speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker
ERes2Net, CAM++) for voice verification and 1:N identification, plus
a wav2vec2 age / gender / emotion analysis head. It replaces the
Python speaker-recognition backend and is exposed through the Voice*
gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA
CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
urls:
- https://github.com/mudler/voice-detect.cpp
tags:
- voice-recognition
- speaker-verification
- speaker-embedding
- CPU
- GPU
- CUDA
- HIP
capabilities:
default: "cpu-voice-detect"
nvidia: "cuda12-voice-detect"
intel: "intel-sycl-f16-voice-detect"
metal: "metal-voice-detect"
amd: "rocm-voice-detect"
vulkan: "vulkan-voice-detect"
nvidia-l4t: "nvidia-l4t-arm64-voice-detect"
nvidia-cuda-13: "cuda13-voice-detect"
nvidia-cuda-12: "cuda12-voice-detect"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect"
- &facedetect
name: "face-detect"
alias: "face-detect"
license: mit
icon: https://avatars.githubusercontent.com/u/95302084
description: |
face-detect face detection, embedding, verification and analysis.
face-detect.cpp is a C++/ggml engine that runs SCRFD / YuNet face
detection and ArcFace / SFace 512-d (or 128-d) L2-normalised face
embeddings for verification and 1:N identification, plus a landmark /
age / gender analysis head. It replaces the Python insightface backend
and is exposed through the Embedding, Detect and Face* gRPC rpcs and
the /v1/face/* REST endpoints. It runs on CPU, NVIDIA CUDA, AMD
ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
urls:
- https://github.com/mudler/face-detect.cpp
tags:
- face-recognition
- face-verification
- face-embedding
- CPU
- GPU
- CUDA
- HIP
capabilities:
default: "cpu-face-detect"
nvidia: "cuda12-face-detect"
intel: "intel-sycl-f16-face-detect"
metal: "metal-face-detect"
amd: "rocm-face-detect"
vulkan: "vulkan-face-detect"
nvidia-l4t: "nvidia-l4t-arm64-face-detect"
nvidia-cuda-13: "cuda13-face-detect"
nvidia-cuda-12: "cuda12-face-detect"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect"
- &voxtral
name: "voxtral"
alias: "voxtral"
@@ -1428,6 +1356,7 @@
intel: "intel-fish-speech"
amd: "rocm-fish-speech"
nvidia-l4t: "nvidia-l4t-fish-speech"
metal: "metal-fish-speech"
default: "cpu-fish-speech"
nvidia-cuda-13: "cuda13-fish-speech"
nvidia-cuda-12: "cuda12-fish-speech"
@@ -2899,236 +2828,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-ced
## voice-detect
- !!merge <<: *voicedetect
name: "voice-detect-development"
capabilities:
default: "cpu-voice-detect-development"
nvidia: "cuda12-voice-detect-development"
intel: "intel-sycl-f16-voice-detect-development"
metal: "metal-voice-detect-development"
amd: "rocm-voice-detect-development"
vulkan: "vulkan-voice-detect-development"
nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development"
nvidia-cuda-13: "cuda13-voice-detect-development"
nvidia-cuda-12: "cuda12-voice-detect-development"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development"
- !!merge <<: *voicedetect
name: "nvidia-l4t-arm64-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect
- !!merge <<: *voicedetect
name: "nvidia-l4t-arm64-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect"
mirrors:
- localai/localai-backends:master-nvidia-l4t-arm64-voice-detect
- !!merge <<: *voicedetect
name: "cuda13-nvidia-l4t-arm64-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect
- !!merge <<: *voicedetect
name: "cuda13-nvidia-l4t-arm64-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect
- !!merge <<: *voicedetect
name: "cpu-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect"
mirrors:
- localai/localai-backends:latest-cpu-voice-detect
- !!merge <<: *voicedetect
name: "cpu-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect"
mirrors:
- localai/localai-backends:master-cpu-voice-detect
- !!merge <<: *voicedetect
name: "metal-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-voice-detect
- !!merge <<: *voicedetect
name: "metal-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-voice-detect
- !!merge <<: *voicedetect
name: "cuda12-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect
- !!merge <<: *voicedetect
name: "cuda12-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect
- !!merge <<: *voicedetect
name: "rocm-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect
- !!merge <<: *voicedetect
name: "rocm-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-voice-detect
- !!merge <<: *voicedetect
name: "intel-sycl-f32-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect
- !!merge <<: *voicedetect
name: "intel-sycl-f32-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect
- !!merge <<: *voicedetect
name: "intel-sycl-f16-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect
- !!merge <<: *voicedetect
name: "intel-sycl-f16-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect
- !!merge <<: *voicedetect
name: "vulkan-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-voice-detect
- !!merge <<: *voicedetect
name: "vulkan-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-vulkan-voice-detect
- !!merge <<: *voicedetect
name: "cuda13-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect
- !!merge <<: *voicedetect
name: "cuda13-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect
## face-detect
- !!merge <<: *facedetect
name: "face-detect-development"
capabilities:
default: "cpu-face-detect-development"
nvidia: "cuda12-face-detect-development"
intel: "intel-sycl-f16-face-detect-development"
metal: "metal-face-detect-development"
amd: "rocm-face-detect-development"
vulkan: "vulkan-face-detect-development"
nvidia-l4t: "nvidia-l4t-arm64-face-detect-development"
nvidia-cuda-13: "cuda13-face-detect-development"
nvidia-cuda-12: "cuda12-face-detect-development"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect-development"
- !!merge <<: *facedetect
name: "nvidia-l4t-arm64-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-face-detect"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-face-detect
- !!merge <<: *facedetect
name: "nvidia-l4t-arm64-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-face-detect"
mirrors:
- localai/localai-backends:master-nvidia-l4t-arm64-face-detect
- !!merge <<: *facedetect
name: "cuda13-nvidia-l4t-arm64-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect
- !!merge <<: *facedetect
name: "cuda13-nvidia-l4t-arm64-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect
- !!merge <<: *facedetect
name: "cpu-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-face-detect"
mirrors:
- localai/localai-backends:latest-cpu-face-detect
- !!merge <<: *facedetect
name: "cpu-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-face-detect"
mirrors:
- localai/localai-backends:master-cpu-face-detect
- !!merge <<: *facedetect
name: "metal-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-face-detect"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-face-detect
- !!merge <<: *facedetect
name: "metal-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-face-detect"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-face-detect
- !!merge <<: *facedetect
name: "cuda12-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-face-detect
- !!merge <<: *facedetect
name: "cuda12-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-face-detect"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-face-detect
- !!merge <<: *facedetect
name: "rocm-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-face-detect
- !!merge <<: *facedetect
name: "rocm-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-face-detect"
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-face-detect
- !!merge <<: *facedetect
name: "intel-sycl-f32-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-face-detect
- !!merge <<: *facedetect
name: "intel-sycl-f32-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-face-detect"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f32-face-detect
- !!merge <<: *facedetect
name: "intel-sycl-f16-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-face-detect
- !!merge <<: *facedetect
name: "intel-sycl-f16-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-face-detect"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f16-face-detect
- !!merge <<: *facedetect
name: "vulkan-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-face-detect
- !!merge <<: *facedetect
name: "vulkan-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-face-detect"
mirrors:
- localai/localai-backends:master-gpu-vulkan-face-detect
- !!merge <<: *facedetect
name: "cuda13-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-face-detect
- !!merge <<: *facedetect
name: "cuda13-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-face-detect"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-face-detect
## stablediffusion-ggml
- !!merge <<: *stablediffusionggml
name: "cpu-stablediffusion-ggml"
@@ -5171,6 +4870,7 @@
intel: "intel-fish-speech-development"
amd: "rocm-fish-speech-development"
nvidia-l4t: "nvidia-l4t-fish-speech-development"
metal: "metal-fish-speech-development"
default: "cpu-fish-speech-development"
nvidia-cuda-13: "cuda13-fish-speech-development"
nvidia-cuda-12: "cuda12-fish-speech-development"
@@ -5246,6 +4946,16 @@
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech
- !!merge <<: *fish-speech
name: "metal-fish-speech"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-fish-speech"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-fish-speech
- !!merge <<: *fish-speech
name: "metal-fish-speech-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-fish-speech"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-fish-speech
## faster-qwen3-tts
- !!merge <<: *faster-qwen3-tts
name: "faster-qwen3-tts-development"

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm7.0
torch==2.10.0+rocm7.0
torch==2.12.0+cpu
torchaudio
torchvision

View File

@@ -11,8 +11,6 @@ import os
import grpc
from parent_watch import start_parent_death_watcher
class _AbortHandler(grpc.RpcMethodHandler):
"""A method handler that immediately aborts with UNAUTHENTICATED."""
@@ -72,13 +70,6 @@ def get_auth_interceptors(*, aio: bool = False):
Returns an empty list when LOCALAI_GRPC_AUTH_TOKEN is not set.
"""
# Arm the best-effort parent-death backstop here: this is the single helper
# every LocalAI Python backend invokes exactly once while building its gRPC
# server (mirroring how the Go watcher arms in pkg/grpc's shared serve path).
# start_parent_death_watcher() is idempotent and a no-op when disabled or on
# unsupported platforms — see parent_watch.py.
start_parent_death_watcher()
token = os.environ.get("LOCALAI_GRPC_AUTH_TOKEN", "")
if not token:
return []

View File

@@ -20,15 +20,7 @@ def split_reasoning(text, think_start, think_end):
Returns ``(reasoning_content, remaining_text)``. When ``think_start`` is
empty or not found, returns ``("", text)`` unchanged.
"""
if not think_start or not text:
return "", text
if think_start not in text:
# Models like Qwen3.5 open assistant turns already INSIDE thinking, so
# the generated text carries only the closing tag. Everything before it
# is reasoning that would otherwise leak into the content.
if think_end and think_end in text:
head, _, tail = text.partition(think_end)
return head.strip(), tail.strip()
if not think_start or not text or think_start not in text:
return "", text
pattern = re.compile(
re.escape(think_start) + r"(.*?)" + re.escape(think_end or ""),

View File

@@ -1,75 +0,0 @@
"""Unit tests for the mlx/mlx-vlm shared helpers (mlx_utils.py).
Run standalone (Python standard library only, no backend venv needed):
python3 -m unittest mlx_utils_test
These mirror the server-less helper tests in backend/python/mlx/test.py
(TestSharedHelpers), but live here so they run on any platform: the mlx
test module imports grpc/backend_pb2 at import time and needs the MLX venv,
whereas mlx_utils only needs the standard library.
"""
import types
import unittest
from mlx_utils import parse_tool_calls, split_reasoning
class TestSplitReasoning(unittest.TestCase):
def test_both_tags(self):
r, c = split_reasoning(
"<think>step 1\nstep 2</think>The answer is 42.", "<think>", "</think>"
)
self.assertEqual(r, "step 1\nstep 2")
self.assertEqual(c, "The answer is 42.")
def test_implicit_opener_only_closing_tag(self):
# Qwen3.5 opens the assistant turn already inside thinking, so the
# output carries only the closing tag; everything before it is reasoning.
r, c = split_reasoning(
"The user is asking about the weather.\n</think>\n\nThe weather in Rome is sunny.",
"<think>",
"</think>",
)
self.assertEqual(r, "The user is asking about the weather.")
self.assertEqual(c, "The weather in Rome is sunny.")
def test_no_tags_at_all(self):
r, c = split_reasoning("just text", "<think>", "</think>")
self.assertEqual(r, "")
self.assertEqual(c, "just text")
def test_empty_think_end_and_no_opener_match(self):
# No think_end to anchor on, and the opener is absent → return unchanged.
r, c = split_reasoning("no opener here", "<think>", "")
self.assertEqual(r, "")
self.assertEqual(c, "no opener here")
def test_empty_text(self):
r, c = split_reasoning("", "<think>", "</think>")
self.assertEqual(r, "")
self.assertEqual(c, "")
class TestParseToolCalls(unittest.TestCase):
def test_with_shim(self):
tm = types.SimpleNamespace(
tool_call_start="<tool_call>",
tool_call_end="</tool_call>",
parse_tool_call=lambda body, tools: {
"name": "get_weather",
"arguments": {"location": body.strip()},
},
)
calls, remaining = parse_tool_calls(
"Sure: <tool_call>Paris</tool_call>", tm, tools=None
)
self.assertEqual(len(calls), 1)
self.assertEqual(calls[0]["name"], "get_weather")
self.assertEqual(calls[0]["arguments"], '{"location": "Paris"}')
self.assertEqual(calls[0]["index"], 0)
self.assertNotIn("<tool_call>", remaining)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,149 +0,0 @@
"""Parent-death watcher (best-effort backstop) for LocalAI Python backends.
LocalAI spawns each backend as a child process and, on a clean shutdown, tears
it down itself (SIGTERM -> grace -> SIGKILL). That graceful path only runs when
LocalAI receives a catchable signal and lives long enough to run its handlers.
If LocalAI is SIGKILLed (e.g. a supervising process's grace period elapses
first), that teardown never runs and this backend would be reparented to init
and linger, holding GPU/VRAM and its listen port.
The watcher here is a best-effort backstop for exactly that case: it does NOT
replace the graceful teardown, it only covers the "parent vanished without
cleaning up" path. It detects reparenting: when the process that spawned this
backend dies, the kernel reparents us to the nearest sub-reaper or to init
(PID 1), so os.getppid() stops matching the value captured at startup. This
getppid() approach is portable across Linux/macOS (unlike the Linux-only
PR_SET_PDEATHSIG), which is why it is used here, mirroring the Go backends'
pkg/grpc/parentwatch.go and the C++ backends' parent_watch.h. It is disabled on
Windows, which has no equivalent orphan-reparenting semantics.
Env vars (shared verbatim across the Go, C++ and Python backends):
LOCALAI_BACKEND_PARENT_WATCH enabled by default; a falsey value
("false"/"0"/"no"/"off", case-insensitive)
disables it.
LOCALAI_BACKEND_PARENT_WATCH_INTERVAL poll interval as a Go-style duration
string ("500ms", "2s", "1m") or a bare
number of seconds. Defaults to 2s.
"""
import os
import sys
import threading
ENV_PARENT_WATCH = "LOCALAI_BACKEND_PARENT_WATCH"
ENV_PARENT_WATCH_INTERVAL = "LOCALAI_BACKEND_PARENT_WATCH_INTERVAL"
_DEFAULT_INTERVAL_SECONDS = 2.0
# Guard so repeated calls (e.g. get_auth_interceptors invoked more than once)
# only ever arm a single watcher thread per process.
_started = False
_started_lock = threading.Lock()
def _enabled():
"""Report whether the watcher should run in this process."""
# Windows does not reparent orphans to a well-known init PID, so the
# getppid() heuristic used here doesn't apply there.
if os.name == "nt" or sys.platform.startswith("win"):
return False
val = os.environ.get(ENV_PARENT_WATCH, "").strip().lower()
if val in ("false", "0", "no", "off"):
return False
return True
def _interval_seconds():
"""Return the configured poll interval in seconds, or the default.
Accepts Go-style duration strings ("500ms", "2s", "1m") for cross-language
parity, or a bare number interpreted as seconds.
"""
raw = os.environ.get(ENV_PARENT_WATCH_INTERVAL, "").strip()
if not raw:
return _DEFAULT_INTERVAL_SECONDS
# Split numeric prefix from unit suffix.
i = 0
while i < len(raw) and (raw[i].isdigit() or raw[i] == "." or (i == 0 and raw[i] in "+-")):
i += 1
if i == 0:
return _DEFAULT_INTERVAL_SECONDS
try:
num = float(raw[:i])
except ValueError:
return _DEFAULT_INTERVAL_SECONDS
unit = raw[i:].lower()
if unit == "ms":
seconds = num / 1000.0
elif unit in ("s", ""):
seconds = num
elif unit == "m":
seconds = num * 60.0
else:
return _DEFAULT_INTERVAL_SECONDS
return seconds if seconds > 0 else _DEFAULT_INTERVAL_SECONDS
def _parent_died(orig_ppid):
"""Report whether this process has been reparented away from orig_ppid.
Reparenting is the standard POSIX signal that the original parent (here, the
LocalAI process that spawned this backend) has exited: the orphan is handed
to the nearest sub-reaper or to init (PID 1), so os.getppid() no longer
matches the value captured at startup.
"""
ppid = os.getppid()
return ppid != orig_ppid or ppid == 1
def _watch(orig_ppid, interval, on_death):
"""Poll until _parent_died reports the original parent is gone, then call
on_death. Blocks, so run it on its own (daemon) thread."""
import time
while True:
time.sleep(interval)
if _parent_died(orig_ppid):
on_death()
return
def start_parent_death_watcher():
"""Install the best-effort safety net described in this module's docstring.
No-op when disabled, on Windows, when already orphaned at startup
(os.getppid() <= 1), or if already started. This is a backstop alongside —
never a replacement for — LocalAI's graceful teardown.
"""
global _started
if not _enabled():
return
with _started_lock:
if _started:
return
orig_ppid = os.getppid()
# A parent of 1 (or less) at startup means we were already orphaned (or
# launched directly under init) — there is no original parent to watch.
if orig_ppid <= 1:
return
interval = _interval_seconds()
def on_death():
print(
"backend parent process (pid {}) exited without stopping this "
"backend; self-terminating to avoid orphaning".format(orig_ppid),
file=sys.stderr,
flush=True,
)
# Immediate, non-cleanup exit: this is a shutdown safety net and the
# normal graceful path is already gone.
os._exit(1)
thread = threading.Thread(
target=_watch,
args=(orig_ppid, interval, on_death),
name="parent-death-watcher",
daemon=True,
)
thread.start()
_started = True

View File

@@ -1,150 +0,0 @@
"""Unit tests for the parent-death watcher (parent_watch.py).
Run standalone (Python standard library only, no backend venv needed):
python3 -m unittest parent_watch_test
The core test (test_detects_reparent) builds a genuine two-level process tree
(test -> middle -> grandchild) with os.fork, lets the middle process die, and
asserts the grandchild's parent_watch._watch detects the reparenting and
self-terminates — mirroring the Go test in pkg/grpc/parentwatch_test.go and the
C++ test in backend/cpp/llama-cpp/parent_watch_test.cpp.
"""
import os
import sys
import tempfile
import threading
import time
import unittest
import parent_watch
class TestParentWatchEnvParsing(unittest.TestCase):
def setUp(self):
self._saved = {
k: os.environ.get(k)
for k in (parent_watch.ENV_PARENT_WATCH, parent_watch.ENV_PARENT_WATCH_INTERVAL)
}
for k in self._saved:
os.environ.pop(k, None)
def tearDown(self):
for k, v in self._saved.items():
if v is None:
os.environ.pop(k, None)
else:
os.environ[k] = v
def test_interval_default(self):
self.assertEqual(parent_watch._interval_seconds(), 2.0)
def test_interval_units(self):
cases = {"500ms": 0.5, "2s": 2.0, "1m": 60.0, "3": 3.0, "0.5s": 0.5}
for raw, expected in cases.items():
os.environ[parent_watch.ENV_PARENT_WATCH_INTERVAL] = raw
self.assertAlmostEqual(parent_watch._interval_seconds(), expected, msg=raw)
def test_interval_garbage_falls_back(self):
os.environ[parent_watch.ENV_PARENT_WATCH_INTERVAL] = "garbage"
self.assertEqual(parent_watch._interval_seconds(), 2.0)
@unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only")
def test_enabled_default(self):
self.assertTrue(parent_watch._enabled())
@unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only")
def test_disabled_by_falsey(self):
for val in ("false", "0", "no", "off", "OFF", " False "):
os.environ[parent_watch.ENV_PARENT_WATCH] = val
self.assertFalse(parent_watch._enabled(), msg=val)
@unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only")
def test_enabled_by_truthy(self):
for val in ("true", "1", "yes", "on"):
os.environ[parent_watch.ENV_PARENT_WATCH] = val
self.assertTrue(parent_watch._enabled(), msg=val)
@unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "fork/reparent is POSIX only")
class TestParentWatchReparent(unittest.TestCase):
def _wait_for_file(self, path, timeout=10.0):
deadline = time.time() + timeout
while time.time() < deadline:
if os.path.exists(path):
return True
time.sleep(0.02)
return False
def test_detects_reparent(self):
tmpdir = tempfile.mkdtemp(prefix="parentwatch_test_")
ready_file = os.path.join(tmpdir, "ready")
exited_file = os.path.join(tmpdir, "exited")
middle = os.fork()
if middle == 0:
# ---- middle process ----
grandchild = os.fork()
if grandchild == 0:
# ---- grandchild process: arm the REAL watcher against middle ----
orig_ppid = os.getppid()
def on_death():
with open(exited_file, "w") as f:
f.write("1")
os._exit(7)
threading.Thread(
target=parent_watch._watch,
args=(orig_ppid, 0.05, on_death),
daemon=True,
).start()
# Safety valve: never linger if something goes wrong.
def bail():
time.sleep(30)
os._exit(2)
threading.Thread(target=bail, daemon=True).start()
# Signal readiness only after the watcher captured orig_ppid.
with open(ready_file, "w") as f:
f.write(str(os.getpid()))
while True:
time.sleep(1)
else:
# middle: wait until grandchild is ready, then exit to orphan it.
if not self._wait_for_file(ready_file):
os._exit(5)
os._exit(0)
# ---- test (top) process ----
os.waitpid(middle, 0) # reap middle only; grandchild is orphaned
self.assertTrue(os.path.exists(ready_file), "grandchild never signaled readiness")
self.assertTrue(
self._wait_for_file(exited_file),
"watcher did not detect parent death within timeout",
)
# Best-effort cleanup: kill the grandchild if it somehow survived.
try:
with open(ready_file) as f:
pid = int(f.read().strip())
if pid > 1:
os.kill(pid, 9)
except (OSError, ValueError):
pass
for p in (ready_file, exited_file):
try:
os.remove(p)
except OSError:
pass
try:
os.rmdir(tmpdir)
except OSError:
pass
if __name__ == "__main__":
unittest.main()

View File

@@ -58,18 +58,7 @@ def messages_to_dicts(proto_messages):
d["reasoning_content"] = msg.reasoning_content
if msg.tool_calls:
try:
tool_calls = json.loads(msg.tool_calls)
# Chat templates (e.g. Qwen) iterate function.arguments as a
# mapping, but the OpenAI wire format carries it as a JSON
# string — decode it back so the template's .items() works.
for tc in tool_calls:
fn = tc.get("function") if isinstance(tc, dict) else None
if isinstance(fn, dict) and isinstance(fn.get("arguments"), str):
try:
fn["arguments"] = json.loads(fn["arguments"])
except json.JSONDecodeError:
pass
d["tool_calls"] = tool_calls
d["tool_calls"] = json.loads(msg.tool_calls)
except json.JSONDecodeError:
pass
result.append(d)

View File

@@ -1,122 +0,0 @@
"""Unit tests for the shared python backend helpers (python_utils.py).
Run standalone (Python standard library only, no backend venv needed):
python3 -m unittest python_utils_test
These mirror the server-less helper tests in backend/python/mlx/test.py
(TestSharedHelpers), but live here so they run on any platform: the mlx
test module imports grpc/backend_pb2 at import time and needs the MLX venv,
whereas python_utils has no third-party dependency. Proto Message objects
are faked with types.SimpleNamespace (real proto fields default to "").
"""
import json
import types
import unittest
from python_utils import messages_to_dicts, parse_options
def _msg(**fields):
"""Fake a proto Message: every unset field is the empty string, as protobuf."""
defaults = {
"role": "",
"content": "",
"name": "",
"tool_call_id": "",
"reasoning_content": "",
"tool_calls": "",
}
defaults.update(fields)
return types.SimpleNamespace(**defaults)
class TestParseOptions(unittest.TestCase):
def test_type_inference(self):
opts = parse_options(
["temperature:0.7", "max_tokens:128", "trust:true", "name:hello", "no_colon_skipped"]
)
self.assertEqual(opts["temperature"], 0.7)
self.assertEqual(opts["max_tokens"], 128)
self.assertIs(opts["trust"], True)
self.assertEqual(opts["name"], "hello")
self.assertNotIn("no_colon_skipped", opts)
class TestMessagesToDicts(unittest.TestCase):
def test_basic_fields(self):
out = messages_to_dicts(
[
_msg(role="user", content="hi"),
_msg(role="tool", content="42", tool_call_id="call_1", name="f"),
]
)
self.assertEqual(out[0], {"role": "user", "content": "hi"})
self.assertEqual(out[1]["tool_call_id"], "call_1")
self.assertEqual(out[1]["name"], "f")
def test_tool_call_arguments_string_decoded_to_mapping(self):
# OpenAI wire format ships function.arguments as a JSON *string*; chat
# templates iterate it as a mapping, so it must come back as a dict.
out = messages_to_dicts(
[
_msg(
role="assistant",
tool_calls=json.dumps(
[
{
"id": "call_1",
"type": "function",
"function": {
"name": "get_weather",
"arguments": '{"location": "Rome"}',
},
}
]
),
)
]
)
args = out[0]["tool_calls"][0]["function"]["arguments"]
self.assertEqual(args, {"location": "Rome"})
self.assertEqual(dict(args.items()), {"location": "Rome"})
def test_tool_call_arguments_already_mapping_is_idempotent(self):
out = messages_to_dicts(
[
_msg(
role="assistant",
tool_calls=json.dumps(
[{"function": {"name": "f", "arguments": {"a": 1}}}]
),
)
]
)
self.assertEqual(out[0]["tool_calls"][0]["function"]["arguments"], {"a": 1})
def test_tool_call_arguments_invalid_json_left_as_string(self):
out = messages_to_dicts(
[
_msg(
role="assistant",
tool_calls=json.dumps(
[{"function": {"name": "f", "arguments": "not-json"}}]
),
)
]
)
self.assertEqual(out[0]["tool_calls"][0]["function"]["arguments"], "not-json")
def test_tool_call_without_function_key(self):
out = messages_to_dicts(
[_msg(role="assistant", tool_calls=json.dumps([{"id": "call_1"}]))]
)
self.assertEqual(out[0]["tool_calls"], [{"id": "call_1"}])
def test_tool_calls_invalid_json_dropped(self):
out = messages_to_dicts([_msg(role="assistant", tool_calls="{not json")])
self.assertNotIn("tool_calls", out[0])
if __name__ == "__main__":
unittest.main()

View File

@@ -13,17 +13,6 @@ fi
# fish-speech uses pyrootutils which requires a .project-root marker
touch "${backend_dir}/.project-root"
# On darwin arm64 the transitive `tokenizers` dep compiles its Rust extension
# from source (Linux uses prebuilt manylinux wheels, so it never compiles
# there). The pinned tokenizers crate that fish-speech's stack resolves to
# contains a `&T` -> `&mut T` cast that trips the now-deny-by-default
# `invalid_reference_casting` lint in the macOS runner's newer Rust toolchain,
# breaking the build (seen in the v4.5.5 release CI fish-speech darwin/metal
# job). Allow that lint so the unchanged third-party crate compiles as before.
# Append rather than clobber any pre-existing RUSTFLAGS; harmless on Linux
# where no Rust compile happens.
export RUSTFLAGS="${RUSTFLAGS:-} -A invalid_reference_casting"
installRequirements
# Clone fish-speech source (the pip package doesn't include inference modules)

View File

@@ -0,0 +1,2 @@
torch
torchaudio

View File

@@ -3,5 +3,4 @@ protobuf
certifi
packaging==24.1
pip
chardet
click
chardet

View File

@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.10.0
torch==2.12.0+cpu
transformers>=4.56.2
huggingface-hub>=1.3.0
sentencepiece

View File

@@ -1,4 +1,4 @@
torch==2.10.0
torch==2.12.0+cpu
transformers>=4.56.2
huggingface-hub>=1.3.0
sentencepiece

View File

@@ -147,25 +147,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
d["reasoning_content"] = msg.reasoning_content
if msg.tool_calls:
try:
tool_calls = json.loads(msg.tool_calls)
d["tool_calls"] = json.loads(msg.tool_calls)
except json.JSONDecodeError:
pass
else:
# OpenAI wire format carries function.arguments as a
# JSON-encoded string, but chat templates (e.g. Qwen3)
# iterate over it as a mapping. The vllm backend
# already parses arguments before applying the chat
# template (PR #10256); mirror that here so the
# sglang backend works with the same wire format.
if isinstance(tool_calls, list):
for tc in tool_calls:
func = tc.get("function") if isinstance(tc, dict) else None
if isinstance(func, dict) and isinstance(func.get("arguments"), str):
try:
func["arguments"] = json.loads(func["arguments"])
except json.JSONDecodeError:
pass
d["tool_calls"] = tool_calls
result.append(d)
return result

View File

@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
accelerate
torch==2.9.0
torch==2.12.0+cpu
torchvision
torchaudio
transformers

View File

@@ -6,7 +6,7 @@
# for cublas12 so uv consults this index alongside PyPI.
--extra-index-url https://download.pytorch.org/whl/cu128
accelerate
torch==2.9.1
torch==2.12.0+cpu
torchvision
torchaudio
transformers

View File

@@ -1,4 +1,4 @@
accelerate
torch==2.7.0
torch==2.12.0+cu130
transformers
bitsandbytes

View File

@@ -748,12 +748,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# When (A) native streaming ran cleanly, per-delta yields above already
# delivered everything — do NOT extract again on the full text or we'd
# duplicate content/tool_calls into the final chunk.
# NOTE: `native_streaming` is a capability flag ("streaming parser is
# available"), not a state flag ("streaming actually ran"). For
# non-streaming requests it is still True but the per-delta loop was
# never entered, so we MUST still run extract_tool_calls here. Hence
# the explicit `streaming and …` guard on both branches.
if has_tool_parser and not (streaming and native_streaming and not native_streaming_error):
if has_tool_parser and not (native_streaming and not native_streaming_error):
try:
tp = tp_instance
if tp is None:
@@ -775,7 +770,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
))
except Exception as e:
print(f"Tool parser error: {e}", file=sys.stderr)
elif streaming and native_streaming and not native_streaming_error:
elif native_streaming and not native_streaming_error:
# Per-delta path already emitted content + tool_calls; the final
# chat_delta should carry only metadata (token counts, logprobs).
content = ""

View File

@@ -35,21 +35,6 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi
# AMD ROCm: vLLM ships prebuilt ROCm wheels, but on a DEDICATED index
# (https://wheels.vllm.ai/rocm/), NOT PyPI, and ONLY for CPython 3.12. On any
# other Python the installer silently falls back to the CUDA-only PyPI wheel,
# which is unusable on an AMD GPU (import fails, so the backend never finds the
# vllm module). Force Python 3.12 before the venv is created (matches the
# intel/l4t13 cp312 bump); the hipblas branch below pulls vllm from the ROCm
# wheel index. unsafe-best-match lets uv consult that index and PyPI together.
# https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html?device=rocm
if [ "x${BUILD_TYPE}" == "xhipblas" ]; then
PYTHON_VERSION="3.12"
PYTHON_PATCH="12"
PY_STANDALONE_TAG="20251120"
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi
# cublas13 pulls the vLLM wheel from a per-tag cu130 index (PyPI's vllm wheel
# is built against CUDA 12 and won't load on cu130). uv's default per-package
# first-match strategy would still pick the PyPI wheel, so allow it to consult
@@ -119,7 +104,7 @@ if [ "$(uname -s)" = "Darwin" ]; then
# can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
# vllm pin (requirements-cublas13-after.txt, bumped independently against
# vllm/vllm) until vllm-metal supports a newer vLLM.
VLLM_METAL_VERSION="v0.3.0.dev20260701212152"
VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
# The coupled vLLM source version is whatever this vllm-metal release builds
# against -- it declares it in its own installer as `vllm_v=`. Derive it from
@@ -209,22 +194,6 @@ elif [ "x${BUILD_TYPE}" == "xintel" ]; then
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH:-}"
VLLM_TARGET_DEVICE=xpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
popd
# AMD ROCm: install vllm from its dedicated ROCm wheel index instead of the
# CUDA-only PyPI wheel. installRequirements brings the base ROCm
# torch/transformers (requirements-hipblas.txt), then we pull vllm (plus the
# matching ROCm torch, via --upgrade) from wheels.vllm.ai/rocm. This is the
# method upstream prescribes for AMD; the Python-3.12 pin is set above.
# There is intentionally no requirements-hipblas-after.txt: a bare `vllm`
# there would resolve to the CUDA wheel, and installRequirements never loads
# a ${BUILD_TYPE}-after file for hipblas anyway (BUILD_TYPE == BUILD_PROFILE).
# https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html?device=rocm
elif [ "x${BUILD_TYPE}" == "xhipblas" ]; then
installRequirements
# --upgrade reconciles the base ROCm torch to whatever the vllm ROCm wheel
# pins; --extra-index-url adds the ROCm wheel repository on top of PyPI.
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} \
--extra-index-url https://wheels.vllm.ai/rocm/ --upgrade vllm
# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
# requirements-cpu-after.txt and compiles vllm locally against the host's
# actual CPU. Not used by default because it takes ~30-40 minutes, but

View File

@@ -3,8 +3,8 @@
# on a cu130 host. Pull the cu130-flavoured wheel from vLLM's per-tag index
# instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
# so uv consults this index alongside PyPI.
--extra-index-url https://wheels.vllm.ai/0.24.0/cu130
--extra-index-url https://wheels.vllm.ai/0.23.0/cu130
# VERSION COUPLING: darwin/Apple-Silicon builds use vllm-metal (see install.sh),
# which pins this exact vLLM version. Bumping vllm here means coordinating with a
# vllm-metal release that supports the new version, or macOS/Metal builds break.
vllm==0.24.0
vllm==0.23.0

View File

@@ -0,0 +1 @@
vllm

View File

@@ -351,16 +351,6 @@ impl Backend for KokorosService {
Err(Status::unimplemented("Not supported"))
}
type AudioTranscriptionLiveStream =
ReceiverStream<Result<backend::TranscriptLiveResponse, Status>>;
async fn audio_transcription_live(
&self,
_: Request<tonic::Streaming<backend::TranscriptLiveRequest>>,
) -> Result<Response<Self::AudioTranscriptionLiveStream>, Status> {
Err(Status::unimplemented("Not supported"))
}
async fn diarize(
&self,
_: Request<backend::DiarizeRequest>,

View File

@@ -207,20 +207,12 @@ func (l *Launcher) StartLocalAI() error {
}
// Build command arguments
dataPath := l.GetDataPath()
args := []string{
"run",
"--models-path", l.config.ModelsPath,
"--backends-path", l.config.BackendsPath,
"--address", l.config.Address,
"--log-level", l.config.LogLevel,
// Keep persistent data and dynamic config under the launcher's data
// directory (~/.localai) rather than letting the server resolve them
// to ${basepath}/{data,configuration}. ${basepath} expands to the
// launcher process's CWD (often the user's home root), which puts
// ~/data and ~/configuration outside ~/.localai. See #10610.
"--data-path", filepath.Join(dataPath, "data"),
"--localai-config-dir", filepath.Join(dataPath, "configuration"),
}
l.localaiCmd = exec.CommandContext(l.ctx, binaryPath, args...)
@@ -437,7 +429,7 @@ func (l *Launcher) CheckForUpdates() (bool, string, error) {
}
// DownloadUpdate downloads the latest version
func (l *Launcher) DownloadUpdate(version string, progressCallback func(downloaded, total int64)) error {
func (l *Launcher) DownloadUpdate(version string, progressCallback func(float64)) error {
return l.releaseManager.DownloadRelease(version, progressCallback)
}
@@ -494,6 +486,7 @@ func (l *Launcher) showDownloadLocalAIDialog() {
fyne.DoAndWait(func() {
// Create a standalone window for the download dialog
dialogWindow := l.app.NewWindow("LocalAI Installation Required")
dialogWindow.Resize(fyne.NewSize(500, 350))
dialogWindow.CenterOnScreen()
dialogWindow.SetCloseIntercept(func() {
dialogWindow.Close()
@@ -555,7 +548,6 @@ func (l *Launcher) showDownloadLocalAIDialog() {
)
dialogWindow.SetContent(content)
resizeToContent(dialogWindow, content)
dialogWindow.Show()
})
}
@@ -629,134 +621,88 @@ func (l *Launcher) showDownloadError(title, message string) {
}
// showDownloadProgress shows a standalone progress window for downloading LocalAI
// after a fresh install (no LocalAI binary present yet).
func (l *Launcher) showDownloadProgress(version, title string) {
l.showDownloadProgressWindow(version, title, func(win fyne.Window) {
dialog.ShowConfirm("Installation Complete",
"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
func(bool) {
win.Close()
l.updateStatus("LocalAI installed successfully")
if l.systray != nil {
l.systray.recreateMenu()
}
}, win)
})
}
// showDownloadProgressWindow renders the download progress popup shared by every
// "download/upgrade LocalAI" entry point. It owns the progress bar, the
// human-readable byte readout, resume-aware retry, and content-fit window
// sizing so the behaviour stays identical everywhere. onSuccess runs (on the UI
// goroutine) once the download verifies, and is responsible for the success
// dialog and any follow-up; the window is passed in so it can be parented/closed.
func (l *Launcher) showDownloadProgressWindow(version, title string, onSuccess func(win fyne.Window)) {
fyne.DoAndWait(func() {
// Create progress window
progressWindow := l.app.NewWindow("Downloading LocalAI")
progressWindow.Resize(fyne.NewSize(400, 250))
progressWindow.CenterOnScreen()
progressWindow.SetCloseIntercept(func() {
progressWindow.Close()
})
// Progress bar
progressBar := widget.NewProgressBar()
progressBar.SetValue(0)
// Status label. Truncate with an ellipsis so a long "Download failed:
// <url>" message can't stretch the window (and progress bar) to fit the
// whole error on one line.
// whole error on one line; the full error is shown in the dialog below.
statusLabel := widget.NewLabel("Preparing download...")
statusLabel.Truncation = fyne.TextTruncateEllipsis
// Release notes button
releaseNotesButton := widget.NewButton("View Release Notes", func() {
releaseNotesURL, err := l.githubReleaseNotesURL(version)
if err != nil {
log.Printf("Failed to parse URL: %v", err)
return
}
l.app.OpenURL(releaseNotesURL)
})
// Retry button: hidden until a download fails. GitHub downloads are
// flaky, and the underlying download resumes from the partial file, so
// a retry continues where it left off rather than starting over.
retryButton := widget.NewButton("Retry", nil)
retryButton.Importance = widget.HighImportance
retryButton.Hide()
buttonRow := container.NewHBox(releaseNotesButton, retryButton)
content := container.NewVBox(
// Progress container
progressContainer := container.NewVBox(
widget.NewLabel(title),
progressBar,
statusLabel,
widget.NewSeparator(),
buttonRow,
releaseNotesButton,
)
progressWindow.SetContent(content)
resizeToContent(progressWindow, content)
var startDownload func()
startDownload = func() {
retryButton.Hide()
progressBar.SetValue(0)
statusLabel.SetText("Preparing download...")
resizeToContent(progressWindow, content)
go func() {
err := l.DownloadUpdate(version, func(downloaded, total int64) {
fyne.Do(func() {
if total > 0 {
progressBar.SetValue(float64(downloaded) / float64(total))
statusLabel.SetText(fmt.Sprintf("Downloading… %s / %s", formatBytes(downloaded), formatBytes(total)))
} else {
statusLabel.SetText(fmt.Sprintf("Downloading… %s", formatBytes(downloaded)))
}
})
})
fyne.Do(func() {
if err != nil {
statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
retryButton.Show()
resizeToContent(progressWindow, content)
return
}
progressBar.SetValue(1.0)
statusLabel.SetText("Download complete")
onSuccess(progressWindow)
})
}()
}
retryButton.OnTapped = startDownload
progressWindow.SetContent(progressContainer)
progressWindow.Show()
startDownload()
// Start download in background
go func() {
err := l.DownloadUpdate(version, func(progress float64) {
// Update progress bar
fyne.Do(func() {
progressBar.SetValue(progress)
percentage := int(progress * 100)
statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
})
})
// Handle completion
fyne.Do(func() {
if err != nil {
statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
// Show error dialog
dialog.ShowError(err, progressWindow)
} else {
statusLabel.SetText("Download completed successfully!")
progressBar.SetValue(1.0)
// Show success dialog
dialog.ShowConfirm("Installation Complete",
"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
func(close bool) {
progressWindow.Close()
// Update status and refresh systray menu
l.updateStatus("LocalAI installed successfully")
if l.systray != nil {
l.systray.recreateMenu()
}
}, progressWindow)
}
})
}()
})
}
// resizeToContent sizes a window to fit its content (with a sane minimum width)
// so the dialog doesn't show a large blank gap below the last widget.
func resizeToContent(w fyne.Window, content fyne.CanvasObject) {
size := content.MinSize()
if size.Width < 400 {
size.Width = 400
}
w.Resize(size)
}
// formatBytes renders a byte count as a human-readable size (e.g. "12.3 MB").
func formatBytes(b int64) string {
const unit = 1024
if b < unit {
return fmt.Sprintf("%d B", b)
}
div, exp := int64(unit), 0
for n := b / unit; n >= unit; n /= unit {
div *= unit
exp++
}
return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp])
}
// monitorLogs monitors the output of LocalAI and adds it to the log buffer
func (l *Launcher) monitorLogs(reader io.Reader, prefix string) {
scanner := bufio.NewScanner(reader)

View File

@@ -11,7 +11,6 @@ import (
"net/http"
"os"
"os/exec"
"path"
"path/filepath"
"runtime"
"strings"
@@ -51,12 +50,6 @@ type ReleaseManager struct {
ChecksumsPath string
// MetadataPath is where version metadata is stored
MetadataPath string
// BaseDownloadURL is the base URL release assets are downloaded from
// (defaults to https://github.com; overridable for testing)
BaseDownloadURL string
// RetryBackoff is the base wait between download attempts; the Nth retry
// waits N*RetryBackoff (defaults to 1s; lowered in tests)
RetryBackoff time.Duration
// HTTPClient is the HTTP client used for downloads
HTTPClient *http.Client
}
@@ -69,94 +62,28 @@ func NewReleaseManager() *ReleaseManager {
metadataPath := filepath.Join(homeDir, ".localai", "metadata")
return &ReleaseManager{
GitHubOwner: "mudler",
GitHubRepo: "LocalAI",
BinaryPath: binaryPath,
CurrentVersion: internal.PrintableVersion(),
ChecksumsPath: checksumsPath,
MetadataPath: metadataPath,
BaseDownloadURL: "https://github.com",
RetryBackoff: 1 * time.Second,
HTTPClient: httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
GitHubOwner: "mudler",
GitHubRepo: "LocalAI",
BinaryPath: binaryPath,
CurrentVersion: internal.PrintableVersion(),
ChecksumsPath: checksumsPath,
MetadataPath: metadataPath,
HTTPClient: httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
}
}
// GetLatestRelease resolves the latest LocalAI release.
//
// It first follows the github.com "releases/latest" redirect, which reveals the
// latest tag in the final URL and—crucially—is NOT subject to the
// 60-requests/hour unauthenticated rate limit of api.github.com. That limit is
// per-IP, so on shared/NAT/CGNAT/cloud addresses the API returns 403 almost
// immediately (e.g. on a fresh install with no LocalAI present yet). The
// redirect avoids that entirely. The richer JSON API is kept only as a fallback.
//
// Only the version is consumed by callers, so the redirect's tag is sufficient.
// GetLatestRelease fetches the latest release information from GitHub
func (rm *ReleaseManager) GetLatestRelease() (*Release, error) {
version, redirectErr := rm.latestVersionFromRedirect()
if redirectErr == nil {
return &Release{Version: version}, nil
}
log.Printf("Could not resolve latest version via release redirect (%v); falling back to GitHub API", redirectErr)
release, apiErr := rm.latestReleaseFromAPI()
if apiErr != nil {
// Surface both failures so a rate-limited API doesn't mask the (usually
// more relevant) redirect error.
return nil, fmt.Errorf("failed to fetch latest release: %v (redirect: %v)", apiErr, redirectErr)
}
return release, nil
}
// latestVersionFromRedirect returns the latest tag by following the github.com
// "releases/latest" redirect to ".../releases/tag/<tag>".
func (rm *ReleaseManager) latestVersionFromRedirect() (string, error) {
url := fmt.Sprintf("%s/%s/%s/releases/latest", rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo)
resp, err := rm.HTTPClient.Get(url)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("unexpected status %s", resp.Status)
}
// After the redirect is followed, the final request URL is the tag page.
version := path.Base(resp.Request.URL.Path)
if version == "" || version == "." || version == "latest" {
return "", fmt.Errorf("could not determine version from %s", resp.Request.URL.String())
}
return version, nil
}
// latestReleaseFromAPI fetches the latest release JSON from api.github.com. This
// is the fallback path; it is rate-limited unless GITHUB_TOKEN is set.
func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
url := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", rm.GitHubOwner, rm.GitHubRepo)
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return nil, err
}
req.Header.Set("Accept", "application/vnd.github+json")
// An optional token lifts the unauthenticated 60/hour limit to 5000/hour.
if token := os.Getenv("GITHUB_TOKEN"); token != "" {
req.Header.Set("Authorization", "Bearer "+token)
}
resp, err := rm.HTTPClient.Do(req)
resp, err := rm.HTTPClient.Get(url)
if err != nil {
return nil, fmt.Errorf("failed to fetch latest release: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
if (resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusTooManyRequests) &&
resp.Header.Get("X-RateLimit-Remaining") == "0" {
return nil, fmt.Errorf("GitHub API rate limit exceeded (status %d); retry later or set GITHUB_TOKEN to raise the limit", resp.StatusCode)
}
return nil, fmt.Errorf("status %d", resp.StatusCode)
return nil, fmt.Errorf("failed to fetch latest release: status %d", resp.StatusCode)
}
// Parse the JSON response properly
@@ -179,7 +106,7 @@ func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
}
// DownloadRelease downloads a specific version of LocalAI
func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(downloaded, total int64)) error {
func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(float64)) error {
// Ensure the binary directory exists
if err := os.MkdirAll(rm.BinaryPath, 0755); err != nil {
return fmt.Errorf("failed to create binary directory: %w", err)
@@ -190,16 +117,16 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
localPath := filepath.Join(rm.BinaryPath, "local-ai")
// Download the binary
downloadURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/%s",
rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, binaryName)
downloadURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/%s",
rm.GitHubOwner, rm.GitHubRepo, version, binaryName)
if err := rm.downloadFile(downloadURL, localPath, progressCallback); err != nil {
return fmt.Errorf("failed to download binary: %w", err)
}
// Download and verify checksums
checksumURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, version)
checksumURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
rm.GitHubOwner, rm.GitHubRepo, version, version)
checksumPath := filepath.Join(rm.BinaryPath, "checksums.txt")
manualChecksumPath := filepath.Join(rm.ChecksumsPath, fmt.Sprintf("checksums-%s.txt", version))
@@ -227,10 +154,6 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
// Verify the checksum if we have a checksum file
if _, err := os.Stat(checksumPath); err == nil {
if err := rm.VerifyChecksum(localPath, checksumPath, binaryName); err != nil {
// Discard the corrupt binary (and any leftover partial) so the next
// retry starts from a clean slate rather than resuming corruption.
os.Remove(localPath)
os.Remove(localPath + ".part")
return fmt.Errorf("checksum verification failed: %w", err)
}
log.Printf("Checksum verification successful")
@@ -273,88 +196,44 @@ func (rm *ReleaseManager) GetBinaryName(version string) string {
}
// downloadFile downloads a file from a URL to a local path with optional progress callback
func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(downloaded, total int64)) error {
func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(float64)) error {
return rm.downloadFileWithRetry(url, filepath, progressCallback, 3)
}
// downloadFileWithRetry downloads a file with retry and HTTP Range resume.
//
// The body is streamed to "<dest>.part" and only renamed to dest on success, so
// a dropped connection leaves a partial file that the next attempt continues via
// a "Range: bytes=N-" request instead of restarting from zero. This matters for
// GitHub release downloads, which are large and flaky.
func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallback func(downloaded, total int64), maxRetries int) error {
partPath := dest + ".part"
// downloadFileWithRetry downloads a file from a URL with retry logic
func (rm *ReleaseManager) downloadFileWithRetry(url, filepath string, progressCallback func(float64), maxRetries int) error {
var lastErr error
for attempt := 1; attempt <= maxRetries; attempt++ {
if attempt > 1 {
log.Printf("Retrying download (attempt %d/%d): %s", attempt, maxRetries, url)
time.Sleep(time.Duration(attempt) * rm.RetryBackoff)
time.Sleep(time.Duration(attempt) * time.Second)
}
// Resume from however much we already have on disk.
var offset int64
if fi, err := os.Stat(partPath); err == nil {
offset = fi.Size()
}
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return err
}
if offset > 0 {
req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
}
resp, err := rm.HTTPClient.Do(req)
resp, err := rm.HTTPClient.Get(url)
if err != nil {
lastErr = err
continue
}
switch resp.StatusCode {
case http.StatusOK:
// Server ignored the Range (or we had nothing): start fresh.
offset = 0
case http.StatusPartialContent:
// Resume: append to the existing partial file.
case http.StatusRequestedRangeNotSatisfiable:
// Stale or already-complete partial: discard and restart fresh.
resp.Body.Close()
os.Remove(partPath)
lastErr = fmt.Errorf("partial download no longer valid (status %s), restarting", resp.Status)
continue
default:
if resp.StatusCode != http.StatusOK {
resp.Body.Close()
lastErr = fmt.Errorf("bad status: %s", resp.Status)
continue
}
var out *os.File
if offset > 0 {
out, err = os.OpenFile(partPath, os.O_WRONLY|os.O_APPEND, 0644)
} else {
out, err = os.Create(partPath)
}
out, err := os.Create(filepath)
if err != nil {
resp.Body.Close()
return err
}
// On a 206 the Content-Length is the remaining bytes, so the full size
// is what we already have plus what's still to come.
total := resp.ContentLength
if offset > 0 && total > 0 {
total += offset
}
// Create a progress reader if callback is provided
var reader io.Reader = resp.Body
if progressCallback != nil && total > 0 {
if progressCallback != nil && resp.ContentLength > 0 {
reader = &progressReader{
Reader: resp.Body,
Total: total,
Current: offset,
Total: resp.ContentLength,
Callback: progressCallback,
}
}
@@ -364,14 +243,11 @@ func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallba
out.Close()
if err != nil {
// Keep the partial file so the next attempt can resume from it.
lastErr = err
os.Remove(filepath)
continue
}
if err := os.Rename(partPath, dest); err != nil {
return err
}
return nil
}
@@ -446,21 +322,20 @@ func (rm *ReleaseManager) saveVersionMetadata(version string) error {
return nil
}
// progressReader wraps an io.Reader to provide download progress as a
// (downloaded, total) byte count so callers can render both a progress bar and
// a human-readable size.
// progressReader wraps an io.Reader to provide download progress
type progressReader struct {
io.Reader
Total int64
Current int64
Callback func(downloaded, total int64)
Callback func(float64)
}
func (pr *progressReader) Read(p []byte) (int, error) {
n, err := pr.Reader.Read(p)
pr.Current += int64(n)
if pr.Callback != nil {
pr.Callback(pr.Current, pr.Total)
progress := float64(pr.Current) / float64(pr.Total)
pr.Callback(progress)
}
return n, err
}

View File

@@ -1,17 +1,9 @@
package launcher_test
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"time"
. "github.com/onsi/ginkgo/v2"
@@ -186,221 +178,4 @@ var _ = Describe("ReleaseManager", func() {
Expect(err.Error()).To(ContainSubstring("checksum not found"))
})
})
Describe("DownloadRelease resume and retry", func() {
var (
version string
binaryName string
content []byte
checksums string
finalPath string
partPath string
)
BeforeEach(func() {
version = "v9.9.9"
binaryName = rm.GetBinaryName(version)
// Deterministic, non-trivial content so resume/append bugs surface.
content = make([]byte, 4096)
for i := range content {
content[i] = byte(i % 251)
}
sum := sha256.Sum256(content)
checksums = fmt.Sprintf("%s %s\n", hex.EncodeToString(sum[:]), binaryName)
finalPath = filepath.Join(tempDir, "local-ai")
partPath = finalPath + ".part"
// Isolate the persistent checksum/metadata dirs to the temp dir so
// the test never touches the real ~/.localai and existing checksum
// files don't short-circuit the download.
rm.ChecksumsPath = filepath.Join(tempDir, "checksums")
rm.MetadataPath = filepath.Join(tempDir, "metadata")
rm.GitHubOwner = "owner"
rm.GitHubRepo = "repo"
rm.RetryBackoff = time.Millisecond
Expect(os.MkdirAll(tempDir, 0755)).To(Succeed())
})
It("resumes from a partial .part file using a Range request", func() {
Expect(os.WriteFile(partPath, content[:1024], 0644)).To(Succeed())
var mu sync.Mutex
sawRange := false
binBytesServed := 0
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasSuffix(r.URL.Path, "checksums.txt") {
_, _ = w.Write([]byte(checksums))
return
}
if rangeHdr := r.Header.Get("Range"); rangeHdr != "" {
var start int
_, _ = fmt.Sscanf(rangeHdr, "bytes=%d-", &start)
mu.Lock()
sawRange = true
mu.Unlock()
w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, len(content)-1, len(content)))
w.WriteHeader(http.StatusPartialContent)
n, _ := w.Write(content[start:])
mu.Lock()
binBytesServed += n
mu.Unlock()
return
}
w.WriteHeader(http.StatusOK)
n, _ := w.Write(content)
mu.Lock()
binBytesServed += n
mu.Unlock()
}))
defer srv.Close()
rm.BaseDownloadURL = srv.URL
err := rm.DownloadRelease(version, nil)
Expect(err).ToNot(HaveOccurred())
got, err := os.ReadFile(finalPath)
Expect(err).ToNot(HaveOccurred())
Expect(got).To(Equal(content))
Expect(sawRange).To(BeTrue(), "expected the download to resume with a Range request")
Expect(binBytesServed).To(Equal(len(content)-1024), "expected only the remaining bytes to be served")
Expect(partPath).ToNot(BeAnExistingFile())
})
It("starts fresh when the server ignores the Range header (200)", func() {
// A stale/garbage partial that must NOT be appended to.
Expect(os.WriteFile(partPath, []byte("garbage-garbage-garbage"), 0644)).To(Succeed())
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasSuffix(r.URL.Path, "checksums.txt") {
_, _ = w.Write([]byte(checksums))
return
}
// Ignore any Range and always serve the full body.
w.WriteHeader(http.StatusOK)
_, _ = w.Write(content)
}))
defer srv.Close()
rm.BaseDownloadURL = srv.URL
err := rm.DownloadRelease(version, nil)
Expect(err).ToNot(HaveOccurred())
got, err := os.ReadFile(finalPath)
Expect(err).ToNot(HaveOccurred())
Expect(got).To(Equal(content))
})
It("restarts the download when the partial is stale (416)", func() {
// Oversized partial -> requested Range start is beyond the content.
Expect(os.WriteFile(partPath, make([]byte, len(content)+10), 0644)).To(Succeed())
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasSuffix(r.URL.Path, "checksums.txt") {
_, _ = w.Write([]byte(checksums))
return
}
if rangeHdr := r.Header.Get("Range"); rangeHdr != "" {
var start int
_, _ = fmt.Sscanf(rangeHdr, "bytes=%d-", &start)
if start >= len(content) {
w.WriteHeader(http.StatusRequestedRangeNotSatisfiable)
return
}
w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, len(content)-1, len(content)))
w.WriteHeader(http.StatusPartialContent)
_, _ = w.Write(content[start:])
return
}
w.WriteHeader(http.StatusOK)
_, _ = w.Write(content)
}))
defer srv.Close()
rm.BaseDownloadURL = srv.URL
err := rm.DownloadRelease(version, nil)
Expect(err).ToNot(HaveOccurred())
got, err := os.ReadFile(finalPath)
Expect(err).ToNot(HaveOccurred())
Expect(got).To(Equal(content))
})
It("removes the downloaded file when checksum verification fails", func() {
bad := []byte("this is definitely not the expected binary content")
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasSuffix(r.URL.Path, "checksums.txt") {
// Checksums are for `content`, but we serve `bad`.
_, _ = w.Write([]byte(checksums))
return
}
w.WriteHeader(http.StatusOK)
_, _ = w.Write(bad)
}))
defer srv.Close()
rm.BaseDownloadURL = srv.URL
err := rm.DownloadRelease(version, nil)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("checksum"))
Expect(finalPath).ToNot(BeAnExistingFile())
Expect(partPath).ToNot(BeAnExistingFile())
})
It("reports progress as downloaded and total byte counts", func() {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.HasSuffix(r.URL.Path, "checksums.txt") {
_, _ = w.Write([]byte(checksums))
return
}
w.Header().Set("Content-Length", strconv.Itoa(len(content)))
w.WriteHeader(http.StatusOK)
_, _ = w.Write(content)
}))
defer srv.Close()
rm.BaseDownloadURL = srv.URL
var mu sync.Mutex
var lastDownloaded, lastTotal int64
err := rm.DownloadRelease(version, func(downloaded, total int64) {
mu.Lock()
lastDownloaded = downloaded
lastTotal = total
mu.Unlock()
})
Expect(err).ToNot(HaveOccurred())
Expect(lastTotal).To(Equal(int64(len(content))))
Expect(lastDownloaded).To(Equal(int64(len(content))))
})
})
Describe("GetLatestRelease", func() {
It("resolves the latest version from the releases/latest redirect", func() {
// The github.com redirect path must be preferred over the
// rate-limited api.github.com, so a working redirect yields the tag
// without ever needing the API.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case strings.HasSuffix(r.URL.Path, "/releases/latest"):
http.Redirect(w, r, "/owner/repo/releases/tag/v9.9.9", http.StatusFound)
case strings.HasSuffix(r.URL.Path, "/releases/tag/v9.9.9"):
w.WriteHeader(http.StatusOK)
default:
w.WriteHeader(http.StatusNotFound)
}
}))
defer srv.Close()
rm.BaseDownloadURL = srv.URL
rm.GitHubOwner = "owner"
rm.GitHubRepo = "repo"
release, err := rm.GetLatestRelease()
Expect(err).ToNot(HaveOccurred())
Expect(release.Version).To(Equal("v9.9.9"))
})
})
})

View File

@@ -443,23 +443,84 @@ func (sm *SystrayManager) showStartupErrorDialog(err error) {
})
}
// showDownloadProgress shows a progress window for downloading updates. The
// progress UI (byte readout, resume-aware retry, sizing) is shared with the
// other download entry points via the launcher; only the post-success behaviour
// (restart prompt + systray refresh) is specific to the update flow.
// showDownloadProgress shows a progress window for downloading updates
func (sm *SystrayManager) showDownloadProgress(version string) {
sm.launcher.showDownloadProgressWindow(version, fmt.Sprintf("Downloading LocalAI version %s", version), func(win fyne.Window) {
dialog.ShowConfirm("Update Downloaded",
"LocalAI has been updated successfully. Please restart the launcher to use the new version.",
func(restart bool) {
if restart {
sm.app.Quit()
}
win.Close()
}, win)
// Create a new window for download progress
progressWindow := sm.app.NewWindow("Downloading LocalAI Update")
progressWindow.Resize(fyne.NewSize(400, 250))
progressWindow.CenterOnScreen()
sm.hasUpdateAvailable = false
sm.latestVersion = ""
sm.recreateMenu()
// Progress bar
progressBar := widget.NewProgressBar()
progressBar.SetValue(0)
// Status label. Truncate with an ellipsis so a long "Download failed:
// <url>" message can't stretch the window (and progress bar) to fit the
// whole error on one line; the full error is shown in the dialog below.
statusLabel := widget.NewLabel("Preparing download...")
statusLabel.Truncation = fyne.TextTruncateEllipsis
// Release notes button
releaseNotesButton := widget.NewButton("View Release Notes", func() {
releaseNotesURL, err := sm.launcher.githubReleaseNotesURL(version)
if err != nil {
log.Printf("Failed to parse URL: %v", err)
return
}
sm.app.OpenURL(releaseNotesURL)
})
// Progress container
progressContainer := container.NewVBox(
widget.NewLabel(fmt.Sprintf("Downloading LocalAI version %s", version)),
progressBar,
statusLabel,
widget.NewSeparator(),
releaseNotesButton,
)
progressWindow.SetContent(progressContainer)
progressWindow.Show()
// Start download in background
go func() {
err := sm.launcher.DownloadUpdate(version, func(progress float64) {
// Update progress bar
fyne.Do(func() {
progressBar.SetValue(progress)
percentage := int(progress * 100)
statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
})
})
// Handle completion
fyne.Do(func() {
if err != nil {
statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
// Show error dialog
dialog.ShowError(err, progressWindow)
} else {
statusLabel.SetText("Download completed successfully!")
progressBar.SetValue(1.0)
// Show restart dialog
dialog.ShowConfirm("Update Downloaded",
"LocalAI has been updated successfully. Please restart the launcher to use the new version.",
func(restart bool) {
if restart {
sm.app.Quit()
}
progressWindow.Close()
}, progressWindow)
}
})
// Update systray menu
if err == nil {
sm.hasUpdateAvailable = false
sm.latestVersion = ""
sm.recreateMenu()
}
}()
}

View File

@@ -490,19 +490,14 @@ func (ui *LauncherUI) downloadUpdate() {
ui.UpdateStatus("Downloading update " + version + "...")
go func() {
err := ui.launcher.DownloadUpdate(version, func(downloaded, total int64) {
err := ui.launcher.DownloadUpdate(version, func(progress float64) {
// Update progress bar
fyne.Do(func() {
if total > 0 {
ui.progressBar.SetValue(float64(downloaded) / float64(total))
}
ui.progressBar.SetValue(progress)
})
// The progress bar already shows the percentage, so report the
// human-readable size here instead of repeating the percent.
if total > 0 {
ui.UpdateStatus(fmt.Sprintf("Downloading update %s… %s / %s", version, formatBytes(downloaded), formatBytes(total)))
} else {
ui.UpdateStatus(fmt.Sprintf("Downloading update %s… %s", version, formatBytes(downloaded)))
}
// Update status with percentage
percentage := int(progress * 100)
ui.UpdateStatus(fmt.Sprintf("Downloading update %s... %d%%", version, percentage))
})
fyne.Do(func() {
@@ -603,6 +598,82 @@ func (ui *LauncherUI) LoadConfiguration() {
log.Printf("UI LoadConfiguration: configuration loaded successfully")
}
// showDownloadProgress shows a progress window for downloading LocalAI
func (ui *LauncherUI) showDownloadProgress(version, title string) {
fyne.DoAndWait(func() {
// Create progress window using the launcher's app
progressWindow := ui.launcher.app.NewWindow("Downloading LocalAI")
progressWindow.Resize(fyne.NewSize(400, 250))
progressWindow.CenterOnScreen()
// Progress bar
progressBar := widget.NewProgressBar()
progressBar.SetValue(0)
// Status label. Truncate with an ellipsis so a long "Download failed:
// <url>" message can't stretch the window (and progress bar) to fit the
// whole error on one line; the full error is shown in the dialog below.
statusLabel := widget.NewLabel("Preparing download...")
statusLabel.Truncation = fyne.TextTruncateEllipsis
// Release notes button
releaseNotesButton := widget.NewButton("View Release Notes", func() {
releaseNotesURL, err := ui.launcher.githubReleaseNotesURL(version)
if err != nil {
log.Printf("Failed to parse URL: %v", err)
return
}
ui.launcher.app.OpenURL(releaseNotesURL)
})
// Progress container
progressContainer := container.NewVBox(
widget.NewLabel(title),
progressBar,
statusLabel,
widget.NewSeparator(),
releaseNotesButton,
)
progressWindow.SetContent(progressContainer)
progressWindow.Show()
// Start download in background
go func() {
err := ui.launcher.DownloadUpdate(version, func(progress float64) {
// Update progress bar
fyne.Do(func() {
progressBar.SetValue(progress)
percentage := int(progress * 100)
statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
})
})
// Handle completion
fyne.Do(func() {
if err != nil {
statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
// Show error dialog
dialog.ShowError(err, progressWindow)
} else {
statusLabel.SetText("Download completed successfully!")
progressBar.SetValue(1.0)
// Show success dialog
dialog.ShowConfirm("Installation Complete",
"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
func(close bool) {
progressWindow.Close()
// Update status
ui.UpdateStatus("LocalAI installed successfully")
}, progressWindow)
}
})
}()
})
}
// UpdateRunningState updates UI based on LocalAI running state
func (ui *LauncherUI) UpdateRunningState(isRunning bool) {
fyne.Do(func() {

View File

@@ -71,42 +71,13 @@ cmd_notarize() {
echo "[notarize] notarized and stapled $dmg"
}
# Notarize and staple the .app bundle itself. Stapling the dmg alone is not
# enough: an app with no embedded ticket has no local proof of notarization, so
# Gatekeeper falls back to an online check — and the app then fails to launch on
# a machine that is offline / behind a firewall, or once it has been copied out
# of the dmg. Stapling the bundle makes it verify offline. notarytool needs an
# archive for a bundle, so we zip it first.
cmd_notarize_app() {
local app="$1"
if [ -z "${MACOS_NOTARY_KEY:-}" ]; then
echo "[notarize] MACOS_NOTARY_KEY unset: skipping notarization of $app"
return 0
fi
local keyfile zip
keyfile="$(mktemp).p8"
zip="$(mktemp).zip"
echo "$MACOS_NOTARY_KEY" | base64 --decode > "$keyfile"
ditto -c -k --keepParent "$app" "$zip"
xcrun notarytool submit "$zip" \
--key "$keyfile" \
--key-id "${MACOS_NOTARY_KEY_ID:?}" \
--issuer "${MACOS_NOTARY_ISSUER_ID:?}" \
--wait
rm -f "$keyfile" "$zip"
xcrun stapler staple "$app"
xcrun stapler validate "$app"
echo "[notarize] notarized and stapled $app"
}
main() {
local sub="${1:-}"; shift || true
case "$sub" in
import-cert) cmd_import_cert ;;
sign) cmd_sign "$@" ;;
notarize) cmd_notarize "$@" ;;
notarize-app) cmd_notarize_app "$@" ;;
*) echo "usage: $0 {import-cert|sign <path>|notarize <dmg>|notarize-app <app>}" >&2; exit 2 ;;
import-cert) cmd_import_cert ;;
sign) cmd_sign "$@" ;;
notarize) cmd_notarize "$@" ;;
*) echo "usage: $0 {import-cert|sign <path>|notarize <dmg>}" >&2; exit 2 ;;
esac
}

View File

@@ -37,8 +37,6 @@ func (a *Application) RestartAgentJobService() error {
if d.JobStore != nil {
agentJobService.SetDistributedJobStore(d.JobStore)
}
// Keep agent tasks consistent across replicas (same client the dispatcher uses).
agentJobService.SetTaskSyncNATS(d.Nats)
}
// Start the service

View File

@@ -103,11 +103,6 @@ func newApplication(appConfig *config.ApplicationConfig) *Application {
mcpTools.CloseMCPSessions(modelName)
})
// Record a model_load backend trace for every real backend load, so the
// Traces UI shows which backend runtime served each model and how long
// the load took. Load failures are traced by the modality wrappers.
ml.SetLoadObserver(corebackend.ModelLoadTraceObserver(appConfig))
app := &Application{
backendLoader: config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
modelLoader: ml,
@@ -609,10 +604,6 @@ func (a *Application) StartAgentPool() {
usm.SetJobDBStore(s)
}
}
// Keep per-user agent tasks consistent across replicas (nil in standalone).
if d := a.Distributed(); d != nil {
usm.SetJobSyncNATS(d.Nats)
}
aps.SetUserServicesManager(usm)
a.agentPoolService.Store(aps)

View File

@@ -197,7 +197,6 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
envWatchdogBusy := appConfig.WatchDogBusy == startupAppConfig.WatchDogBusy
envWatchdogIdleTimeout := appConfig.WatchDogIdleTimeout == startupAppConfig.WatchDogIdleTimeout
envWatchdogBusyTimeout := appConfig.WatchDogBusyTimeout == startupAppConfig.WatchDogBusyTimeout
envWatchdogInterval := appConfig.WatchDogInterval == startupAppConfig.WatchDogInterval
envSingleBackend := appConfig.SingleBackend == startupAppConfig.SingleBackend
envMaxActiveBackends := appConfig.MaxActiveBackends == startupAppConfig.MaxActiveBackends
envMemoryReclaimerEnabled := appConfig.MemoryReclaimerEnabled == startupAppConfig.MemoryReclaimerEnabled
@@ -258,14 +257,6 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
xlog.Warn("invalid watchdog busy timeout in runtime_settings.json", "error", err, "timeout", *settings.WatchdogBusyTimeout)
}
}
if settings.WatchdogInterval != nil && !envWatchdogInterval {
dur, err := time.ParseDuration(*settings.WatchdogInterval)
if err == nil {
appConfig.WatchDogInterval = dur
} else {
xlog.Warn("invalid watchdog interval in runtime_settings.json", "error", err, "interval", *settings.WatchdogInterval)
}
}
// Handle MaxActiveBackends (new) and SingleBackend (deprecated)
if settings.MaxActiveBackends != nil && !envMaxActiveBackends {
appConfig.MaxActiveBackends = *settings.MaxActiveBackends

View File

@@ -355,13 +355,6 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
PrefixProvider: prefixProvider,
PrefixConfig: prefixCfg,
Pressure: pressure,
SharedModels: cfg.Distributed.SharedModels,
// Cap how long a cold load may hold the per-model advisory lock: the
// configured backend.install deadline plus a margin for file staging and
// the remote LoadModel. Derived from the install timeout so raising it
// (for slow links pulling multi-GB images) widens the ceiling too,
// instead of letting the static default cut a legitimately slow load.
ModelLoadCeiling: cfg.Distributed.BackendInstallTimeoutOrDefault() + 10*time.Minute,
})
// Wire staging-progress broadcasting so file-staging shows up on every

View File

@@ -87,31 +87,6 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
})
})
// Watchdog check interval (issue #10601). Unlike the idle/busy timeouts
// (which default to 0), NewApplicationConfig baseline-defaults the
// interval to 500ms. The loader's "apply file value only if still at the
// zero default" env-detection therefore never fired for the interval, so
// a UI-saved Check Interval silently reverted to 500ms on every restart
// while the idle/busy timeouts persisted. These specs construct the
// config the same way boot does (NewApplicationConfig) so they observe
// the real default the loader sees.
Describe("watchdog interval", func() {
It("loads a UI-saved watchdog_interval on the next startup", func() {
cfg := config.NewApplicationConfig()
cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`)
loadRuntimeSettingsFromFile(cfg)
Expect(cfg.WatchDogInterval).To(Equal(2 * time.Second))
})
It("does not override an explicit env/CLI interval", func() {
cfg := config.NewApplicationConfig()
cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`)
cfg.WatchDogInterval = 1 * time.Second // simulate SetWatchDogInterval from env
loadRuntimeSettingsFromFile(cfg)
Expect(cfg.WatchDogInterval).To(Equal(1*time.Second), "env/CLI interval must win over the persisted file value")
})
})
// MITM listener address. The file is the only source — no env var
// exists — so a regression here means an admin who configured the
// listener via /api/settings loses it after a reboot, even though

View File

@@ -280,9 +280,6 @@ func New(opts ...config.AppOption) (*Application, error) {
if application.agentJobService != nil {
application.agentJobService.SetDistributedBackends(distSvc.Dispatcher)
application.agentJobService.SetDistributedJobStore(distSvc.JobStore)
// Keep agent tasks consistent across replicas (jobs already sync via the
// dispatcher + DB read-through). Same NATS client the dispatcher uses.
application.agentJobService.SetTaskSyncNATS(distSvc.Nats)
}
// Wire skill store into AgentPoolService (wired at pool start time via closure)
// The actual wiring happens in StartAgentPool since the pool doesn't exist yet.
@@ -369,7 +366,7 @@ func New(opts ...config.AppOption) (*Application, error) {
}
for _, backend := range options.ExternalBackends {
if err := galleryop.InstallExternalBackend(options.Context, options.BackendGalleries, options.SystemState, application.ModelLoader(), nil, backend, "", "", false, options.RequireBackendIntegrity); err != nil {
if err := galleryop.InstallExternalBackend(options.Context, options.BackendGalleries, options.SystemState, application.ModelLoader(), nil, backend, "", "", options.RequireBackendIntegrity); err != nil {
xlog.Error("error installing external backend", "error", err)
}
}

View File

@@ -1,72 +0,0 @@
package backend_test
import (
"errors"
"time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/trace"
"github.com/mudler/LocalAI/pkg/model"
)
// ModelLoadTraceObserver is what makes successful loads visible on the
// Traces page: one model_load row per real backend load, carrying the
// resolved backend runtime. Failures must NOT be recorded here — the
// modality wrappers own those — and the observer must respect the runtime
// tracing toggle.
var _ = Describe("ModelLoadTraceObserver", func() {
var appConfig *config.ApplicationConfig
successEvent := model.BackendLoadEvent{
ModelID: "parakeet-cpp-realtime_eou_120m-v1",
ModelName: "realtime_eou_120m.gguf",
Backend: "parakeet-cpp",
BackendURI: "/backends/intel-sycl-f16-parakeet-cpp-development/run.sh",
Duration: 1500 * time.Millisecond,
}
BeforeEach(func() {
appConfig = &config.ApplicationConfig{
EnableTracing: true,
TracingMaxItems: 64,
}
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
trace.ClearBackendTraces()
})
It("records a model_load trace with the backend runtime on success", func() {
backend.ModelLoadTraceObserver(appConfig)(successEvent)
Eventually(trace.GetBackendTraces).Should(HaveLen(1))
got := trace.GetBackendTraces()[0]
Expect(got.Type).To(Equal(trace.BackendTraceModelLoad))
Expect(got.Summary).To(Equal("Model loaded"))
Expect(got.ModelName).To(Equal("parakeet-cpp-realtime_eou_120m-v1"))
Expect(got.Backend).To(Equal("parakeet-cpp"))
Expect(got.Duration).To(Equal(1500 * time.Millisecond))
Expect(got.Data["backend_runtime"]).To(Equal("/backends/intel-sycl-f16-parakeet-cpp-development/run.sh"))
Expect(got.Data["model_file"]).To(Equal("realtime_eou_120m.gguf"))
Expect(got.Error).To(BeEmpty())
})
It("skips failed loads — the modality wrappers trace those with request context", func() {
failed := successEvent
failed.Err = errors.New("grpc service not ready")
backend.ModelLoadTraceObserver(appConfig)(failed)
Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
})
It("records nothing when tracing is disabled", func() {
appConfig.EnableTracing = false
backend.ModelLoadTraceObserver(appConfig)(successEvent)
Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
})
})

View File

@@ -19,39 +19,6 @@ import (
"github.com/mudler/xlog"
)
// ModelLoadTraceObserver returns the ModelLoader load observer that records
// a model_load backend trace for every successful real load (backend process
// spawn + LoadModel RPC; cache hits never reach the observer). Failures are
// deliberately skipped here: the modality wrappers already record them via
// recordModelLoadFailure with request context, and the backend auto-discovery
// scan probes several backends before one succeeds — tracing every probe
// failure would bury the buffer in noise.
//
// The traced data includes the resolved backend runtime (the installed
// backend's launcher path, which names the variant directory) — that is what
// identifies WHICH build served the load. A stale installed backend is
// invisible in the model config but obvious here.
func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.BackendLoadEvent) {
return func(ev model.BackendLoadEvent) {
if ev.Err != nil || !appConfig.EnableTracing {
return
}
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
trace.RecordBackendTrace(trace.BackendTrace{
Timestamp: time.Now(),
Duration: ev.Duration,
Type: trace.BackendTraceModelLoad,
ModelName: ev.ModelID,
Backend: ev.Backend,
Summary: "Model loaded",
Data: map[string]any{
"model_file": ev.ModelName,
"backend_runtime": ev.BackendURI,
},
})
}
}
// recordModelLoadFailure records a backend trace when model loading fails.
func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
if !appConfig.EnableTracing {

View File

@@ -181,7 +181,6 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
Text: r.Text,
Language: r.Language,
Duration: float64(r.Duration),
Eou: r.Eou,
}
for _, s := range r.Segments {

Some files were not shown because too many files have changed in this diff Show More