chore(deps): bump the pip group across 4 directories with 1 update

Bumps the pip group with 1 update in the /backend/python/ace-step directory: torch. Bumps the pip group with 1 update in the /backend/python/llama-cpp-quantization directory: torch. Bumps the pip group with 1 update in the /backend/python/sglang directory: torch. Bumps the pip group with 1 update in the /backend/python/vllm-omni directory: torch. Updates `torch` from 2.10.0+rocm7.0 to 2.12.0+cpu Updates `torch` from 2.10.0+rocm7.0 to 2.12.0+cpu Updates `torch` from 2.10.0+rocm7.0 to 2.12.0+cpu Updates `torch` from 2.10.0+rocm7.0 to 2.12.0+cpu Updates `torch` from 2.10.0 to 2.12.0+cpu Updates `torch` from 2.10.0 to 2.12.0+cpu Updates `torch` from 2.10.0 to 2.12.0+cpu Updates `torch` from 2.10.0 to 2.12.0+cpu Updates `torch` from 2.9.0 to 2.12.0+cpu Updates `torch` from 2.9.0 to 2.12.0+cpu Updates `torch` from 2.9.0 to 2.12.0+cpu Updates `torch` from 2.9.0 to 2.12.0+cpu Updates `torch` from 2.7.0 to 2.12.0+cu130 Updates `torch` from 2.7.0 to 2.12.0+cu130 Updates `torch` from 2.7.0 to 2.12.0+cu130 Updates `torch` from 2.7.0 to 2.12.0+cu130 --- updated-dependencies: - dependency-name: torch dependency-version: 2.12.0+cpu dependency-type: direct:production - dependency-name: torch dependency-version: 2.12.0+cpu dependency-type: direct:production - dependency-name: torch dependency-version: 2.12.0+cpu dependency-type: direct:production - dependency-name: torch dependency-version: 2.12.0+cu130 dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com>
2026-07-03 04:46:54 -04:00 · 2026-06-27 08:10:29 +00:00
275 changed files with 1572 additions and 18897 deletions
--- a/.githooks/pre-commit
+++ b/.githooks/pre-commit
@@ -7,11 +7,8 @@
 # Runs only the checks relevant to what's staged:
 #   - Go files          -> make lint + make test-coverage-check
 #   - core/http/react-ui -> make test-ui-coverage-check (Playwright e2e + gate)
-#   - realtime state machines / specs -> make test-realtime-conformance
-#       (respcoord/**, turncoord/**, or formal-verification/** -- a pure .fizz
-#        spec edit must still re-verify the design, detected separately from Go)
-# A commit touching none of these is skipped entirely (other docs/YAML can't
-# change lint findings, Go coverage, the UI, or the realtime conformance gate).
+# A commit touching neither is skipped entirely (docs/YAML/etc. can't change
+# lint findings, Go coverage, or the UI).
 #
 # To bypass for a single commit (e.g. a WIP checkpoint): git commit --no-verify
 set -eu
@@ -23,13 +20,11 @@ staged="$(git diff --cached --name-only --diff-filter=ACMRD)"

 go_changed=0
 ui_changed=0
-rt_changed=0
 if echo "$staged" | grep -qE '\.go$'; then go_changed=1; fi
 if echo "$staged" | grep -qE '^core/http/react-ui/'; then ui_changed=1; fi
-if echo "$staged" | grep -qE '^(core/http/endpoints/openai/(coordinator|respcoord|turncoord|conncoord|compactcoord|ttscoord)/|formal-verification/)'; then rt_changed=1; fi

-if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ] && [ "$rt_changed" -eq 0 ]; then
-	echo "pre-commit: no Go, React UI, or realtime-spec changes staged — skipping."
+if [ "$go_changed" -eq 0 ] && [ "$ui_changed" -eq 0 ]; then
+	echo "pre-commit: no Go or React UI changes staged — skipping."
 	exit 0
 fi

@@ -62,11 +57,4 @@ if [ "$ui_changed" -eq 1 ]; then
 	make test-ui-coverage-check
 fi

-if [ "$rt_changed" -eq 1 ]; then
-	echo "pre-commit ▶ realtime state-machine conformance (make test-realtime-conformance) —"
-	echo "             Go transition/rapid tests under -race + FizzBee model check of the"
-	echo "             authoritative specs. Fail-closed: needs FizzBee (make install-fizzbee)."
-	make test-realtime-conformance
-fi
-
 echo "pre-commit ✓ all relevant checks passed"
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -3745,302 +3745,6 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
-  # voice-detect
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "8"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-12-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-13-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect'
-    base-image: "ubuntu:24.04"
-    ubuntu-version: '2404'
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-voice-detect'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f32'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f32-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f16'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f16-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-voice-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-voice-detect'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-arm64-voice-detect'
-    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2204'
-  - build-type: 'hipblas'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-rocm-hipblas-voice-detect'
-    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
-    runs-on: 'ubuntu-latest'
-    skip-drivers: 'false'
-    backend: "voice-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  # face-detect
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "8"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-12-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-nvidia-cuda-13-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "13"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-cuda-13-arm64-face-detect'
-    base-image: "ubuntu:24.04"
-    ubuntu-version: '2404'
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: ''
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-cpu-face-detect'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f32'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f32-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'sycl_f16'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-intel-sycl-f16-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    platform-tag: 'amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-face-detect'
-    runs-on: 'ubuntu-latest'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'vulkan'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/arm64'
-    platform-tag: 'arm64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-vulkan-face-detect'
-    runs-on: 'ubuntu-24.04-arm'
-    base-image: "ubuntu:24.04"
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
-  - build-type: 'cublas'
-    cuda-major-version: "12"
-    cuda-minor-version: "0"
-    platforms: 'linux/arm64'
-    skip-drivers: 'false'
-    tag-latest: 'auto'
-    tag-suffix: '-nvidia-l4t-arm64-face-detect'
-    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-    runs-on: 'ubuntu-24.04-arm'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2204'
-  - build-type: 'hipblas'
-    cuda-major-version: ""
-    cuda-minor-version: ""
-    platforms: 'linux/amd64'
-    tag-latest: 'auto'
-    tag-suffix: '-gpu-rocm-hipblas-face-detect'
-    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
-    runs-on: 'ubuntu-latest'
-    skip-drivers: 'false'
-    backend: "face-detect"
-    dockerfile: "./backend/Dockerfile.golang"
-    context: "./"
-    ubuntu-version: '2404'
  # acestep-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -5224,14 +4928,6 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-ced"
    build-type: "metal"
    lang: "go"
-  - backend: "voice-detect"
-    tag-suffix: "-metal-darwin-arm64-voice-detect"
-    build-type: "metal"
-    lang: "go"
-  - backend: "face-detect"
-    tag-suffix: "-metal-darwin-arm64-face-detect"
-    build-type: "metal"
-    lang: "go"
  - backend: "acestep-cpp"
    tag-suffix: "-metal-darwin-arm64-acestep-cpp"
    build-type: "metal"
@@ -5295,6 +4991,9 @@ includeDarwin:
  - backend: "qwen-tts"
    tag-suffix: "-metal-darwin-arm64-qwen-tts"
    build-type: "mps"
+  - backend: "fish-speech"
+    tag-suffix: "-metal-darwin-arm64-fish-speech"
+    build-type: "mps"
  - backend: "voxcpm"
    tag-suffix: "-metal-darwin-arm64-voxcpm"
    build-type: "mps"
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -82,7 +82,7 @@ jobs:
      # as the Linux registry cache.
      - name: Restore Homebrew cache
        id: brew-cache
-        uses: actions/cache/restore@v6
+        uses: actions/cache/restore@v4
        with:
          path: |
            ~/Library/Caches/Homebrew/downloads
@@ -142,7 +142,7 @@ jobs:

      - name: Save Homebrew cache
        if: github.event_name != 'pull_request' && steps.brew-cache.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v6
+        uses: actions/cache/save@v4
        with:
          path: |
            ~/Library/Caches/Homebrew/downloads
@@ -178,7 +178,7 @@ jobs:
      - name: Restore ccache
        if: inputs.backend == 'llama-cpp'
        id: ccache-cache
-        uses: actions/cache/restore@v6
+        uses: actions/cache/restore@v4
        with:
          path: ~/Library/Caches/ccache
          key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}
@@ -211,7 +211,7 @@ jobs:
      - name: Restore Python wheel cache
        if: inputs.lang == 'python'
        id: pyenv-cache
-        uses: actions/cache/restore@v6
+        uses: actions/cache/restore@v4
        with:
          path: |
            ~/Library/Caches/pip
@@ -256,14 +256,14 @@ jobs:

      - name: Save ccache
        if: inputs.backend == 'llama-cpp' && github.event_name != 'pull_request'
-        uses: actions/cache/save@v6
+        uses: actions/cache/save@v4
        with:
          path: ~/Library/Caches/ccache
          key: ccache-llama-${{ runner.arch }}-${{ steps.llama-version.outputs.version }}-${{ github.run_id }}

      - name: Save Python wheel cache
        if: inputs.lang == 'python' && github.event_name != 'pull_request' && steps.pyenv-cache.outputs.cache-hit != 'true'
-        uses: actions/cache/save@v6
+        uses: actions/cache/save@v4
        with:
          path: |
            ~/Library/Caches/pip
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -46,14 +46,6 @@ jobs:
            variable: "CED_VERSION"
            branch: "master"
            file: "backend/go/ced/Makefile"
-          - repository: "mudler/voice-detect.cpp"
-            variable: "VOICEDETECT_VERSION"
-            branch: "master"
-            file: "backend/go/voice-detect/Makefile"
-          - repository: "mudler/face-detect.cpp"
-            variable: "FACEDETECT_VERSION"
-            branch: "master"
-            file: "backend/go/face-detect/Makefile"
          - repository: "mudler/depth-anything.cpp"
            variable: "DEPTHANYTHING_VERSION"
            branch: "master"
--- a/.github/workflows/realtime-conformance.yml
+++ b/.github/workflows/realtime-conformance.yml
@@ -1,69 +0,0 @@
---
-name: 'realtime-conformance'
-
-# Verifies the realtime state-machine implementations conform to their formal
-# designs (docs/design/realtime-state-machines.md, formal-verification/). BOTH
-# layers are enforced and the gate is fail-closed: the Go conformance layer
-# (respcoord + turncoord transition/rapid tests under -race) AND the FizzBee model check of
-# the authoritative specs. FizzBee is pinned + checksum-verified
-# (formal-verification/fizzbee.sha256), so a failed install fails the job rather
-# than silently skipping verification.
-
-on:
-  pull_request:
-    paths:
-      - 'core/http/endpoints/openai/coordinator/**'
-      - 'core/http/endpoints/openai/respcoord/**'
-      - 'core/http/endpoints/openai/turncoord/**'
-      - 'core/http/endpoints/openai/conncoord/**'
-      - 'core/http/endpoints/openai/compactcoord/**'
-      - 'core/http/endpoints/openai/ttscoord/**'
-      - 'formal-verification/**'
-      - 'scripts/realtime-conformance.sh'
-      - 'scripts/install-fizzbee.sh'
-      - '.github/workflows/realtime-conformance.yml'
-  push:
-    branches:
-      - master
-    paths:
-      - 'core/http/endpoints/openai/coordinator/**'
-      - 'core/http/endpoints/openai/respcoord/**'
-      - 'core/http/endpoints/openai/turncoord/**'
-      - 'core/http/endpoints/openai/conncoord/**'
-      - 'core/http/endpoints/openai/compactcoord/**'
-      - 'core/http/endpoints/openai/ttscoord/**'
-      - 'formal-verification/**'
-      - 'scripts/realtime-conformance.sh'
-
-concurrency:
-  group: realtime-conformance-${{ github.event.pull_request.number || github.sha }}-${{ github.repository }}
-  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
-
-jobs:
-  conformance:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        go-version: ['1.26.x']
-    steps:
-      - name: Clone
-        uses: actions/checkout@v7
-      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v5
-        with:
-          go-version: ${{ matrix.go-version }}
-          cache: false
-      - name: Cache FizzBee
-        uses: actions/cache@v4
-        with:
-          path: .tools/fizzbee
-          key: fizzbee-v0.5.2-${{ runner.os }}-${{ hashFiles('formal-verification/fizzbee.sha256') }}
-      - name: Install FizzBee (pinned, checksum-verified)
-        # No `|| true`: a failed/forged download must fail the job, not silently
-        # drop the design verification. install-fizzbee.sh is a no-op if the
-        # cached binary is already present and valid.
-        run: ./scripts/install-fizzbee.sh
-      - name: Run conformance gate (fail-closed)
-        # No skip env: both the Go conformance and the FizzBee model check are
-        # required. The gate auto-detects .tools/fizzbee/fizz.
-        run: make test-realtime-conformance
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -1008,11 +1008,7 @@ jobs:
  # image + working dir.
  tests-vibevoice-cpp-grpc-transcription:
    needs: detect-changes
-    # Skip on release tag pushes: the ASR Q4_K model is ~10 GB and cannot be
-    # pulled from HF within the inner `go test -timeout 30m` budget on a CI
-    # runner, so every tag build hung and timed out. Still runs on PRs/branch
-    # pushes that touch vibevoice-cpp so regressions are caught off the release path.
-    if: (needs.detect-changes.outputs.vibevoice-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true') && !startsWith(github.ref, 'refs/tags/')
+    if: needs.detect-changes.outputs.vibevoice-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
    runs-on: bigger-runner
    timeout-minutes: 150
    steps:
--- a/.gitignore
+++ b/.gitignore
@@ -97,12 +97,3 @@ core/http/react-ui/test-results/

 # Local Apple signing material (never commit)
 .certs/
-
-# Pinned dev tools (e.g. FizzBee for the realtime-conformance gate)
-.tools/
-
-# FizzBee model-check artifacts: the parser emits <spec>.json next to each
-# .fizz and the checker writes run dirs under out/. Both are regenerated by
-# the realtime-conformance gate; only the .fizz sources are authoritative.
-formal-verification/*.json
-formal-verification/out/
--- a/11
+++ b/11
@@ -171,17 +171,6 @@ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
    ln -s /opt/rocm-**/lib/llvm/lib/libomp.so /usr/lib/libomp.so \
    ; fi

-# ROCm's bundled libdrm_amdgpu is built with a hardcoded fallback lookup path
-# for the ASIC ID table (/opt/amdgpu/share/libdrm/amdgpu.ids), which only exists
-# if AMD's full amdgpu graphics/DKMS stack is installed. This compute-only image
-# doesn't have it, so hipblas/rocBLAS log "No such file or directory" on every
-# model load and can fail to identify the GPU. Point it at the equivalent file
-# Ubuntu's libdrm-common package already ships.
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ -f /usr/share/libdrm/amdgpu.ids ] && [ ! -e /opt/amdgpu/share/libdrm/amdgpu.ids ]; then \
-    mkdir -p /opt/amdgpu/share/libdrm && \
-    ln -s /usr/share/libdrm/amdgpu.ids /opt/amdgpu/share/libdrm/amdgpu.ids \
-    ; fi
-
 RUN expr "${BUILD_TYPE}" = intel && echo "intel" > /run/localai/capability || echo "not intel"

 # Cuda
--- a/23
+++ b/23
@@ -405,18 +405,6 @@ test-realtime: build-mock-backend
 	@echo 'Running realtime e2e tests (mock backend)'
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="Realtime && !real-models" --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e

-# Verify the realtime state-machine implementations conform to their formal
-# designs (Go transition/rapid tests under -race + FizzBee model check of the
-# authoritative specs). See docs/design/realtime-state-machines.md (Part 6) and
-# docs/design/specs/README.md.
-test-realtime-conformance:
-	GOCMD=$(GOCMD) ./scripts/realtime-conformance.sh
-
-# Install the pinned, checksum-verified FizzBee model checker (into .tools/,
-# gitignored) used by test-realtime-conformance. Idempotent; no-op if present.
-install-fizzbee:
-	./scripts/install-fizzbee.sh
-
 # Container-based real-model realtime testing. Build env vars / pipeline
 # definition kept here so test-realtime-models-docker can drive a fully wired
 # pipeline (VAD + STT + LLM + TTS) from inside a containerised runner.
@@ -1039,7 +1027,7 @@ test-extra-backend-whisper-transcription: docker-build-whisper
 ## is reachable.
 test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
 	BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
-	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/realtime_eou_120m-v1-f16.gguf \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \
 	BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
 	BACKEND_TEST_CAPS=health,load,transcription \
 	$(MAKE) test-extra-backend
@@ -1482,13 +1470,8 @@ build-launcher-darwin:
 	mv cmd/launcher/LocalAI.app dist/LocalAI.app
 	bash contrib/macos/sign-and-notarize.sh sign dist/LocalAI.app

-# Notarize + staple the .app itself, then wrap it into a drag-to-Applications
-# DMG via hdiutil and sign the DMG. The app is stapled BEFORE packaging so the
-# bundle carries its own ticket and verifies offline (a dmg-only staple leaves
-# the app relying on an online Gatekeeper check, which fails offline / once the
-# app is copied out of the dmg). No-op without notary secrets.
+# Wrap the (signed) app into a drag-to-Applications DMG via hdiutil, then sign the DMG.
 dmg-launcher-darwin: build-launcher-darwin
-	bash contrib/macos/sign-and-notarize.sh notarize-app dist/LocalAI.app
 	rm -rf dist/dmg dist/LocalAI.dmg
 	mkdir -p dist/dmg
 	cp -R dist/LocalAI.app dist/dmg/LocalAI.app
@@ -1500,7 +1483,7 @@ dmg-launcher-darwin: build-launcher-darwin
 notarize-launcher-darwin: dmg-launcher-darwin
 	bash contrib/macos/sign-and-notarize.sh notarize dist/LocalAI.dmg

-# Single entrypoint for CI: build -> sign app -> notarize+staple app -> dmg -> sign dmg -> notarize+staple dmg.
+# Single entrypoint for CI: build -> sign app -> dmg -> sign dmg -> notarize -> staple.
 release-launcher-darwin: notarize-launcher-darwin
 	@echo "dist/LocalAI.dmg is ready"

--- a/README.md
+++ b/README.md
@@ -177,7 +177,6 @@ For more details, see the [Getting Started guide](https://localai.io/basics/gett

 ## Latest News

- **June 2026**: New native biometric backends from the LocalAI team: [voice-detect.cpp](https://github.com/mudler/voice-detect.cpp) for speaker recognition and voice analysis (ECAPA-TDNN, WeSpeaker, ERes2Net, CAM++, wav2vec2 age/gender/emotion) and [face-detect.cpp](https://github.com/mudler/face-detect.cpp) for face detection, recognition, demographics and anti-spoofing (SCRFD/ArcFace, YuNet/SFace). Both are from-scratch C++/ggml engines with no Python or onnxruntime at inference, self-contained GGUF weights, bit-exact parity with the reference, and GPU cuDNN parity, replacing the heavier Python `insightface` and `speaker-recognition` backends ([PR #10441](https://github.com/mudler/LocalAI/pull/10441)).
 - **June 2026**: New [realtime voice assistant demo](https://github.com/localai-org/localai-realtime-demo) (a tiny Go client for the Realtime API with a full talk-back voice loop and tool calling), plus [streaming of the realtime LLM / TTS / transcription pipeline stages](https://github.com/mudler/LocalAI/pull/10176) and [configurable WebRTC ICE candidates](https://github.com/mudler/LocalAI/pull/10231).
 - **June 2026**: Big speech push: the [parakeet.cpp](https://github.com/mudler/parakeet.cpp) ASR engine gains [NeMo-faithful segment timestamps](https://github.com/mudler/LocalAI/pull/10207), a [multilingual streaming Nemotron-3.5 model](https://github.com/mudler/LocalAI/pull/10199), [dynamic batching for concurrent transcription](https://github.com/mudler/LocalAI/pull/10112) and [CUDA graphs](https://github.com/mudler/LocalAI/pull/10273); the new [CrispASR backend](https://github.com/mudler/LocalAI/pull/10099) adds multi-architecture ASR + TTS, and [60 Piper TTS voices across 42 languages](https://github.com/mudler/LocalAI/pull/10296) land in the gallery (plus [per-request TTS instructions and params](https://github.com/mudler/LocalAI/pull/10172)).
 - **June 2026**: New backends and models: [locate-anything.cpp](https://github.com/mudler/LocalAI/pull/10264) for open-vocabulary object detection via ggml, [Ideogram4 image generation](https://github.com/mudler/LocalAI/pull/10201) in stablediffusion-ggml, [llama.cpp video input](https://github.com/mudler/LocalAI/pull/10216), and the [Gemma 4 QAT family with MTP speculative-decoding pairs](https://github.com/mudler/LocalAI/pull/10215). Plus an [interactive CLI chat mode](https://github.com/mudler/LocalAI/pull/10226) and [RAG source citations in agent responses](https://github.com/mudler/LocalAI/pull/10228).
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -137,7 +137,7 @@ RUN <<EOT bash
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
+            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
        apt-get clean && \
        rm -rf /var/lib/apt/lists/*
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -18,18 +18,6 @@ service Backend {
  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
-  // AudioTranscriptionLive is the bidirectional live-microphone ASR RPC. The
-  // first message MUST carry a Config; subsequent messages carry Audio frames
-  // (mono float PCM at config.sample_rate, 16 kHz default). After a
-  // successful open the backend replies with a single ready ack
-  // (TranscriptLiveResponse{ready:true}); backends or models without
-  // cache-aware streaming support return UNIMPLEMENTED instead. Newly
-  // finalized text streams back as deltas; eou=true marks the model's
-  // end-of-utterance token. One stream spans many utterances (the decoder
-  // resets itself after each EOU). Closing the send side finalizes: the
-  // backend flushes the decoder tail and emits a terminal message carrying
-  // final_result. A second Config mid-stream resets the decode session.
-  rpc AudioTranscriptionLive(stream TranscriptLiveRequest) returns (stream TranscriptLiveResponse) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc TTSStream(TTSRequest) returns (stream Reply) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -491,10 +479,6 @@ message TranscriptResult {
  string text = 2;
  string language = 3;
  float duration = 4;
-  // True when the decode ended on the model's end-of-utterance special token
-  // (<EOU>/<EOB>, emitted by cache-aware streaming models such as
-  // parakeet_realtime_eou_120m-v1). The marker itself is stripped from text.
-  bool eou = 5;
 }

 message TranscriptStreamResponse {
@@ -502,34 +486,6 @@ message TranscriptStreamResponse {
  TranscriptResult final_result = 2;
 }

-// === AudioTranscriptionLive messages =====================================
-
-message TranscriptLiveRequest {
-  oneof payload {
-    TranscriptLiveConfig config = 1;
-    TranscriptLiveAudio  audio  = 2;
-  }
-}
-
-message TranscriptLiveConfig {
-  string language = 1;             // "" => model default
-  int32 sample_rate = 2;           // 0 => 16000; backends may reject others
-  map<string, string> params = 3;  // backend-specific tuning
-}
-
-message TranscriptLiveAudio {
-  repeated float pcm = 1;          // mono PCM in [-1,1] at config.sample_rate
-}
-
-message TranscriptLiveResponse {
-  bool ready = 1;                       // open ack: sent once, before any delta
-  string delta = 2;                     // newly-finalized text since previous response
-  bool eou = 3;                         // <EOU> fired during this feed (the user yielded the turn)
-  repeated TranscriptWord words = 4;    // words finalized by this feed (stream-relative ns)
-  TranscriptResult final_result = 5;    // terminal message only, after the send side closes
-  bool eob = 6;                         // <EOB> fired: a backchannel ("uh-huh") ended — NOT a turn boundary
-}
-
 message TranscriptWord {
  int64 start = 1;
  int64 end = 2;
--- a/backend/cpp/ik-llama-cpp/CMakeLists.txt
+++ b/backend/cpp/ik-llama-cpp/CMakeLists.txt
@@ -1,6 +1,15 @@
-## Multimodal support is provided by the in-tree `mtmd` library target
-## (examples/mtmd/), which the grpc-server links and includes below. clip/llava
-## were pruned upstream; the high-level mtmd_* / mtmd_helper_* API is used instead.
+## Clip/LLaVA library for multimodal support — built locally from copied sources
+set(TARGET myclip)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_include_directories(myclip PUBLIC .)
+target_include_directories(myclip PUBLIC ../..)
+target_include_directories(myclip PUBLIC ../../common)
+target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual)
+endif()

 set(TARGET grpc-server)
 set(CMAKE_CXX_STANDARD 17)
@@ -58,16 +67,12 @@ add_library(hw_grpc_proto
  ${hw_proto_hdrs} )

 add_executable(${TARGET} grpc-server.cpp json.hpp)
-# mtmd public headers (mtmd.h / mtmd-helper.h) live in examples/mtmd/.
-# Linking the mtmd target also propagates this include dir, but we add it
-# explicitly for clarity.
-target_include_directories(${TARGET} PRIVATE ../mtmd)
-target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
  absl::flags_parse
  gRPC::${_REFLECTION}
  gRPC::${_GRPC_GRPCPP}
  protobuf::${_PROTOBUF_LIBPROTOBUF})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=87fc8701ff4da81a7d2a91ec0695f95eb3066a47
+IK_LLAMA_VERSION?=b84902d2ad27c34f989f23947200c4b91b1568fd
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/ik-llama-cpp/grpc-server.cpp
+++ b/backend/cpp/ik-llama-cpp/grpc-server.cpp
@@ -11,8 +11,8 @@
 #include <memory>
 #include <string>
 #include <getopt.h>
-#include "mtmd.h"
-#include "mtmd-helper.h"
+#include "clip.h"
+#include "llava.h"
 #include "log.h"
 #include "common.h"
 #include "json.hpp"
@@ -45,9 +45,7 @@ using backend::HealthMessage;

 ///// LLAMA.CPP server code below

-// Match mtmd.h and ik_llama's server/common headers, which all use
-// nlohmann::ordered_json; a plain nlohmann::json alias collides at global scope.
-using json = nlohmann::ordered_json;
+using json = nlohmann::json;

 struct server_params
 {
@@ -221,11 +219,6 @@ struct llama_client_slot

    // multimodal
    std::vector<slot_image> images;
-    // Full prompt with mtmd media markers (mtmd_default_marker()) substituted in
-    // place of the legacy [img-N] tags, covering the text up to and including the
-    // last image. The text after the last image is kept in params.input_suffix and
-    // decoded through the normal token path so the sampling loop is unchanged.
-    std::string mtmd_prompt;

    // stats
    size_t sent_count = 0;
@@ -259,14 +252,14 @@ struct llama_client_slot

        for (slot_image & img : images)
        {
-            if (img.bitmap) {
-                mtmd_bitmap_free(img.bitmap);
-                img.bitmap = nullptr;
+            free(img.image_embedding);
+            if (img.img_data) {
+                clip_image_u8_free(img.img_data);
            }
+            img.prefix_prompt = "";
        }

        images.clear();
-        mtmd_prompt = "";
    }

    bool has_budget(gpt_params &global_params) {
@@ -403,13 +396,46 @@ struct llama_metrics {
    }
 };

+struct llava_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens       =*/ n_tokens,
+            /*tokens         =*/ nullptr,
+            /*embd           =*/ embd,
+            /*pos            =*/ pos.data(),
+            /*n_seq_id       =*/ n_seq_id.data(),
+            /*seq_id         =*/ seq_ids.data(),
+            /*logits         =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
 struct llama_server_context
 {
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    const llama_vocab * vocab = nullptr;

-    mtmd_context *mctx = nullptr;
+    clip_ctx *clp_ctx = nullptr;

    gpt_params params;

@@ -465,6 +491,11 @@ struct llama_server_context
        if (!params.mmproj.path.empty()) {
            multimodal = true;
            LOG_INFO("Multi Modal Mode Enabled", {});
+            clp_ctx = clip_model_load(params.mmproj.path.c_str(), /*verbosity=*/ 1);
+            if(clp_ctx == nullptr) {
+                LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
+                return false;
+            }

            if (params.n_ctx < 2048) { // request larger context for the image embedding
                params.n_ctx = 2048;
@@ -481,24 +512,10 @@ struct llama_server_context
        }

        if (multimodal) {
-            // mtmd_init_from_file requires the already-loaded text model, so it must
-            // run AFTER llama_init_from_gpt_params. It validates the projector
-            // against the model internally and returns nullptr on dim mismatch, so
-            // the explicit clip_n_mmproj_embd check is no longer needed.
-            mtmd_context_params mparams = mtmd_context_params_default();
-            mparams.use_gpu         = params.mmproj_use_gpu;
-            mparams.print_timings   = false;
-            mparams.n_threads       = params.n_threads_mtmd != -1 ? params.n_threads_mtmd
-                                      : params.n_threads_batch != -1 ? params.n_threads_batch
-                                                                     : params.n_threads;
-            mparams.verbosity       = GGML_LOG_LEVEL_INFO;
-            mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED
-                                                        : LLAMA_FLASH_ATTN_TYPE_DISABLED;
-            mparams.image_min_tokens = params.image_min_tokens;
-            mparams.image_max_tokens = params.image_max_tokens;
-            mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
-            if (mctx == nullptr) {
-                LOG_ERR("unable to load multimodal projector: %s", params.mmproj.path.c_str());
+            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
+            const int n_embd_llm  = llama_model_n_embd(model);
+            if (n_embd_clip != n_embd_llm) {
+                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
                llama_free_model(model);
                return false;
@@ -848,8 +865,8 @@ struct llama_server_context

                    slot_image img_sl;
                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    img_sl.bitmap = mtmd_helper_bitmap_init_from_buf(mctx, image_buffer.data(), image_buffer.size());
-                    if (img_sl.bitmap == nullptr)
+                    img_sl.img_data = clip_image_u8_init();
+                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
                        LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
                             __func__,
@@ -862,74 +879,50 @@ struct llama_server_context
                        {"slot_id",   slot->id},
                        {"img_sl_id", img_sl.id}
                    });
+                    img_sl.request_encode_image = true;
                    slot->images.push_back(img_sl);
                }
-                // Translate the legacy [img-N] tags into mtmd media markers, in
-                // order, and collect the matching bitmaps in marker order so they
-                // line up with the markers passed to mtmd_tokenize(). The text after
-                // the last image stays in input_suffix and is decoded through the
-                // normal token path, so the sampling loop is unchanged.
-                // example: system prompt [img-102] user [img-103] describe [img-134]
+                // process prompt
+                // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
                if (slot->images.size() > 0 && !slot->prompt.is_array())
                {
-                    const std::string marker = mtmd_default_marker();
                    std::string prompt = slot->prompt.get<std::string>();
-                    std::string built_prompt;
-                    std::vector<slot_image> ordered;
-                    size_t pos = 0, copy_from = 0;
+                    size_t pos = 0, begin_prefix = 0;
                    std::string pattern = "[img-";
-
-                    auto free_images = [&]() {
-                        for (slot_image &img : slot->images) {
-                            if (img.bitmap) {
-                                mtmd_bitmap_free(img.bitmap);
-                                img.bitmap = nullptr;
-                            }
-                        }
-                        slot->images.clear();
-                    };
-
                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
-                        size_t tag_begin = pos;
+                        size_t end_prefix = pos;
                        pos += pattern.length();
                        size_t end_pos = prompt.find(']', pos);
-                        if (end_pos == std::string::npos) {
-                            break;
-                        }
-                        std::string image_id = prompt.substr(pos, end_pos - pos);
-                        try
+                        if (end_pos != std::string::npos)
                        {
-                            int img_id = std::stoi(image_id);
-                            bool found = false;
-                            for (slot_image &img : slot->images)
+                            std::string image_id = prompt.substr(pos, end_pos - pos);
+                            try
                            {
-                                if (img.id == img_id) {
-                                    found = true;
-                                    // text before this tag, then the media marker
-                                    built_prompt += prompt.substr(copy_from, tag_begin - copy_from);
-                                    built_prompt += marker;
-                                    copy_from = end_pos + 1;
-                                    ordered.push_back(img);
-                                    break;
+                                int img_id = std::stoi(image_id);
+                                bool found = false;
+                                for (slot_image &img : slot->images)
+                                {
+                                    if (img.id == img_id) {
+                                        found = true;
+                                        img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
+                                        begin_prefix = end_pos + 1;
+                                        break;
+                                    }
                                }
-                            }
-                            if (!found) {
-                                LOG("ERROR: Image with id: %i, not found.\n", img_id);
-                                free_images();
+                                if (!found) {
+                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
+                                    slot->images.clear();
+                                    return false;
+                                }
+                            } catch (const std::invalid_argument& e) {
+                                LOG("Invalid image number id in prompt\n");
+                                slot->images.clear();
                                return false;
                            }
-                        } catch (const std::invalid_argument& e) {
-                            LOG("Invalid image number id in prompt\n");
-                            free_images();
-                            return false;
                        }
-                        pos = end_pos + 1;
                    }
-                    // bitmaps are consumed in marker order by mtmd_tokenize()
-                    slot->images = ordered;
-                    slot->mtmd_prompt = built_prompt;
                    slot->prompt = "";
-                    slot->params.input_suffix = prompt.substr(copy_from);
+                    slot->params.input_suffix = prompt.substr(begin_prefix);
                    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
                }
            }
@@ -1183,10 +1176,21 @@ struct llama_server_context

    bool process_images(llama_client_slot &slot) const
    {
-        // With the mtmd pipeline, image encoding is no longer eager: the bitmaps
-        // are tokenized and encoded together with the surrounding text inside
-        // ingest_images() via mtmd_tokenize() + mtmd_helper_eval_chunks(). This
-        // just reports whether the slot carries any images to process.
+        for (slot_image &img : slot.images)
+        {
+            if (!img.request_encode_image)
+            {
+                continue;
+            }
+
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+                LOG("Error processing the given image");
+                return false;
+            }
+
+            img.request_encode_image = false;
+        }
+
        return slot.images.size() > 0;
    }

@@ -1431,70 +1435,69 @@ struct llama_server_context
        }
    }

-    // Tokenize the multimodal prompt (text interleaved with media markers) together
-    // with the slot's bitmaps, then decode the resulting chunks into the llama
-    // context via the high-level mtmd helper. The helper runs llama_decode() on the
-    // text chunks and mtmd_encode() + llama_decode() on the image chunks, handling
-    // batching and any pre/post decode setup (e.g. non-causal attention for gemma3).
-    // Advances slot.n_past by the number of positions consumed, then leaves the
-    // post-image suffix tokens in `batch` so the normal decode + sampling loop
-    // produces the first generated token.
+    // for multiple images processing
    bool ingest_images(llama_client_slot &slot, int n_batch)
    {
-        if (mctx == nullptr)
-        {
-            LOG("%s : multimodal context is not initialized\n", __func__);
-            return false;
-        }
+        int image_idx = 0;

-        // bitmaps stay owned by slot.images (freed on reset()); pass non-owning ptrs
-        std::vector<const mtmd_bitmap *> bitmaps;
-        bitmaps.reserve(slot.images.size());
-        for (const slot_image &img : slot.images)
+        while (image_idx < (int) slot.images.size())
        {
-            bitmaps.push_back(img.bitmap);
-        }
+            slot_image &img = slot.images[image_idx];

-        mtmd_input_text inp_txt;
-        inp_txt.text          = slot.mtmd_prompt.c_str();
-        inp_txt.add_special   = add_bos_token;
-        inp_txt.parse_special = true;
+            // process prefix prompt
+            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
+            {
+                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+                llama_batch batch_view = {
+                    n_tokens,
+                    batch.token    + i,
+                    nullptr,
+                    batch.pos      + i,
+                    batch.n_seq_id + i,
+                    batch.seq_id   + i,
+                    batch.logits   + i,
+                };
+                if (llama_decode(ctx, batch_view))
+                {
+                    LOG("%s : failed to eval\n", __func__);
+                    return false;
+                }
+            }

-        mtmd::input_chunks chunks(mtmd_input_chunks_init());
-        int32_t res = mtmd_tokenize(mctx,
-                                    chunks.ptr.get(),
-                                    &inp_txt,
-                                    bitmaps.data(),
-                                    bitmaps.size());
-        if (res != 0)
-        {
-            LOG("%s : failed to tokenize multimodal prompt, res = %d\n", __func__, res);
-            return false;
-        }
+            // process image with llm
+            for (int i = 0; i < img.image_tokens; i += n_batch)
+            {
+                int n_eval = img.image_tokens - i;
+                if (n_eval > n_batch)
+                {
+                    n_eval = n_batch;
+                }

-        const llama_pos start_pos = (llama_pos) system_tokens.size() + slot.n_past;
-        llama_pos new_n_past = start_pos;
-        if (mtmd_helper_eval_chunks(mctx,
-                                    ctx,
-                                    chunks.ptr.get(),
-                                    start_pos,
-                                    slot.id,
-                                    n_batch,
-                                    /*logits_last=*/ false,
-                                    &new_n_past) != 0)
-        {
-            LOG("%s : failed to eval multimodal chunks\n", __func__);
-            return false;
-        }
-        slot.n_past += (int32_t) (new_n_past - start_pos);
+                const int n_embd = llama_model_n_embd(model);
+                float * embd = img.image_embedding + i * n_embd;
+                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
+                if (llama_decode(ctx, llava_batch.batch))
+                {
+                    LOG("%s : failed to eval image\n", __func__);
+                    return false;
+                }
+                slot.n_past += n_eval;
+            }
+            image_idx++;

-        // queue the post-image suffix text for the normal decode + sampling path
-        common_batch_clear(batch);
-        std::vector<llama_token> suffix_tokens = tokenize(slot.params.input_suffix, false);
-        for (llama_token tok : suffix_tokens)
-        {
-            common_batch_add(batch, tok, system_tokens.size() + slot.n_past, { slot.id }, false);
-            slot.n_past += 1;
+            common_batch_clear(batch);
+
+            // append prefix of next image
+            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
+                slot.params.input_suffix : // no more images, then process suffix prompt
+                (json)(slot.images[image_idx].prefix_prompt);
+
+            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
+            for (int i = 0; i < (int) append_tokens.size(); ++i)
+            {
+                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                slot.n_past += 1;
+            }
        }

        return true;
@@ -1881,11 +1884,8 @@ struct llama_server_context

                    const bool has_images = process_images(slot);

-                    // For the multimodal path the whole pre-image / inter-image text is
-                    // tokenized and decoded inside ingest_images() via mtmd, so no prefix
-                    // tokens are queued here; the post-image suffix is appended by
-                    // ingest_images() for the normal decode + sampling loop.
-                    std::vector<llama_token> prefix_tokens = has_images ? std::vector<llama_token>() : prompt_tokens;
+                    // process the prefix of first image
+                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;

                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;

--- a/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch
+++ b/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch
@@ -0,0 +1,11 @@
+--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
+@@ -2494,7 +2494,7 @@
+             }
+             new_data = work.data();
+
+-            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
+            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr, nullptr);
+         } else {
+             new_type = cur->type;
+             new_data = cur->data;
--- a/backend/cpp/ik-llama-cpp/prepare.sh
+++ b/backend/cpp/ik-llama-cpp/prepare.sh
@@ -17,9 +17,28 @@ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -r utils.hpp llama.cpp/examples/grpc-server/
 cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/

-## Multimodal support is provided by the `mtmd` library target (examples/mtmd/),
-## which the grpc-server links and includes directly. No source copy is needed:
-## clip/llava were pruned upstream and the high-level mtmd_* API is used instead.
+## Copy clip/llava files for multimodal support (built as myclip library)
+cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
+# Prepend llama.h include to llava.h
+echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
+cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
+# Copy clip-impl.h if it exists
+if [ -f llama.cpp/examples/llava/clip-impl.h ]; then
+    cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
+fi
+# Copy stb_image.h
+if [ -f llama.cpp/vendor/stb/stb_image.h ]; then
+    cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
+elif [ -f llama.cpp/common/stb_image.h ]; then
+    cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
+fi
+
+## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd)
+if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then
+    sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp
+fi

 set +e
 if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
--- a/backend/cpp/ik-llama-cpp/utils.hpp
+++ b/backend/cpp/ik-llama-cpp/utils.hpp
@@ -11,12 +11,9 @@

 #include "json.hpp"

-#include "mtmd.h"
+#include "clip.h"

-// mtmd.h and ik_llama's entire server/common stack (chat.h, server-common.h,
-// server-task.h, ...) declare `using json = nlohmann::ordered_json`, so match it
-// here: a plain `nlohmann::json` alias collides with mtmd.h's at global scope.
-using json = nlohmann::ordered_json;
+using json = nlohmann::json;

 extern bool server_verbose;

@@ -114,12 +111,13 @@ struct slot_image
 {
    int32_t id;

-    // mtmd bitmap (image/audio) decoded from the request buffer. Owned by the
-    // slot; freed via mtmd_bitmap_free() on reset. The high-level mtmd pipeline
-    // (mtmd_tokenize + mtmd_helper_eval_chunks) consumes these directly, so the
-    // legacy eager-encode fields (embedding/tokens) and per-image prefix prompt
-    // are no longer needed.
-    mtmd_bitmap * bitmap = nullptr;
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
 };

 // completion token output with probabilities
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -101,13 +101,4 @@ if(LLAMA_GRPC_BUILD_TESTS)
    target_link_libraries(message_content_test PRIVATE ${_LLAMA_COMMON_TARGET})
    target_compile_features(message_content_test PRIVATE cxx_std_17)
    add_test(NAME message_content_test COMMAND message_content_test)
-
-    # Parent-death watcher test (parent_watch.h) — standard library only, but
-    # needs a threading runtime for std::thread.
-    find_package(Threads REQUIRED)
-    add_executable(parent_watch_test parent_watch_test.cpp parent_watch.h)
-    target_include_directories(parent_watch_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
-    target_link_libraries(parent_watch_test PRIVATE Threads::Threads)
-    target_compile_features(parent_watch_test PRIVATE cxx_std_17)
-    add_test(NAME parent_watch_test COMMAND parent_watch_test)
 endif()
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=fdb1db877c526ec90f668eca1b858da5dba85560
+LLAMA_VERSION?=9d5d882d8cd0f0a9283d87ed5e6fe3ee0d925fb1
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
@@ -156,11 +156,11 @@ llama-cpp-grpc: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target ggml-rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc

 llama-cpp-rpc-server: llama-cpp-grpc
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/ggml-rpc-server llama-cpp-rpc-server
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server

 llama.cpp:
 	mkdir -p llama.cpp
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -30,19 +30,6 @@
 #define LOCALAI_HAS_SERVER_SCHEMA 1
 #include "server-schema.cpp"
 #endif
-// server-stream.cpp exists only in llama.cpp after the upstream refactor that
-// added the SSE stream-resumption layer (stream_session/stream_pipe_producer).
-// server-context.cpp calls into it (spipe->cleanup(), stream_aware_should_stop,
-// stream_session_attach_pipe), so its definitions must be part of this
-// translation unit or the link fails with "undefined reference to
-// stream_pipe_producer::cleanup()". The file is self-contained (its only
-// external symbols come from server-common, already pulled in above) and the
-// http route-handler factories it also defines are unused here but harmless.
-// __has_include keeps the source compatible with older pins/forks that predate
-// the split.
-#if __has_include("server-stream.cpp")
-#include "server-stream.cpp"
-#endif
 #include "server-context.cpp"

 // LocalAI
@@ -75,8 +62,6 @@
 #include <windows.h>
 #endif

-#include "parent_watch.h" // best-effort parent-death backstop (see header)
-

 using grpc::Server;
 using grpc::ServerBuilder;
@@ -3444,10 +3429,6 @@ int main(int argc, char** argv) {
    }
  }

-    // Best-effort backstop: self-terminate if the LocalAI process that spawned
-    // us dies without cleaning us up (see parent_watch.h).
-    llama_grpc::start_parent_death_watcher();
-
    server_context ctx_server;
    BackendServiceImpl service(ctx_server);

--- a/backend/cpp/llama-cpp/parent_watch.h
+++ b/backend/cpp/llama-cpp/parent_watch.h
@@ -1,179 +0,0 @@
-// Parent-death watcher (best-effort backstop) for the llama.cpp gRPC backend.
-//
-// LocalAI spawns this backend as a child process and, on a clean shutdown,
-// tears it down itself (SIGTERM -> grace -> SIGKILL). That graceful path only
-// runs when LocalAI receives a catchable signal and lives long enough to run
-// its handlers. If LocalAI is SIGKILLed (e.g. a supervising process's grace
-// period elapses first), that teardown never runs and this backend would be
-// reparented to init and linger, holding VRAM and its listen port.
-//
-// The watcher here is a best-effort backstop for exactly that case: it does
-// NOT replace the graceful teardown, it only covers the "parent vanished
-// without cleaning up" path. It detects reparenting: when the process that
-// spawned this backend dies, the kernel reparents us to the nearest sub-reaper
-// or to init (PID 1), so getppid() stops matching the value captured at
-// startup. This getppid() approach is portable across Linux/macOS (unlike the
-// Linux-only PR_SET_PDEATHSIG), which is why it is used here, mirroring the Go
-// backends' pkg/grpc/parentwatch.go. It is disabled on Windows, which has no
-// equivalent orphan-reparenting semantics.
-//
-// This header is intentionally dependency-free (C++ standard library only) so
-// it can be exercised by a standalone unit test (parent_watch_test.cpp) without
-// building the full llama.cpp + gRPC backend.
-#ifndef LLAMA_GRPC_PARENT_WATCH_H
-#define LLAMA_GRPC_PARENT_WATCH_H
-
-#include <algorithm>
-#include <cctype>
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <functional>
-#include <string>
-#include <thread>
-
-#if !defined(_WIN32)
-#include <unistd.h> // getppid(2), _exit(2)
-#endif
-
-namespace llama_grpc {
-
-// Env var names are shared verbatim with the Go and Python backends for
-// consistency across languages.
-inline const char *kEnvParentWatch()         { return "LOCALAI_BACKEND_PARENT_WATCH"; }
-inline const char *kEnvParentWatchInterval() { return "LOCALAI_BACKEND_PARENT_WATCH_INTERVAL"; }
-
-// Default poll interval in milliseconds. Matches the Go side's 2 * time.Second.
-inline long parent_watch_default_interval_ms() { return 2000; }
-
-namespace detail {
-inline std::string trim_lower(const std::string &in, bool lower) {
-    size_t a = in.find_first_not_of(" \t\r\n");
-    size_t b = in.find_last_not_of(" \t\r\n");
-    if (a == std::string::npos) {
-        return "";
-    }
-    std::string s = in.substr(a, b - a + 1);
-    if (lower) {
-        std::transform(s.begin(), s.end(), s.begin(),
-                       [](unsigned char c) { return std::tolower(c); });
-    }
-    return s;
-}
-} // namespace detail
-
-// parent_watch_enabled reports whether the watcher should run. Enabled by
-// default; a falsey value ("false"/"0"/"no"/"off", case-insensitive) disables
-// it, matching the Go implementation's exact semantics.
-inline bool parent_watch_enabled() {
-#if defined(_WIN32)
-    return false;
-#else
-    const char *v = std::getenv(kEnvParentWatch());
-    if (v == nullptr || v[0] == '\0') {
-        return true;
-    }
-    const std::string s = detail::trim_lower(v, true);
-    return !(s == "false" || s == "0" || s == "no" || s == "off");
-#endif
-}
-
-// parent_watch_interval_ms returns the poll interval in milliseconds. Accepts
-// Go-style duration strings ("500ms", "2s", "1m") for cross-language parity, or
-// a bare number interpreted as seconds. Defaults to
-// parent_watch_default_interval_ms().
-inline long parent_watch_interval_ms() {
-    const long def = parent_watch_default_interval_ms();
-    const char *v = std::getenv(kEnvParentWatchInterval());
-    if (v == nullptr || v[0] == '\0') {
-        return def;
-    }
-    const std::string s = detail::trim_lower(v, false);
-    if (s.empty()) {
-        return def;
-    }
-    size_t i = 0;
-    while (i < s.size() && (std::isdigit((unsigned char)s[i]) || s[i] == '.')) {
-        i++;
-    }
-    if (i == 0) {
-        return def;
-    }
-    double num = 0.0;
-    try {
-        num = std::stod(s.substr(0, i));
-    } catch (...) {
-        return def;
-    }
-    const std::string unit = s.substr(i);
-    long ms;
-    if (unit == "ms") {
-        ms = (long)num;
-    } else if (unit == "s" || unit.empty()) {
-        ms = (long)(num * 1000.0);
-    } else if (unit == "m") {
-        ms = (long)(num * 60000.0);
-    } else {
-        return def; // unrecognized unit
-    }
-    return ms > 0 ? ms : def;
-}
-
-#if !defined(_WIN32)
-// parent_died reports whether this process has been reparented away from the
-// parent it had when the watcher started. Reparenting is the standard POSIX
-// signal that the original parent (here, the LocalAI process that spawned this
-// backend) has exited: the orphan is handed to the nearest sub-reaper or to
-// init (PID 1), so getppid() no longer matches the value captured at startup.
-inline bool parent_died(pid_t orig_ppid) {
-    const pid_t ppid = getppid();
-    return ppid != orig_ppid || ppid == 1;
-}
-
-// watch_parent_death polls until parent_died reports the original parent is
-// gone, then invokes on_death. It blocks, so run it on its own thread.
-inline void watch_parent_death(pid_t orig_ppid, long interval_ms,
-                               const std::function<void()> &on_death) {
-    for (;;) {
-        std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms));
-        if (parent_died(orig_ppid)) {
-            on_death();
-            return;
-        }
-    }
-}
-#endif
-
-// start_parent_death_watcher installs the best-effort safety net described in
-// the file header on the calling backend process. It is a no-op when disabled,
-// on Windows, or when the process is already orphaned at startup
-// (getppid() <= 1). This is a backstop alongside — never a replacement for —
-// LocalAI's graceful teardown.
-inline void start_parent_death_watcher() {
-#if !defined(_WIN32)
-    if (!parent_watch_enabled()) {
-        return;
-    }
-    const pid_t orig_ppid = getppid();
-    // A parent of 1 (or less) at startup means we were already orphaned (or
-    // launched directly under init) — there is no original parent to watch for.
-    if (orig_ppid <= 1) {
-        return;
-    }
-    const long interval_ms = parent_watch_interval_ms();
-    std::thread([orig_ppid, interval_ms]() {
-        watch_parent_death(orig_ppid, interval_ms, [orig_ppid]() {
-            fprintf(stderr,
-                    "backend parent process (pid %d) exited without stopping "
-                    "this backend; self-terminating to avoid orphaning\n",
-                    (int)orig_ppid);
-            fflush(stderr);
-            _exit(1);
-        });
-    }).detach();
-#endif
-}
-
-} // namespace llama_grpc
-
-#endif // LLAMA_GRPC_PARENT_WATCH_H
--- a/backend/cpp/llama-cpp/parent_watch_test.cpp
+++ b/backend/cpp/llama-cpp/parent_watch_test.cpp
@@ -1,197 +0,0 @@
-// Unit tests for the parent-death watcher (parent_watch.h).
-//
-// Build & run standalone (C++ standard library only, no nlohmann/json needed):
-//   g++ -std=c++17 -pthread parent_watch_test.cpp -o t && ./t
-//
-// The core test (TestDetectsReparent) builds a genuine two-level process tree
-// (test -> middle -> grandchild), lets the middle process die, and asserts the
-// grandchild's watch_parent_death detects the reparenting and self-terminates —
-// mirroring the Go test in pkg/grpc/parentwatch_test.go, but with fork(2).
-//
-// On Windows this file compiles to a no-op success (the watcher is unsupported
-// there), matching parent_watch.h's platform gating.
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-
-#include "parent_watch.h"
-
-static int failures = 0;
-
-static void check(bool ok, const std::string &name) {
-    if (!ok) {
-        failures++;
-        fprintf(stderr, "FAIL: %s\n", name.c_str());
-    } else {
-        fprintf(stderr, "ok:   %s\n", name.c_str());
-    }
-}
-
-// Env-parsing tests are platform-independent and always run.
-static void test_env_parsing() {
-    using namespace llama_grpc;
-
-    // Interval: default when unset.
-    unsetenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL");
-    check(parent_watch_interval_ms() == 2000, "interval default 2000ms");
-
-    setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "500ms", 1);
-    check(parent_watch_interval_ms() == 500, "interval 500ms");
-
-    setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "2s", 1);
-    check(parent_watch_interval_ms() == 2000, "interval 2s");
-
-    setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "1m", 1);
-    check(parent_watch_interval_ms() == 60000, "interval 1m");
-
-    setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "3", 1); // bare number -> seconds
-    check(parent_watch_interval_ms() == 3000, "interval bare 3 -> 3000ms");
-
-    setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "garbage", 1);
-    check(parent_watch_interval_ms() == 2000, "interval garbage -> default");
-    unsetenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL");
-
-#if !defined(_WIN32)
-    // Enabled semantics (POSIX only; always false on Windows).
-    unsetenv("LOCALAI_BACKEND_PARENT_WATCH");
-    check(parent_watch_enabled(), "enabled by default");
-
-    for (const char *falsey : {"false", "0", "no", "off", "OFF", " False "}) {
-        setenv("LOCALAI_BACKEND_PARENT_WATCH", falsey, 1);
-        check(!parent_watch_enabled(), std::string("disabled by '") + falsey + "'");
-    }
-    setenv("LOCALAI_BACKEND_PARENT_WATCH", "true", 1);
-    check(parent_watch_enabled(), "enabled by 'true'");
-    setenv("LOCALAI_BACKEND_PARENT_WATCH", "1", 1);
-    check(parent_watch_enabled(), "enabled by '1'");
-    unsetenv("LOCALAI_BACKEND_PARENT_WATCH");
-#endif
-}
-
-#if !defined(_WIN32)
-
-#include <atomic>
-#include <ctime>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-static bool file_exists(const std::string &p) {
-    struct stat st;
-    return ::stat(p.c_str(), &st) == 0;
-}
-
-static bool wait_for_file(const std::string &p, int timeout_ms) {
-    int waited = 0;
-    while (waited < timeout_ms) {
-        if (file_exists(p)) {
-            return true;
-        }
-        usleep(20 * 1000);
-        waited += 20;
-    }
-    return false;
-}
-
-static void write_file(const std::string &p, const std::string &content) {
-    FILE *f = fopen(p.c_str(), "w");
-    if (f) {
-        fwrite(content.data(), 1, content.size(), f);
-        fclose(f);
-    }
-}
-
-// Builds test -> middle -> grandchild via fork(2). The grandchild arms the REAL
-// watch_parent_death against middle; middle exits, orphaning the grandchild;
-// the watcher must detect the reparenting and self-terminate.
-static void test_detects_reparent() {
-    char tmpl[] = "/tmp/parentwatch_test_XXXXXX";
-    char *dir = mkdtemp(tmpl);
-    if (dir == nullptr) {
-        check(false, "mkdtemp");
-        return;
-    }
-    const std::string ready_file = std::string(dir) + "/ready";
-    const std::string exited_file = std::string(dir) + "/exited";
-
-    pid_t middle = fork();
-    if (middle < 0) {
-        check(false, "fork middle");
-        return;
-    }
-
-    if (middle == 0) {
-        // ---- middle process ----
-        pid_t grandchild = fork();
-        if (grandchild < 0) {
-            _exit(4);
-        }
-        if (grandchild == 0) {
-            // ---- grandchild process ----
-            pid_t orig_ppid = getppid(); // == middle
-            std::thread([&]() {
-                llama_grpc::watch_parent_death(orig_ppid, 50 /*ms*/, [&]() {
-                    write_file(exited_file, "1");
-                    _exit(7);
-                });
-            }).detach();
-
-            // Safety valve: never linger if something goes wrong.
-            std::thread([]() {
-                usleep(30 * 1000 * 1000);
-                _exit(2);
-            }).detach();
-
-            // Signal readiness only after the watcher captured orig_ppid.
-            write_file(ready_file, std::to_string(getpid()));
-            for (;;) {
-                pause();
-            }
-        }
-        // middle: wait until grandchild is ready, then exit to orphan it.
-        if (!wait_for_file(ready_file, 10000)) {
-            _exit(5);
-        }
-        _exit(0);
-    }
-
-    // ---- test (top) process ----
-    int status = 0;
-    waitpid(middle, &status, 0); // reap middle only; grandchild is orphaned
-
-    check(file_exists(ready_file), "grandchild signaled readiness");
-
-    bool detected = wait_for_file(exited_file, 10000);
-    check(detected, "watcher detected parent death and self-terminated");
-
-    // Best-effort cleanup: kill the grandchild if it somehow survived.
-    if (file_exists(ready_file)) {
-        FILE *f = fopen(ready_file.c_str(), "r");
-        if (f) {
-            int pid = 0;
-            if (fscanf(f, "%d", &pid) == 1 && pid > 1) {
-                kill(pid, SIGKILL);
-            }
-            fclose(f);
-        }
-    }
-    unlink(ready_file.c_str());
-    unlink(exited_file.c_str());
-    rmdir(dir);
-}
-
-#endif // !_WIN32
-
-int main() {
-    test_env_parsing();
-#if !defined(_WIN32)
-    test_detects_reparent();
-#endif
-    if (failures == 0) {
-        fprintf(stderr, "\nAll parent_watch tests passed.\n");
-        return 0;
-    }
-    fprintf(stderr, "\n%d parent_watch test(s) failed.\n", failures);
-    return 1;
-}
--- a/backend/cpp/llama-cpp/prepare.sh
+++ b/backend/cpp/llama-cpp/prepare.sh
@@ -22,10 +22,6 @@ cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
 # unit test (compiled only when -DLLAMA_GRPC_BUILD_TESTS=ON).
 cp -r message_content.h llama.cpp/tools/grpc-server/
 cp -r message_content_test.cpp llama.cpp/tools/grpc-server/
-# Parent-death watcher (included by grpc-server.cpp) and its standalone unit
-# test (run via backend/cpp/run-unit-tests.sh; also buildable under ctest).
-cp -r parent_watch.h llama.cpp/tools/grpc-server/
-cp -r parent_watch_test.cpp llama.cpp/tools/grpc-server/
 cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
 cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/

--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server

-PRIVACY_FILTER_VERSION?=735a6c28607ee82afc3a670383f41b55266a3b9a
+PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=

--- a/backend/cpp/run-unit-tests.sh
+++ b/backend/cpp/run-unit-tests.sh
@@ -54,7 +54,7 @@ for test_src in "${tests[@]}"; do
    name="$(basename "$test_src" .cpp)"
    bin="$(mktemp -d)/$name"
    echo "==> $test_src"
-    if ! "$CXX" -std=c++17 -Wall -Wextra -pthread \
+    if ! "$CXX" -std=c++17 -Wall -Wextra \
        -I"$JSON_INC" -I"$(dirname "$test_src")" \
        "$test_src" -o "$bin"; then
        echo "COMPILE FAILED: $test_src" >&2
--- a/backend/go/cloud-proxy/provider_anthropic.go
+++ b/backend/go/cloud-proxy/provider_anthropic.go
@@ -142,12 +142,19 @@ func buildAnthropicRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream boo
 	if req.MaxTokens <= 0 {
 		req.MaxTokens = anthropicDefaultMaxTokens
 	}
-	// Do not forward temperature/top_p. Newer Anthropic reasoning models reject
-	// requests that carry temperature ("`temperature` is deprecated for this
-	// model"), and the OpenAI-compatible clients typically send only the
-	// server-side DEFAULT sampling values rather than user intent — dropping
-	// them loses nothing and lets the upstream apply its own defaults.
-	_ = opts
+	// Newer Anthropic models 400 when both temperature and top_p are
+	// set ("`temperature` and `top_p` cannot both be specified for
+	// this model. Please use only one.") even though their docs only
+	// "recommend" picking one. The OpenAI-compatible chat UI almost
+	// always sends both with default values, so prefer temperature
+	// and drop top_p when both are present.
+	if t := opts.GetTemperature(); t != 0 {
+		v := float64(t)
+		req.Temperature = &v
+	} else if t := opts.GetTopP(); t != 0 {
+		v := float64(t)
+		req.TopP = &v
+	}

 	req.Tools = convertOpenAITools(opts.GetTools())
 	req.ToolChoice = convertOpenAIToolChoice(opts.GetToolChoice())
--- a/backend/go/cloud-proxy/provider_anthropic_test.go
+++ b/backend/go/cloud-proxy/provider_anthropic_test.go
@@ -3,6 +3,7 @@ package main
 import (
 	"encoding/json"
 	"io"
+	"math"
 	"net/http"
 	"net/http/httptest"
 	"strings"
@@ -74,16 +75,15 @@ func TestPredict_Anthropic_BasicMessages(t *testing.T) {
 	g.Expect(captured.Messages).To(HaveLen(1))
 	g.Expect(captured.Messages[0].Role).To(Equal("user"))
 	g.Expect(captured.MaxTokens).To(Equal(int32(32)))
-	// Newer Anthropic reasoning models reject requests carrying temperature
-	// ("`temperature` is deprecated for this model"); clients typically send
-	// only default sampling values, so the translator forwards neither.
-	g.Expect(captured.Temperature).To(BeNil())
+	g.Expect(captured.Temperature).NotTo(BeNil())
+	g.Expect(*captured.Temperature).To(Equal(0.5))
+	// Anthropic 400s when both temperature and top_p are set; the
+	// translator must prefer temperature and drop top_p.
 	g.Expect(captured.TopP).To(BeNil())
 	g.Expect(captured.Stream).To(BeFalse())
 }

-// Sampling parameters are not forwarded at all — the upstream applies its
-// own defaults (newest models reject explicit temperature/top_p).
+// When only top_p is set, it should be forwarded.
 func TestPredict_Anthropic_TopPOnly(t *testing.T) {
 	g := NewWithT(t)
 	srv, captured := fakeAnthropicUpstream(t, func(_ anthropicRequest) (int, string, string) {
@@ -99,7 +99,11 @@ func TestPredict_Anthropic_TopPOnly(t *testing.T) {
 	})
 	g.Expect(err).NotTo(HaveOccurred())
 	g.Expect(captured.Temperature).To(BeNil())
-	g.Expect(captured.TopP).To(BeNil())
+	// PredictOptions.TopP is float32 on the wire; the translator widens
+	// to float64 so 0.9 round-trips as 0.8999999761581421… — compare
+	// with a small tolerance rather than exact equality.
+	g.Expect(captured.TopP).NotTo(BeNil())
+	g.Expect(math.Abs(*captured.TopP - 0.9)).To(BeNumerically("<=", 1e-6))
 }

 func TestPredict_Anthropic_DefaultsMaxTokens(t *testing.T) {
--- a/backend/go/cloud-proxy/provider_openai.go
+++ b/backend/go/cloud-proxy/provider_openai.go
@@ -30,7 +30,7 @@ type openAIRequest struct {
 	Stream           bool            `json:"stream,omitempty"`
 	Temperature      *float64        `json:"temperature,omitempty"`
 	TopP             *float64        `json:"top_p,omitempty"`
-	MaxTokens        *int32          `json:"max_completion_tokens,omitempty"` // newer OpenAI models reject max_tokens ("use max_completion_tokens instead")
+	MaxTokens        *int32          `json:"max_tokens,omitempty"`
 	Stop             []string        `json:"stop,omitempty"`
 	FrequencyPenalty *float64        `json:"frequency_penalty,omitempty"`
 	PresencePenalty  *float64        `json:"presence_penalty,omitempty"`
@@ -107,10 +107,14 @@ func buildOpenAIRequest(opts *pb.PredictOptions, cfg *proxyConfig, stream bool)
 		Tools:      parseRawJSON(opts.GetTools()),
 		ToolChoice: parseRawJSON(opts.GetToolChoice()),
 	}
-	// Do not forward temperature/top_p. Newer OpenAI reasoning models reject
-	// temperature as deprecated, and clients typically send only default
-	// sampling values rather than user intent — let the upstream apply its
-	// own defaults.
+	if t := opts.GetTemperature(); t != 0 {
+		v := float64(t)
+		req.Temperature = &v
+	}
+	if t := opts.GetTopP(); t != 0 {
+		v := float64(t)
+		req.TopP = &v
+	}
 	if n := opts.GetTokens(); n > 0 {
 		req.MaxTokens = &n
 	}
--- a/backend/go/cloud-proxy/provider_openai_test.go
+++ b/backend/go/cloud-proxy/provider_openai_test.go
@@ -74,9 +74,8 @@ func TestPredict_OpenAI_BasicChat(t *testing.T) {
 	g.Expect(captured.Messages).To(HaveLen(2))
 	g.Expect(captured.Messages[0].Role).To(Equal("system"))
 	g.Expect(captured.Messages[1].Role).To(Equal("user"))
-	// Sampling parameters are not forwarded (newest models reject explicit
-	// temperature); token limit is serialized as max_completion_tokens.
-	g.Expect(captured.Temperature).To(BeNil())
+	g.Expect(captured.Temperature).NotTo(BeNil())
+	g.Expect(*captured.Temperature).To(Equal(0.5))
 	g.Expect(captured.MaxTokens).NotTo(BeNil())
 	g.Expect(*captured.MaxTokens).To(Equal(int32(32)))
 	g.Expect(captured.Stream).To(BeFalse())
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=9a26976a8c8cf5af0afcdd04463cf8ba91e96a54
+CRISPASR_VERSION?=8f1218141b792b8868861c1af17ba1e361b05dc0
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/face-detect/.gitignore
+++ b/backend/go/face-detect/.gitignore
@@ -1,18 +0,0 @@
-# Fetched upstream sources
-sources/
-
-# CMake build directories
-build*/
-
-# build artifacts staged in-tree by the Makefile (cp from sources/) or
-# symlinked for local dev; the real sources live in face-detect.cpp upstream.
-*.so
-*.so.*
-facedetect_capi.h
-compile_commands.json
-
-# Compiled backend binary
-face-detect-grpc
-
-# Packaging output
-package/
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -1,110 +0,0 @@
-# face-detect backend Makefile.
-#
-# Upstream pin lives below as FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
-# can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
-# convention).
-#
-# Local dev shortcut: if you already have an out-of-tree face-detect.cpp build,
-# symlink the .so + header into this directory and skip the clone/cmake steps:
-#
-#   ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so .
-#   ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h .
-#   go build -o face-detect-grpc .
-#
-# The default target below does the proper clone-at-pin + cmake build so CI does
-# not need a side-checkout.
-
-FACEDETECT_VERSION?=e22260d5d5490b37b021b7f795079f386d553afd
-FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
-
-GOCMD?=go
-GO_TAGS?=
-JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
-
-BUILD_TYPE?=
-NATIVE?=false
-
-# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
-# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
-RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
-
-# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
-# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
-# only system libs (libstdc++/libgomp/libc) the runtime image already provides.
-# The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++
-# side, so only the facedetect_capi_* surface is exported.
-CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-
-ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
-endif
-
-# face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
-# does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
-# -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
-# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
-ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
-	# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, SCRFD 2.3x
-	# vs torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
-	# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
-	# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
-	ifeq ($(CUDA_MAJOR_VERSION),13)
-	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
-		CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
-	endif
-	endif
-else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
-else ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
-else ifeq ($(BUILD_TYPE),metal)
-	CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
-endif
-
-.PHONY: face-detect-grpc package build clean purge test all
-
-all: face-detect-grpc
-
-# Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
-# as the target so make only re-clones when missing. After a FACEDETECT_VERSION
-# bump, run 'make purge && make' to refetch.
-sources/face-detect.cpp:
-	mkdir -p sources/face-detect.cpp
-	cd sources/face-detect.cpp && \
-	git init -q && \
-	git remote add origin $(FACEDETECT_REPO) && \
-	git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
-	git checkout FETCH_HEAD && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-# Build the shared lib + header out-of-tree, then stage them next to the Go
-# sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
-# them up.
-libfacedetect.so: sources/face-detect.cpp
-	cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
-	cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
-	cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
-	cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
-
-face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
-	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
-
-package: face-detect-grpc
-	bash package.sh
-
-build: package
-
-# Test target. The embed/detect/verify/analyze smoke specs are gated on
-# FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
-# heavy specs auto-skip and only the pure-Go parsing specs run.
-test:
-	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
-
-clean: purge
-	rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
-
-purge:
-	rm -rf sources/face-detect.cpp
--- a/backend/go/face-detect/gofacedetect.go
+++ b/backend/go/face-detect/gofacedetect.go
@@ -1,431 +0,0 @@
-package main
-
-import (
-	"encoding/base64"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"math"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"time"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/xlog"
-)
-
-// purego-bound entry points from libfacedetect.so. Names match
-// facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
-// is enough to spot drift.
-//
-// The opaque ctx and the malloc'd char*/float* return values are declared as
-// uintptr so we get the raw pointer back and can release it via the matching
-// capi free function. purego's native string/[]float32 returns would copy and
-// forget the original pointer, leaking the C-owned buffer on every call.
-var (
-	CppAbiVersion  func() int32
-	CppLoad        func(ggufPath string) uintptr
-	CppFree        func(ctx uintptr)
-	CppLastError   func(ctx uintptr) string
-	CppFreeString  func(s uintptr)
-	CppFreeVec     func(v uintptr)
-	CppEmbedPath   func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
-	CppEmbedRGB    func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
-	CppDetectJSON  func(ctx uintptr, imagePath string) uintptr
-	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
-	CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
-)
-
-// FaceDetect implements the face-recognition (biometric) subset of the Backend
-// gRPC service over libfacedetect.so. The C side keeps a single loaded model
-// pack plus a per-ctx last-error buffer and is not reentrant, so
-// base.SingleThread serializes every call.
-type FaceDetect struct {
-	base.SingleThread
-	opts   loadOptions
-	ctxPtr uintptr
-}
-
-func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
-	model := opts.ModelFile
-	if model == "" {
-		model = opts.ModelPath
-	}
-	if !filepath.IsAbs(model) && opts.ModelPath != "" {
-		model = filepath.Join(opts.ModelPath, model)
-	}
-	if model == "" {
-		return errors.New("face-detect: ModelFile is required")
-	}
-
-	f.opts = parseOptions(opts.Options)
-	if f.opts.modelName == "" {
-		f.opts.modelName = filepath.Base(model)
-	}
-
-	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
-	// one backend process per model and serves requests concurrently, so the
-	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
-	// FACEDETECT_THREADS is read by the engine at backend construction, so it
-	// must be set before the capi load. A non-positive Threads means "unset":
-	// leave the env alone so the engine keeps its sane default.
-	threads := opts.Threads
-	if threads > 0 {
-		if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
-			return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
-		}
-		xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
-	}
-
-	xlog.Info("face-detect: loading model", "model", model,
-		"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
-
-	ctx := CppLoad(model)
-	if ctx == 0 {
-		// The last-error buffer lives on the ctx that was never returned, so
-		// surface the path the operator tried to load instead.
-		return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
-	}
-	f.ctxPtr = ctx
-	return nil
-}
-
-// Embeddings returns the L2-normalized ArcFace embedding of the primary face in
-// the supplied image. Mirroring the Python face backend, the image is read from
-// Images[0] as a base64 payload; materializeImage decodes it to a temp file so
-// the path-based C-API can run its own decode (cv2.imread parity). The gRPC
-// server wraps the returned slice in an EmbeddingResult.
-func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
-	if f.ctxPtr == 0 {
-		return nil, errors.New("face-detect: model not loaded")
-	}
-	if len(req.Images) == 0 || req.Images[0] == "" {
-		return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
-	}
-
-	path, cleanup, err := materializeImage(req.Images[0])
-	if err != nil {
-		return nil, err
-	}
-	defer cleanup()
-
-	return f.embedPath(path)
-}
-
-func (f *FaceDetect) embedPath(path string) ([]float32, error) {
-	var vec uintptr
-	var dim int32
-	rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
-	if rc != 0 || vec == 0 || dim <= 0 {
-		return nil, f.lastErr("embed", path)
-	}
-	defer CppFreeVec(vec)
-	// Copy out of the C-owned malloc'd buffer before freeing it. The
-	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
-	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
-	// nor moves this buffer and we copy immediately.
-	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
-	out := make([]float32, int(dim))
-	copy(out, src)
-	return out, nil
-}
-
-// Detect runs SCRFD over the image and returns one Detection per face. The
-// C-API emits a box as [x1,y1,x2,y2] in pixels; the proto carries x/y plus
-// width/height, so the corners are converted. The 5 facial landmarks the engine
-// also returns are dropped: the Detection message has no field for them.
-func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
-	if f.ctxPtr == 0 {
-		return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
-	}
-	if req.Src == "" {
-		return pb.DetectResponse{}, errors.New("face-detect: src image is required")
-	}
-
-	path, cleanup, err := materializeImage(req.Src)
-	if err != nil {
-		return pb.DetectResponse{}, err
-	}
-	defer cleanup()
-
-	faces, err := f.detectFaces(path)
-	if err != nil {
-		return pb.DetectResponse{}, err
-	}
-
-	dets := make([]*pb.Detection, 0, len(faces))
-	for _, fc := range faces {
-		if req.Threshold > 0 && fc.Score < req.Threshold {
-			continue
-		}
-		x, y, w, h := fc.xywh()
-		dets = append(dets, &pb.Detection{
-			X:          x,
-			Y:          y,
-			Width:      w,
-			Height:     h,
-			Confidence: fc.Score,
-			ClassName:  "face",
-		})
-	}
-	return pb.DetectResponse{Detections: dets}, nil
-}
-
-// FaceVerify embeds the primary face in each image and reports whether they are
-// the same identity by cosine distance against a threshold. A request threshold
-// <= 0 falls back to the model-configured default (verify_threshold option,
-// 0.35 if unset). When anti_spoofing is set, the C-API applies a MiniFASNet
-// veto internally (verified forced false on a spoof); the per-image liveness
-// scores are not exposed by the verify entry point, so img*_is_real /
-// img*_antispoof_score stay at their zero values.
-func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
-	if f.ctxPtr == 0 {
-		return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
-	}
-	if req.Img1 == "" || req.Img2 == "" {
-		return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
-	}
-
-	path1, cleanup1, err := materializeImage(req.Img1)
-	if err != nil {
-		return pb.FaceVerifyResponse{}, err
-	}
-	defer cleanup1()
-	path2, cleanup2, err := materializeImage(req.Img2)
-	if err != nil {
-		return pb.FaceVerifyResponse{}, err
-	}
-	defer cleanup2()
-
-	threshold := req.Threshold
-	if threshold <= 0 {
-		threshold = f.opts.verifyThreshold
-	}
-
-	antiSpoof := int32(0)
-	if req.AntiSpoofing {
-		antiSpoof = 1
-	}
-
-	started := time.Now()
-	var distance float32
-	var verified int32
-	rc := CppVerifyPaths(f.ctxPtr, path1, path2, threshold, antiSpoof,
-		unsafe.Pointer(&distance), unsafe.Pointer(&verified))
-	if rc != 0 {
-		return pb.FaceVerifyResponse{}, f.lastErr("verify", req.Img1[:min(8, len(req.Img1))]+"...")
-	}
-	elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
-
-	// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
-	// matching the Python face backend's reporting.
-	confidence := float32(0)
-	if threshold > 0 {
-		confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
-	}
-
-	return pb.FaceVerifyResponse{
-		Verified:         verified != 0,
-		Distance:         distance,
-		Threshold:        threshold,
-		Confidence:       confidence,
-		Model:            f.opts.modelName,
-		Img1Area:         f.bestArea(path1),
-		Img2Area:         f.bestArea(path2),
-		ProcessingTimeMs: elapsedMs,
-	}, nil
-}
-
-// FaceAnalyze runs the genderage head on every detected face. The C-API returns
-// "M"/"F" gender labels and a rounded age; the labels are normalized to the
-// "Man"/"Woman" values the proto documents.
-func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
-	if f.ctxPtr == 0 {
-		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
-	}
-	if req.Img == "" {
-		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
-	}
-
-	path, cleanup, err := materializeImage(req.Img)
-	if err != nil {
-		return pb.FaceAnalyzeResponse{}, err
-	}
-	defer cleanup()
-
-	ptr := CppAnalyzeJSON(f.ctxPtr, path)
-	if ptr == 0 {
-		return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", path)
-	}
-	defer CppFreeString(ptr)
-
-	faces, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
-	if err != nil {
-		return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", err)
-	}
-	return pb.FaceAnalyzeResponse{Faces: faces}, nil
-}
-
-// faceBox is one entry of the detect/analyze JSON documents the engine emits.
-type faceBox struct {
-	Score  float32   `json:"score"`
-	Box    []float32 `json:"box"`
-	Age    float32   `json:"age"`
-	Gender string    `json:"gender"`
-}
-
-// xywh converts the engine's [x1,y1,x2,y2] box into the x/y/width/height the
-// proto carries. A short or missing box yields zeros.
-func (b faceBox) xywh() (x, y, w, h float32) {
-	if len(b.Box) < 4 {
-		return 0, 0, 0, 0
-	}
-	return b.Box[0], b.Box[1], b.Box[2] - b.Box[0], b.Box[3] - b.Box[1]
-}
-
-type facesJSON struct {
-	Faces []faceBox `json:"faces"`
-}
-
-func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
-	ptr := CppDetectJSON(f.ctxPtr, path)
-	if ptr == 0 {
-		return nil, f.lastErr("detect", path)
-	}
-	defer CppFreeString(ptr)
-
-	var doc facesJSON
-	if err := json.Unmarshal([]byte(goStringFromCPtr(ptr)), &doc); err != nil {
-		return nil, fmt.Errorf("face-detect: detect JSON: %w", err)
-	}
-	return doc.Faces, nil
-}
-
-// bestArea returns the FacialArea of the highest-scoring face in an image, or an
-// empty area when detection fails or finds nothing. Best-effort: verify already
-// succeeded, so a missing region must not turn a valid match into an error.
-func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
-	faces, err := f.detectFaces(path)
-	if err != nil || len(faces) == 0 {
-		return &pb.FacialArea{}
-	}
-	best := faces[0]
-	for _, fc := range faces[1:] {
-		if fc.Score > best.Score {
-			best = fc
-		}
-	}
-	x, y, w, h := best.xywh()
-	return &pb.FacialArea{X: x, Y: y, W: w, H: h}
-}
-
-// parseAnalyzeJSON maps the engine's analyze document onto FaceAnalysis entries.
-// The engine reports gender as "M"/"F"; both the dominant label and the score
-// map are filled with the "Man"/"Woman" form the proto documents.
-func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
-	var parsed facesJSON
-	if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
-		return nil, err
-	}
-
-	out := make([]*pb.FaceAnalysis, 0, len(parsed.Faces))
-	for _, fc := range parsed.Faces {
-		x, y, w, h := fc.xywh()
-		fa := &pb.FaceAnalysis{
-			Region:         &pb.FacialArea{X: x, Y: y, W: w, H: h},
-			FaceConfidence: fc.Score,
-			Age:            fc.Age,
-		}
-		if label := normalizeGender(fc.Gender); label != "" {
-			fa.DominantGender = label
-			fa.Gender = map[string]float32{label: 1.0}
-		}
-		out = append(out, fa)
-	}
-	return out, nil
-}
-
-// normalizeGender maps the engine's "M"/"F" code to the "Man"/"Woman" labels the
-// proto documents. Unknown codes pass through unchanged.
-func normalizeGender(g string) string {
-	switch strings.ToUpper(strings.TrimSpace(g)) {
-	case "M":
-		return "Man"
-	case "F":
-		return "Woman"
-	case "":
-		return ""
-	default:
-		return g
-	}
-}
-
-// materializeImage decodes a base64 image payload into a temp file and returns
-// its path plus a cleanup func. As a convenience for callers that already pass a
-// filesystem path (e.g. a test fixture), an existing path is used as-is with a
-// no-op cleanup. data: URI prefixes are stripped before decoding.
-func materializeImage(src string) (path string, cleanup func(), err error) {
-	noop := func() {}
-	if src == "" {
-		return "", noop, errors.New("face-detect: empty image input")
-	}
-	if _, statErr := os.Stat(src); statErr == nil {
-		return src, noop, nil
-	}
-
-	payload := src
-	if i := strings.Index(payload, ","); strings.HasPrefix(payload, "data:") && i >= 0 {
-		payload = payload[i+1:]
-	}
-	data, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(payload))
-	if decErr != nil || len(data) == 0 {
-		return "", noop, errors.New("face-detect: image is neither an existing path nor valid base64")
-	}
-
-	tmp, createErr := os.CreateTemp("", "face-detect-*.img")
-	if createErr != nil {
-		return "", noop, fmt.Errorf("face-detect: create temp image: %w", createErr)
-	}
-	cleanup = func() { _ = os.Remove(tmp.Name()) }
-	if _, wErr := tmp.Write(data); wErr != nil {
-		_ = tmp.Close()
-		cleanup()
-		return "", noop, fmt.Errorf("face-detect: write temp image: %w", wErr)
-	}
-	if cErr := tmp.Close(); cErr != nil {
-		cleanup()
-		return "", noop, fmt.Errorf("face-detect: close temp image: %w", cErr)
-	}
-	return tmp.Name(), cleanup, nil
-}
-
-// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
-func (f *FaceDetect) lastErr(op, subject string) error {
-	msg := strings.TrimSpace(CppLastError(f.ctxPtr))
-	if msg == "" {
-		msg = "no error detail"
-	}
-	return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, msg)
-}
-
-// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
-// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
-//
-// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
-// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
-// moves the buffer and we dereference it immediately to copy the bytes out.
-func goStringFromCPtr(cptr uintptr) string {
-	if cptr == 0 {
-		return ""
-	}
-	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
-	n := 0
-	for *(*byte)(unsafe.Add(p, n)) != 0 {
-		n++
-	}
-	return string(unsafe.Slice((*byte)(p), n))
-}
--- a/backend/go/face-detect/gofacedetect_test.go
+++ b/backend/go/face-detect/gofacedetect_test.go
@@ -1,230 +0,0 @@
-package main
-
-import (
-	"encoding/base64"
-	"os"
-	"sync"
-	"testing"
-
-	"github.com/ebitengine/purego"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestFaceDetect(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "face-detect Backend Suite")
-}
-
-var (
-	libLoadOnce sync.Once
-	libLoadErr  error
-)
-
-// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
-// bridge without spinning up the gRPC server. Records the error (the smoke
-// specs skip themselves) when libfacedetect.so is not loadable from cwd
-// (LD_LIBRARY_PATH or a symlink in ./).
-func ensureLibLoaded() error {
-	libLoadOnce.Do(func() {
-		libName := os.Getenv("FACEDETECT_LIBRARY")
-		if libName == "" {
-			libName = "libfacedetect.so"
-		}
-		lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-		if err != nil {
-			libLoadErr = err
-			return
-		}
-		purego.RegisterLibFunc(&CppAbiVersion, lib, "facedetect_capi_abi_version")
-		purego.RegisterLibFunc(&CppLoad, lib, "facedetect_capi_load")
-		purego.RegisterLibFunc(&CppFree, lib, "facedetect_capi_free")
-		purego.RegisterLibFunc(&CppLastError, lib, "facedetect_capi_last_error")
-		purego.RegisterLibFunc(&CppFreeString, lib, "facedetect_capi_free_string")
-		purego.RegisterLibFunc(&CppFreeVec, lib, "facedetect_capi_free_vec")
-		purego.RegisterLibFunc(&CppEmbedPath, lib, "facedetect_capi_embed_path")
-		purego.RegisterLibFunc(&CppEmbedRGB, lib, "facedetect_capi_embed_rgb")
-		purego.RegisterLibFunc(&CppDetectJSON, lib, "facedetect_capi_detect_path_json")
-		purego.RegisterLibFunc(&CppVerifyPaths, lib, "facedetect_capi_verify_paths")
-		purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "facedetect_capi_analyze_path_json")
-	})
-	return libLoadErr
-}
-
-var _ = Describe("parseOptions", func() {
-	It("defaults verify_threshold to 0.35", func() {
-		o := parseOptions(nil)
-		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
-		Expect(o.modelName).To(Equal(""))
-	})
-
-	It("parses verify_threshold, threshold alias and model_name", func() {
-		o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
-		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
-		Expect(o.modelName).To(Equal("buffalo_l"))
-
-		o2 := parseOptions([]string{"threshold:0.3"})
-		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
-	})
-
-	It("ignores non-positive thresholds and keeps the default", func() {
-		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
-		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
-	})
-})
-
-var _ = Describe("normalizeGender", func() {
-	It("maps M/F codes to Man/Woman", func() {
-		Expect(normalizeGender("M")).To(Equal("Man"))
-		Expect(normalizeGender("f")).To(Equal("Woman"))
-		Expect(normalizeGender(" m ")).To(Equal("Man"))
-	})
-
-	It("passes empty and unknown codes through", func() {
-		Expect(normalizeGender("")).To(Equal(""))
-		Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
-	})
-})
-
-var _ = Describe("faceBox.xywh", func() {
-	It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
-		b := faceBox{Box: []float32{10, 20, 50, 80}}
-		x, y, w, h := b.xywh()
-		Expect(x).To(Equal(float32(10)))
-		Expect(y).To(Equal(float32(20)))
-		Expect(w).To(Equal(float32(40)))
-		Expect(h).To(Equal(float32(60)))
-	})
-
-	It("returns zeros for a short box", func() {
-		x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
-		Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
-	})
-})
-
-var _ = Describe("parseAnalyzeJSON", func() {
-	It("maps region, age and gender for each face", func() {
-		doc := `{"faces":[
-			{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
-			{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
-		faces, err := parseAnalyzeJSON(doc)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(faces).To(HaveLen(2))
-
-		Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
-		Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
-		Expect(faces[0].DominantGender).To(Equal("Man"))
-		Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
-		Expect(faces[0].Region.W).To(Equal(float32(40)))
-		Expect(faces[0].Region.H).To(Equal(float32(60)))
-
-		Expect(faces[1].DominantGender).To(Equal("Woman"))
-	})
-
-	It("tolerates a missing gender field", func() {
-		faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(faces).To(HaveLen(1))
-		Expect(faces[0].DominantGender).To(Equal(""))
-		Expect(faces[0].Gender).To(BeEmpty())
-	})
-
-	It("returns no faces for an empty document", func() {
-		faces, err := parseAnalyzeJSON(`{"faces":[]}`)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(faces).To(BeEmpty())
-	})
-
-	It("returns an error on malformed JSON", func() {
-		_, err := parseAnalyzeJSON(`{not-json`)
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-var _ = Describe("materializeImage", func() {
-	It("decodes a base64 payload to a temp file", func() {
-		payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
-		path, cleanup, err := materializeImage(payload)
-		Expect(err).ToNot(HaveOccurred())
-		defer cleanup()
-		data, rerr := os.ReadFile(path)
-		Expect(rerr).ToNot(HaveOccurred())
-		Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
-	})
-
-	It("strips a data: URI prefix before decoding", func() {
-		payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
-		path, cleanup, err := materializeImage(payload)
-		Expect(err).ToNot(HaveOccurred())
-		defer cleanup()
-		data, rerr := os.ReadFile(path)
-		Expect(rerr).ToNot(HaveOccurred())
-		Expect(data).To(Equal([]byte("hello")))
-	})
-
-	It("uses an existing path as-is", func() {
-		tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
-		Expect(err).ToNot(HaveOccurred())
-		defer func() { _ = os.Remove(tmp.Name()) }()
-		Expect(tmp.Close()).To(Succeed())
-
-		path, cleanup, err := materializeImage(tmp.Name())
-		Expect(err).ToNot(HaveOccurred())
-		defer cleanup()
-		Expect(path).To(Equal(tmp.Name()))
-	})
-
-	It("errors on input that is neither a path nor base64", func() {
-		_, _, err := materializeImage("not base64!!!")
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-// The specs below exercise the real C-API end to end. They run only when both a
-// model GGUF and a test image are provided, and skip cleanly otherwise so the
-// suite stays green without large assets.
-var _ = Describe("FaceDetect end-to-end", Ordered, func() {
-	var (
-		f         *FaceDetect
-		modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL")
-		imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE")
-	)
-
-	BeforeAll(func() {
-		if modelPath == "" || imagePath == "" {
-			Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs")
-		}
-		if err := ensureLibLoaded(); err != nil {
-			Skip("libfacedetect.so not loadable: " + err.Error())
-		}
-		f = &FaceDetect{}
-		Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
-	})
-
-	It("embeds the primary face in an image", func() {
-		emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(emb).ToNot(BeEmpty())
-	})
-
-	It("detects at least one face", func() {
-		resp, err := f.Detect(&pb.DetectOptions{Src: imagePath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Detections).ToNot(BeEmpty())
-		Expect(resp.Detections[0].ClassName).To(Equal("face"))
-	})
-
-	It("verifies an image against itself as the same identity", func() {
-		resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Verified).To(BeTrue())
-		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
-	})
-
-	It("analyzes age/gender for each face", func() {
-		resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Faces).ToNot(BeEmpty())
-	})
-})
--- a/backend/go/face-detect/main.go
+++ b/backend/go/face-detect/main.go
@@ -1,65 +0,0 @@
-package main
-
-// Started internally by LocalAI - one gRPC server per loaded model.
-//
-// Loads libfacedetect.so via purego and registers the flat C-API entry points
-// declared in facedetect_capi.h. The library name can be overridden with
-// FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY
-// convention in the sibling backends); the default looks for the .so next to
-// this binary (resolved via LD_LIBRARY_PATH by run.sh).
-import (
-	"flag"
-	"fmt"
-	"os"
-
-	"github.com/ebitengine/purego"
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-type LibFuncs struct {
-	FuncPtr any
-	Name    string
-}
-
-func main() {
-	libName := os.Getenv("FACEDETECT_LIBRARY")
-	if libName == "" {
-		libName = "libfacedetect.so"
-	}
-
-	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-	if err != nil {
-		panic(fmt.Errorf("face-detect: dlopen %q: %w", libName, err))
-	}
-
-	// Bound 1:1 to facedetect_capi.h. char*/float* returns are registered as
-	// uintptr so the raw pointer can be freed via the matching capi free fn.
-	libFuncs := []LibFuncs{
-		{&CppAbiVersion, "facedetect_capi_abi_version"},
-		{&CppLoad, "facedetect_capi_load"},
-		{&CppFree, "facedetect_capi_free"},
-		{&CppLastError, "facedetect_capi_last_error"},
-		{&CppFreeString, "facedetect_capi_free_string"},
-		{&CppFreeVec, "facedetect_capi_free_vec"},
-		{&CppEmbedPath, "facedetect_capi_embed_path"},
-		{&CppEmbedRGB, "facedetect_capi_embed_rgb"},
-		{&CppDetectJSON, "facedetect_capi_detect_path_json"},
-		{&CppVerifyPaths, "facedetect_capi_verify_paths"},
-		{&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"},
-	}
-	for _, lf := range libFuncs {
-		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
-	}
-
-	fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion())
-
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &FaceDetect{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/face-detect/options.go
+++ b/backend/go/face-detect/options.go
@@ -1,47 +0,0 @@
-package main
-
-import (
-	"strconv"
-	"strings"
-)
-
-// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
-// not set one. Matches the insightface buffalo_l ArcFace R50 default the Python
-// face backend ships with so the two implementations agree on verdicts out of
-// the box.
-const defaultVerifyThreshold float32 = 0.35
-
-// loadOptions holds the parsed model-level options for face-detect.
-type loadOptions struct {
-	verifyThreshold float32
-	modelName       string
-}
-
-func splitOption(o string) (key, value string, ok bool) {
-	i := strings.Index(o, ":")
-	if i < 0 {
-		return "", "", false
-	}
-	return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
-}
-
-// parseOptions reads the backend "key:value" option slice. Unknown keys are
-// ignored. Defaults: verify_threshold 0.35, model_name derived from the file.
-func parseOptions(opts []string) loadOptions {
-	o := loadOptions{verifyThreshold: defaultVerifyThreshold}
-	for _, oo := range opts {
-		key, value, ok := splitOption(oo)
-		if !ok {
-			continue
-		}
-		switch key {
-		case "verify_threshold", "threshold":
-			if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
-				o.verifyThreshold = float32(f)
-			}
-		case "model_name":
-			o.modelName = value
-		}
-	}
-	return o
-}
--- a/backend/go/face-detect/package.sh
+++ b/backend/go/face-detect/package.sh
@@ -1,68 +0,0 @@
-#!/bin/bash
-#
-# Bundle the face-detect-grpc binary, libfacedetect.so, the core runtime libs
-# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
-# so the package is self-contained. Mirrors backend/go/voice-detect/package.sh;
-# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
-# is used instead of the host's.
-
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-REPO_ROOT="${CURDIR}/../../.."
-
-mkdir -p "$CURDIR/package/lib"
-
-cp -avf "$CURDIR/face-detect-grpc" "$CURDIR/package/"
-cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
-
-# libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via
-# LD_LIBRARY_PATH, which run.sh points at lib/.
-cp -avf "$CURDIR"/libfacedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
-	echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2
-	exit 1
-}
-
-# Detect architecture and copy the core runtime libs libfacedetect.so links
-# against, plus the matching dynamic loader as lib/ld.so.
-if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
-    echo "Detected x86_64 architecture, copying x86_64 libraries..."
-    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
-    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
-    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
-elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
-    echo "Detected ARM64 architecture, copying ARM64 libraries..."
-    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
-    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
-    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
-elif [ "$(uname -s)" = "Darwin" ]; then
-    echo "Detected Darwin"
-else
-    echo "Error: Could not detect architecture"
-    exit 1
-fi
-
-# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
-# BUILD_TYPE so the backend can reach the GPU without the runtime base image
-# shipping those drivers.
-GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
-if [ -f "$GPU_LIB_SCRIPT" ]; then
-    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
-    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
-    package_gpu_libs
-fi
-
-echo "Packaging completed successfully"
-ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/face-detect/run.sh
+++ b/backend/go/face-detect/run.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-
-export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
-
-# If a self-contained ld.so was packaged, route through it so the packaged
-# libc / libstdc++ are used instead of the host's (matches the voice-detect /
-# whisper / parakeet backends' runtime layout).
-if [ -f "$CURDIR/lib/ld.so" ]; then
-	echo "Using lib/ld.so"
-	exec "$CURDIR/lib/ld.so" "$CURDIR/face-detect-grpc" "$@"
-fi
-
-exec "$CURDIR/face-detect-grpc" "$@"
--- a/backend/go/face-detect/test.sh
+++ b/backend/go/face-detect/test.sh
@@ -1,15 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-cd "$CURDIR"
-
-echo "Running face-detect backend tests..."
-
-# The pure-Go parsing specs always run. The embed/detect/verify/analyze smoke
-# specs run only when a model + image are provided via
-# FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE; otherwise they
-# auto-skip.
-LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
-
-echo "face-detect tests completed."
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -1,6 +1,6 @@
 # parakeet-cpp backend Makefile.
 #
-# Upstream pin lives below as PARAKEET_VERSION?=e8acc6172a94e20a952cf1843decace5d771a94b
+# Upstream pin lives below as PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9
 # (.github/bump_deps.sh) can find and update it - matches the
 # whisper.cpp / ds4 / vibevoice-cpp convention.
 #
@@ -15,7 +15,7 @@
 # That's what the L0 smoke test uses. The default target below does the
 # proper clone-at-pin + cmake build so CI doesn't need a side-checkout.

-PARAKEET_VERSION?=e8acc6172a94e20a952cf1843decace5d771a94b
+PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9
 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp

 GOCMD?=go
--- a/backend/go/parakeet-cpp/boundary.go
+++ b/backend/go/parakeet-cpp/boundary.go
@@ -1,81 +0,0 @@
-package main
-
-// utteranceBoundary is the single definition of a small state machine that was
-// previously open-coded three times — as a bare `finalEou` bool with an ad-hoc
-// toggle — in the live feed (live.go), the file-stream text path, and the
-// file-stream JSON path (goparakeetcpp.go).
-//
-// It answers one running question: does the decode currently rest on an
-// end-of-utterance boundary? That is the value a closing FinalResult reports as
-// .Eou and the realtime turn detector treats as a commit point.
-//
-// parakeet auto-resets its decoder after every <EOU>/<EOB>, so one streaming
-// session is a sequence of utterances and this is a LATCH, not a monotonic
-// flag: it closes on an <EOU> and reopens as soon as the next utterance starts.
-// (Contrast the realtime API's per-turn `eouSeen`, which only ever goes
-// false->true because each turn gets a fresh stream. Here the stream outlives
-// the turn, so the boundary status must be able to reopen.)
-//
-// The only transitions, over the events one streamFeedResult carries — an
-// <EOU>, an <EOB> (backchannel), or plain speech output (text and/or words):
-//
-//	            <EOU>
-//	   open ───────────► closed
-//	    ▲ ▲ │             │ │
-//	    │ └─┘ <EOB>|speech │ │ <EOU>
-//	    │   (stay open)    │ └─┘ (stay closed)
-//	    └──────────────────┘
-//	         <EOB>|speech
-//
-//	open   = NOT on an utterance boundary: mid-utterance, the last boundary was
-//	         a backchannel <EOB>, or the stream just began (the initial state).
-//	closed = the last meaningful event was an <EOU> with no later speech: a real
-//	         turn boundary.
-//
-// A feed that carries nothing (no eou/eob/text/words — e.g. a finalize flush
-// that produced no tail) is a no-op and leaves the state unchanged, matching
-// the legacy "leave finalEou as it was" behaviour.
-//
-// The state carries no data, so it is modelled as a two-valued type (a named
-// bool) rather than an int enum: every inhabitant is legal, so illegal states
-// are unrepresentable — the payload-free analog of the sealed sum types the
-// realtime machines use (those need interfaces because their states carry data,
-// e.g. Active{ID}, where "Active with no ID" is the illegal combination a scalar
-// cannot even express).
-type utteranceBoundary bool
-
-const (
-	// boundaryOpen is the zero value (false), so a fresh decode starts open —
-	// exactly the legacy `var finalEou bool` (false) initial condition.
-	boundaryOpen   utteranceBoundary = false
-	boundaryClosed utteranceBoundary = true
-)
-
-// observe folds one decode increment into the latch and returns the new state.
-//
-// <EOU> takes priority when a single feed carries both an <EOU> and speech
-// (e.g. {"text":"hello","eou":1}): the utterance both produced that text AND
-// ended, so the decode rests on the boundary. This matches the legacy
-// eou-checked-first ordering at every call site.
-func (b utteranceBoundary) observe(r streamFeedResult) utteranceBoundary {
-	switch {
-	case r.Eou:
-		return boundaryClosed
-	case r.Eob || r.Delta != "" || len(r.Words) > 0:
-		return boundaryOpen
-	default:
-		return b
-	}
-}
-
-// ended reports whether the decode currently rests on an end-of-utterance
-// boundary (a real <EOU>, not a backchannel <EOB>). This is what a closing
-// FinalResult carries as .Eou.
-func (b utteranceBoundary) ended() bool { return b == boundaryClosed }
-
-func (b utteranceBoundary) String() string {
-	if b == boundaryClosed {
-		return "closed"
-	}
-	return "open"
-}
--- a/backend/go/parakeet-cpp/boundary_test.go
+++ b/backend/go/parakeet-cpp/boundary_test.go
@@ -1,92 +0,0 @@
-package main
-
-import (
-	"math/rand/v2"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("utteranceBoundary (decode end-of-utterance latch)", func() {
-	It("starts open: a fresh decode is not on a boundary", func() {
-		var b utteranceBoundary
-		Expect(b).To(Equal(boundaryOpen))
-		Expect(b.ended()).To(BeFalse())
-	})
-
-	DescribeTable("single feed transition from the open state",
-		func(r streamFeedResult, wantEnded bool) {
-			Expect(boundaryOpen.observe(r).ended()).To(Equal(wantEnded))
-		},
-		Entry("<EOU> closes it", streamFeedResult{Eou: true}, true),
-		Entry("<EOU> with text closes it (eou wins)", streamFeedResult{Delta: "hi", Eou: true}, true),
-		Entry("<EOB> stays open (backchannel is not a turn boundary)", streamFeedResult{Eob: true}, false),
-		Entry("plain text stays open", streamFeedResult{Delta: "hello"}, false),
-		Entry("words-only stays open", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
-		Entry("empty feed is a no-op (stays open)", streamFeedResult{}, false),
-	)
-
-	DescribeTable("single feed transition from the closed state",
-		func(r streamFeedResult, wantEnded bool) {
-			Expect(boundaryClosed.observe(r).ended()).To(Equal(wantEnded))
-		},
-		Entry("another <EOU> stays closed", streamFeedResult{Eou: true}, true),
-		Entry("trailing speech reopens it", streamFeedResult{Delta: "and more"}, false),
-		Entry("words reopen it", streamFeedResult{Words: []transcriptWord{{W: "x"}}}, false),
-		Entry("a backchannel <EOB> reopens it", streamFeedResult{Eob: true}, false),
-		Entry("empty feed is a no-op (stays closed)", streamFeedResult{}, true),
-	)
-
-	It("is a latch: <EOU> then trailing speech reopens, then <EOU> closes again", func() {
-		b := boundaryOpen
-		b = b.observe(streamFeedResult{Delta: "turn one", Eou: true})
-		Expect(b.ended()).To(BeTrue())
-		b = b.observe(streamFeedResult{Delta: " and more"})
-		Expect(b.ended()).To(BeFalse(), "trailing speech without an EOU is an open utterance")
-		b = b.observe(streamFeedResult{Eou: true})
-		Expect(b.ended()).To(BeTrue())
-	})
-
-	It("treats a backchannel before a real EOU correctly", func() {
-		b := boundaryOpen
-		b = b.observe(streamFeedResult{Delta: "uh huh", Eob: true})
-		Expect(b.ended()).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
-		b = b.observe(streamFeedResult{Delta: "done", Eou: true})
-		Expect(b.ended()).To(BeTrue())
-	})
-
-	It("matches the reference fold over seeded random feed sequences", func() {
-		// The invariant: after any sequence of feeds, ended() is true iff the
-		// last feed that carried ANY event was an <EOU>. <EOU> takes priority
-		// when a feed carries both an EOU and speech; empty feeds are ignored.
-		for seed := uint64(1); seed <= 200; seed++ {
-			rng := rand.New(rand.NewPCG(seed, seed*2654435761))
-			b := boundaryOpen
-			lastWasEou := false // reference: did the last meaningful feed end on EOU?
-			steps := rng.IntN(30)
-			for i := 0; i < steps; i++ {
-				var r streamFeedResult
-				switch rng.IntN(5) {
-				case 0:
-					r = streamFeedResult{Eou: true}
-				case 1:
-					r = streamFeedResult{Eob: true}
-				case 2:
-					r = streamFeedResult{Delta: "w"}
-				case 3:
-					r = streamFeedResult{Delta: "w", Eou: true} // eou + speech, eou wins
-				case 4:
-					r = streamFeedResult{} // empty: no-op
-				}
-				b = b.observe(r)
-				if r.Eou {
-					lastWasEou = true
-				} else if r.Eob || r.Delta != "" || len(r.Words) > 0 {
-					lastWasEou = false
-				}
-			}
-			Expect(b.ended()).To(Equal(lastWasEou),
-				"seed %d: latch disagreed with the reference fold", seed)
-		}
-	})
-})
--- a/backend/go/parakeet-cpp/driver.go
+++ b/backend/go/parakeet-cpp/driver.go
@@ -1,82 +0,0 @@
-package main
-
-import (
-	"context"
-
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-)
-
-// streamFeedResult is one decode increment from a cache-aware streaming session:
-// the newly-finalized text plus the model's own per-feed boundary tokens
-// (<EOU>/<EOB>) and word timings. It is the single event type both the live
-// (bidi) and file (server-stream) paths fold over, hiding the ABI v4 JSON vs
-// older text-only entry-point split behind one shape.
-type streamFeedResult struct {
-	Delta string
-	Eou   bool
-	Eob   bool
-	Words []transcriptWord
-}
-
-// feedChunk feeds one PCM chunk to the streaming session (or finalizes it, when
-// finalize is true) and returns the unified decode increment. It prefers the
-// ABI v4 JSON entry points (which also carry per-word timestamps) and falls
-// back to the older text-only entry points against an older libparakeet.so.
-//
-// This is the one place the JSON-vs-text choice is made; every consumer works
-// in terms of streamFeedResult.
-func (p *ParakeetCpp) feedChunk(stream uintptr, pcm []float32, finalize bool) (streamFeedResult, error) {
-	if CppStreamFeedJSON != nil {
-		doc, err := p.streamFeedDoc(stream, pcm, finalize)
-		if err != nil {
-			return streamFeedResult{}, err
-		}
-		return streamFeedResult{Delta: doc.Text, Eou: doc.Eou != 0, Eob: doc.Eob != 0, Words: doc.Words}, nil
-	}
-	delta, eou, eob, err := p.streamFeedText(stream, pcm, finalize)
-	if err != nil {
-		return streamFeedResult{}, err
-	}
-	return streamFeedResult{Delta: delta, Eou: eou, Eob: eob}, nil
-}
-
-// feedSlices feeds pcm through the session in streamChunkSamples slices,
-// invoking onFeed for each decode increment. It does NOT finalize: callers
-// decide when the send side is done. The file path finalizes after the whole
-// file; the live path finalizes only when its request channel closes, never
-// between audio messages. Slicing keeps each per-call engineMu hold short so
-// concurrent unary transcription interleaves fairly (the C session buffers
-// internally).
-//
-// If ctx is non-nil it is checked before each slice so a cancelled file
-// transcription stops promptly; the live path passes nil (it is bounded by its
-// request channel instead of a ctx).
-func (p *ParakeetCpp) feedSlices(ctx context.Context, stream uintptr, pcm []float32, onFeed func(streamFeedResult) error) error {
-	for off := 0; off < len(pcm); off += streamChunkSamples {
-		if ctx != nil {
-			if err := ctx.Err(); err != nil {
-				return status.Error(codes.Canceled, "transcription cancelled")
-			}
-		}
-		end := min(off+streamChunkSamples, len(pcm))
-		res, err := p.feedChunk(stream, pcm[off:end], false)
-		if err != nil {
-			return err
-		}
-		if err := onFeed(res); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-// flushTail finalizes the session once and folds the flushed tail (the last
-// ~2 encoder frames of text, which only appear on finalize) through onFeed.
-func (p *ParakeetCpp) flushTail(stream uintptr, onFeed func(streamFeedResult) error) error {
-	res, err := p.feedChunk(stream, nil, true)
-	if err != nil {
-		return err
-	}
-	return onFeed(res)
-}
--- a/backend/go/parakeet-cpp/goparakeetcpp.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp.go
@@ -103,13 +103,12 @@ type transcriptJSON struct {
 //	{"text":"...","eou":0,"eob":0,"frame_sec":0.080000,
 //	 "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...]}
 //
-// "text" is the newly-finalized text since the last call. Under ABI v5 "eou"
-// is 1 iff an <EOU> fired this feed (the user yielded the turn) and "eob" 1
-// iff an <EOB> fired (a backchannel like "uh-huh" ended — NOT a turn
-// boundary). A v4 library has no "eob" field and its "eou" conflates both
-// tokens: Eob stays 0 and Eou keeps the old any-event meaning. "words" are
-// the words finalized this call with absolute (stream-relative) start/end
-// seconds.
+// "text" is the newly-finalized text since the last call; "eou" is 1 when an
+// <EOU> (end of utterance) fired this feed and "eob" is 1 when an <EOB>
+// (backchannel) fired. ABI v4 conflated the two into "eou"; v5 split them, so
+// we read both and treat either as an utterance boundary for segmentation.
+// "words" are the words finalized this call with absolute (stream-relative)
+// start/end seconds.
 type streamFeedJSON struct {
 	Text     string           `json:"text"`
 	Eou      int              `json:"eou"`
@@ -365,7 +364,7 @@ var segmentSeparators = []rune{'.', '?', '!'}
 // the caller requested word granularity; token ids populate each segment's
 // Tokens by time-window membership. Shared by the batched and direct paths.
 func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gapFrames int) pb.TranscriptResult {
-	text, eou := stripEouMarker(strings.TrimSpace(doc.Text))
+	text := strings.TrimSpace(doc.Text)

 	// Frame-unit gap threshold -> seconds (NeMo segment_gap_threshold). 0 = off.
 	gapSeconds := 0.0
@@ -384,7 +383,6 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
 		return pb.TranscriptResult{
 			Text:     text,
 			Segments: []*pb.TranscriptSegment{{Id: 0, Text: text}},
-			Eou:      eou,
 		}
 	}

@@ -411,25 +409,7 @@ func transcriptResultFromDoc(doc transcriptJSON, opts *pb.TranscriptRequest, gap
 		}
 		segments = append(segments, seg)
 	}
-	return pb.TranscriptResult{Text: text, Segments: segments, Eou: eou}
-}
-
-// stripEouMarker removes a trailing literal <EOU>/<EOB> from offline-decode
-// text and reports whether the decode ended on an end-of-UTTERANCE token. The
-// realtime EOU model's offline decode keeps the special token in the
-// detokenized text (the streaming path strips it and surfaces it as flags
-// instead); user-visible transcripts must never carry either marker, but only
-// <EOU> may confirm the semantic_vad retranscribe cross-check — a decode
-// ending on <EOB> means the last thing heard was a backchannel, not the user
-// yielding the turn.
-func stripEouMarker(text string) (string, bool) {
-	if strings.HasSuffix(text, "<EOU>") {
-		return strings.TrimSpace(strings.TrimSuffix(text, "<EOU>")), true
-	}
-	if strings.HasSuffix(text, "<EOB>") {
-		return strings.TrimSpace(strings.TrimSuffix(text, "<EOB>")), false
-	}
-	return text, false
+	return pb.TranscriptResult{Text: text, Segments: segments}
 }

 // splitWordsIntoSegments groups words into segments exactly as NeMo's
@@ -496,55 +476,41 @@ func tokensInWindow(tokens []transcriptToken, start, end float64) []int32 {
 	return ids
 }

-// streamSegmenter accumulates streaming decode increments into per-utterance
-// segments. <EOU>/<EOB> are the model's own utterance boundaries; each closes a
-// segment. When the feed carries per-word timings (ABI v4 JSON), a closed
-// segment takes its start/end from its first/last word; against an older
-// text-only library (no words) it falls back to segmenting the delta text, so
-// the same assembler serves both paths.
+// streamSegmenter accumulates streaming words into per-utterance segments. EOU
+// is the model's own utterance boundary; each closed segment takes its start/end
+// from its first/last accumulated word.
 type streamSegmenter struct {
-	segs    []*pb.TranscriptSegment
-	cur     []transcriptWord // words for the open segment (ABI v4 JSON path)
-	curText []string         // delta text for the open segment (text-only path)
-	nextID  int32
+	segs   []*pb.TranscriptSegment
+	cur    []transcriptWord
+	nextID int32
 }

-func (s *streamSegmenter) add(r streamFeedResult) {
-	s.cur = append(s.cur, r.Words...)
-	if len(r.Words) == 0 && r.Delta != "" {
-		// Older libparakeet.so with no per-word timing: segment from the text.
-		s.curText = append(s.curText, r.Delta)
-	}
-	// Both <EOU> and <EOB> reset the decoder, so both close a segment.
-	if r.Eou || r.Eob {
+func (s *streamSegmenter) add(doc streamFeedJSON) {
+	s.cur = append(s.cur, doc.Words...)
+	// Close the segment on either turn signal: <EOU> (end of utterance) or
+	// <EOB> (backchannel). ABI v4 reported both via "eou"; v5 split them, so we
+	// OR them here to keep the v4 segmentation boundaries.
+	if doc.Eou != 0 || doc.Eob != 0 {
 		s.flush()
 	}
 }

 func (s *streamSegmenter) flush() {
-	switch {
-	case len(s.cur) > 0:
-		parts := make([]string, len(s.cur))
-		for i, w := range s.cur {
-			parts[i] = w.W
-		}
-		s.segs = append(s.segs, &pb.TranscriptSegment{
-			Id:    s.nextID,
-			Start: secondsToNanos(s.cur[0].Start),
-			End:   secondsToNanos(s.cur[len(s.cur)-1].End),
-			Text:  strings.TrimSpace(strings.Join(parts, " ")),
-		})
-		s.nextID++
-	case len(s.curText) > 0:
-		// No words this segment: emit a text-only segment (no timestamps),
-		// skipping a purely-whitespace one as the legacy text path did.
-		if t := strings.TrimSpace(strings.Join(s.curText, "")); t != "" {
-			s.segs = append(s.segs, &pb.TranscriptSegment{Id: s.nextID, Text: t})
-			s.nextID++
-		}
+	if len(s.cur) == 0 {
+		return
 	}
+	parts := make([]string, len(s.cur))
+	for i, w := range s.cur {
+		parts[i] = w.W
+	}
+	s.segs = append(s.segs, &pb.TranscriptSegment{
+		Id:    s.nextID,
+		Start: secondsToNanos(s.cur[0].Start),
+		End:   secondsToNanos(s.cur[len(s.cur)-1].End),
+		Text:  strings.TrimSpace(strings.Join(parts, " ")),
+	})
+	s.nextID++
 	s.cur = nil
-	s.curText = nil
 }

 func (s *streamSegmenter) segments() []*pb.TranscriptSegment { return s.segs }
@@ -569,119 +535,18 @@ func secondsToNanos(sec float64) int64 {
 	return int64(sec * 1e9)
 }

-// Per-C-call engine serialization for the streaming paths.
-//
-// Every individual C call (begin / feed / finalize / free) takes engineMu and
-// re-checks ctxPtr under the lock; the lock is NEVER held across a stream's
-// lifetime. This is safe because each parakeet.cpp call builds its own ggml
-// graph and all streaming caches live in the session object, not the ctx —
-// the only ctx-shared mutable state is last_error, which is why it is read
-// under the same lock as the failing call. Holding the lock per call (rather
-// than per stream, as this file previously did) keeps a long-lived live
-// session from starving batched unary transcription and vice versa.
-//
-// A stream must not outlive its ctx (C-API contract). Free() takes engineMu
-// and zeroes ctxPtr, so a racing per-call helper returns ModelNotLoaded
-// instead of feeding a freed engine; streamFree of an orphaned session only
-// runs the session destructor, which does not touch the ctx.
-
-// streamBegin opens a cache-aware streaming session. A 0 stream with nil
-// error means the loaded model is not a streaming model.
-func (p *ParakeetCpp) streamBegin(lang string) (uintptr, error) {
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
-	if p.ctxPtr == 0 {
-		return 0, grpcerrors.ModelNotLoaded("parakeet-cpp")
-	}
-	if CppStreamBeginLang != nil {
-		return CppStreamBeginLang(p.ctxPtr, lang), nil
-	}
-	return CppStreamBegin(p.ctxPtr), nil
-}
-
-func (p *ParakeetCpp) streamFree(stream uintptr) {
-	if stream == 0 {
-		return
-	}
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
-	CppStreamFree(stream)
-}
-
-// streamFeedText runs one text-mode feed (or the finalize flush when
-// finalize is true) under engineMu, returning the newly-finalized delta and
-// whether an <EOU>/<EOB> fired during the call.
-func (p *ParakeetCpp) streamFeedText(stream uintptr, pcm []float32, finalize bool) (delta string, eou, eob bool, err error) {
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
-	if p.ctxPtr == 0 {
-		return "", false, false, grpcerrors.ModelNotLoaded("parakeet-cpp")
-	}
-	var ret uintptr
-	var events int32
-	if finalize {
-		ret = CppStreamFinalize(stream)
-	} else {
-		ret = CppStreamFeed(stream, pcm, int32(len(pcm)), unsafe.Pointer(&events))
-	}
-	if ret == 0 {
-		// last_error is ctx-shared: read it under the same lock as the call.
-		msg := CppLastError(p.ctxPtr)
-		if msg == "" {
-			msg = "unknown error"
-		}
-		return "", false, false, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
-	}
-	delta = goStringFromCPtr(ret)
-	CppFreeString(ret)
-	// ABI v5: eou_out is a bitmask (bit 0 = <EOU>, bit 1 = <EOB>). A v4
-	// library sets 0/1 for either token, which the bit-0 test reads as the
-	// old conflated eou — the EOB distinction simply isn't available there.
-	return delta, events&1 != 0, events&2 != 0, nil
-}
-
-// streamFeedDoc runs one ABI v4 JSON feed (or finalize) under engineMu and
-// returns the parsed {text,eou,frame_sec,words} document.
-func (p *ParakeetCpp) streamFeedDoc(stream uintptr, pcm []float32, finalize bool) (streamFeedJSON, error) {
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
-	if p.ctxPtr == 0 {
-		return streamFeedJSON{}, grpcerrors.ModelNotLoaded("parakeet-cpp")
-	}
-	var ret uintptr
-	if finalize {
-		ret = CppStreamFinalizeJSON(stream)
-	} else {
-		ret = CppStreamFeedJSON(stream, pcm, int32(len(pcm)))
-	}
-	if ret == 0 {
-		msg := CppLastError(p.ctxPtr)
-		if msg == "" {
-			msg = "unknown error"
-		}
-		return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
-	}
-	raw := goStringFromCPtr(ret)
-	CppFreeString(ret)
-	var doc streamFeedJSON
-	if err := json.Unmarshal([]byte(raw), &doc); err != nil {
-		return streamFeedJSON{}, fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
-	}
-	return doc, nil
-}
-
 // AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
-// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it through
-// the shared decode driver (feedSlices/flushTail), and emits each
-// newly-finalized text run as a TranscriptStreamResponse delta. <EOU>/<EOB>
-// events close the current segment; a closing FinalResult carries the full
-// transcript, the per-utterance segments, and whether the file ended on an
-// utterance boundary.
+// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in
+// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text
+// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the
+// current segment; a closing FinalResult carries the full transcript and the
+// per-utterance segments.
 //
 // stream_begin returns 0 for models that are not cache-aware streaming models
-// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those this
-// returns codes.Unimplemented rather than faking a stream from an offline
-// decode — see the stream==0 branch and grpcerrors.StreamTranscriptionUnsupported.
+// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall
+// back to a single offline transcription emitted as one delta plus a closing
+// FinalResult, matching LocalAI's non-streaming streaming contract (and the
+// whisper backend), so the streaming endpoint works for every model.
 func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
 	defer close(results)

@@ -695,73 +560,185 @@ func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.Tra
 		return status.Error(codes.Canceled, "transcription cancelled")
 	}

-	stream, err := p.streamBegin(opts.GetLanguage())
-	if err != nil {
-		return err
+	var stream uintptr
+	if CppStreamBeginLang != nil {
+		stream = CppStreamBeginLang(p.ctxPtr, opts.GetLanguage())
+	} else {
+		stream = CppStreamBegin(p.ctxPtr)
 	}
 	if stream == 0 {
-		// Not a cache-aware streaming model. Report the missing capability
-		// honestly instead of decoding offline and emitting it as one "delta"
-		// + final: a client that asked for streaming must learn the model
-		// cannot stream, not receive a batch result dressed as a stream (which
-		// is indistinguishable except qualitatively, and silently breaks any
-		// feature that genuinely needs incremental output). Callers wanting a
-		// plain transcript use the unary AudioTranscription path. This mirrors
-		// AudioTranscriptionLive, which already returns Unimplemented here.
-		return grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp",
-			"loaded model is not a cache-aware streaming model")
+		// Not a cache-aware streaming model: run a normal offline
+		// transcription and emit it as one delta + a closing final result.
+		res, err := p.AudioTranscription(ctx, opts)
+		if err != nil {
+			return err
+		}
+		if t := strings.TrimSpace(res.Text); t != "" {
+			results <- &pb.TranscriptStreamResponse{Delta: t}
+		}
+		results <- &pb.TranscriptStreamResponse{FinalResult: &res}
+		return nil
 	}
-	defer p.streamFree(stream)
+	defer CppStreamFree(stream)
+	// The C engine is a single shared context: a streaming session and a batched
+	// unary dispatch must never touch it at once, so hold engineMu for the whole
+	// stream. This lock is intentionally taken AFTER the non-streaming fallback
+	// above returns: that fallback goes through AudioTranscription -> the batcher
+	// -> runBatch, which itself acquires engineMu, so locking here first would
+	// deadlock. Do not hoist this lock above the fallback.
+	p.engineMu.Lock()
+	defer p.engineMu.Unlock()

 	data, duration, err := decodeWavMono16k(opts.Dst)
 	if err != nil {
 		return err
 	}

-	// Fold the shared decode driver's per-feed increments into the streamed
-	// deltas and the closing batch result: words/text accumulate into
-	// per-utterance segments (streamSegmenter), and the utterance-boundary
-	// latch (boundary.go) records whether the file ended on an <EOU>. These
-	// are the offline path's concern — the live RPC carries none of them.
+	// ABI v4: when the streaming JSON entry points are present, drive them so the
+	// per-utterance segments carry per-word start/end timestamps. Falls through to
+	// the text-only loop below against an older libparakeet.so. Runs under the
+	// engineMu already held above.
+	if CppStreamFeedJSON != nil {
+		return p.streamJSON(ctx, stream, data, duration, results)
+	}
+
 	var (
 		full     strings.Builder
-		seg      streamSegmenter
-		boundary utteranceBoundary
+		segText  strings.Builder
+		segments []*pb.TranscriptSegment
+		segID    int32
 	)
-	emit := func(r streamFeedResult) error {
-		if r.Delta != "" {
-			full.WriteString(r.Delta)
-			results <- &pb.TranscriptStreamResponse{Delta: r.Delta}
+
+	flushSegment := func() {
+		t := strings.TrimSpace(segText.String())
+		segText.Reset()
+		if t == "" {
+			return
 		}
-		seg.add(r)
-		boundary = boundary.observe(r)
+		segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t})
+		segID++
+	}
+
+	// emitDelta consumes the malloc'd char* returned by feed/finalize: frees
+	// it, accumulates the text, and sends a delta when non-empty. A 0 return
+	// is an error (vs the "" empty-but-non-NULL no-new-text case).
+	emitDelta := func(ret uintptr) error {
+		if ret == 0 {
+			msg := CppLastError(p.ctxPtr)
+			if msg == "" {
+				msg = "unknown error"
+			}
+			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+		}
+		delta := goStringFromCPtr(ret)
+		CppFreeString(ret)
+		if delta == "" {
+			return nil
+		}
+		full.WriteString(delta)
+		segText.WriteString(delta)
+		results <- &pb.TranscriptStreamResponse{Delta: delta}
 		return nil
 	}

-	if err := p.feedSlices(ctx, stream, data, emit); err != nil {
-		return err
-	}
-	if err := p.flushTail(stream, emit); err != nil {
-		return err
-	}
-	seg.flush() // close a trailing utterance that never saw an <EOU>
+	for off := 0; off < len(data); off += streamChunkSamples {
+		if err := ctx.Err(); err != nil {
+			return status.Error(codes.Canceled, "transcription cancelled")
+		}
+		end := min(off+streamChunkSamples, len(data))
+		chunk := data[off:end]

-	// final.Text is the exact concatenation of the streamed deltas (full is
-	// their accumulation), so concat(deltas) == FinalResult.Text holds even
-	// when the model prepends a leading space to the first word (SentencePiece
-	// detokenization). This matches the whisper backend's streaming contract.
-	// The single-segment fallback stays trimmed.
-	fullText := full.String()
-	segments := seg.segments()
-	if trimmed := strings.TrimSpace(fullText); len(segments) == 0 && trimmed != "" {
-		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: trimmed})
+		var eou int32
+		ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou))
+		if err := emitDelta(ret); err != nil {
+			return err
+		}
+		if eou != 0 {
+			flushSegment()
+		}
+	}
+
+	// Flush the streaming tail (final encoder chunk).
+	if err := emitDelta(CppStreamFinalize(stream)); err != nil {
+		return err
+	}
+	flushSegment()
+
+	text := strings.TrimSpace(full.String())
+	if len(segments) == 0 && text != "" {
+		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
 	}
 	results <- &pb.TranscriptStreamResponse{
 		FinalResult: &pb.TranscriptResult{
-			Text:     fullText,
+			Text:     text,
+			Segments: segments,
+			Duration: duration,
+		},
+	}
+	return nil
+}
+
+// streamJSON drives the streaming JSON entry points (present since ABI v4): each
+// feed/finalize returns a {text,eou,eob,frame_sec,words} document. The
+// newly-finalized text is emitted as a delta (unchanged streaming contract)
+// while words are accumulated into per-utterance segments (closed on <EOU> or
+// <EOB>) so the closing FinalResult carries timestamped segments. Runs under
+// engineMu (already held by the caller).
+func (p *ParakeetCpp) streamJSON(ctx context.Context, stream uintptr, data []float32,
+	duration float32, results chan *pb.TranscriptStreamResponse) error {
+	var (
+		full strings.Builder
+		seg  streamSegmenter
+	)
+	// consume frees the malloc'd char* (a 0 return is an error), parses the JSON,
+	// emits the delta, and routes words through the segmenter.
+	consume := func(ret uintptr) error {
+		if ret == 0 {
+			msg := CppLastError(p.ctxPtr)
+			if msg == "" {
+				msg = "unknown error"
+			}
+			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+		}
+		raw := goStringFromCPtr(ret)
+		CppFreeString(ret)
+		var doc streamFeedJSON
+		if err := json.Unmarshal([]byte(raw), &doc); err != nil {
+			return fmt.Errorf("parakeet-cpp: decode stream json: %w", err)
+		}
+		if doc.Text != "" {
+			full.WriteString(doc.Text)
+			results <- &pb.TranscriptStreamResponse{Delta: doc.Text}
+		}
+		seg.add(doc)
+		return nil
+	}
+
+	for off := 0; off < len(data); off += streamChunkSamples {
+		if err := ctx.Err(); err != nil {
+			return status.Error(codes.Canceled, "transcription cancelled")
+		}
+		end := min(off+streamChunkSamples, len(data))
+		chunk := data[off:end]
+		if err := consume(CppStreamFeedJSON(stream, chunk, int32(len(chunk)))); err != nil {
+			return err
+		}
+	}
+	if err := consume(CppStreamFinalizeJSON(stream)); err != nil {
+		return err
+	}
+	seg.flush() // close any trailing utterance that never saw an EOU
+
+	text := strings.TrimSpace(full.String())
+	segments := seg.segments()
+	if len(segments) == 0 && text != "" {
+		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
+	}
+	results <- &pb.TranscriptStreamResponse{
+		FinalResult: &pb.TranscriptResult{
+			Text:     text,
 			Segments: segments,
 			Duration: duration,
-			Eou:      boundary.ended(),
 		},
 	}
 	return nil
@@ -826,10 +803,6 @@ func (p *ParakeetCpp) Free() error {
 		close(p.batStop)
 		p.batStop = nil
 	}
-	// engineMu so an in-flight streaming call (which locks per C call and
-	// re-checks ctxPtr under the lock) can never feed into a freed ctx.
-	p.engineMu.Lock()
-	defer p.engineMu.Unlock()
 	if p.ctxPtr != 0 {
 		CppFree(p.ctxPtr)
 		p.ctxPtr = 0
--- a/backend/go/parakeet-cpp/goparakeetcpp_test.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp_test.go
@@ -14,8 +14,6 @@ import (
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
 )

 func TestParakeetCpp(t *testing.T) {
@@ -203,29 +201,6 @@ var _ = Describe("ParakeetCpp", func() {
 	})

 	Context("AudioTranscriptionStream", func() {
-		It("returns the typed Unimplemented signal for non-streaming models (no offline fallback)", func() {
-			// stream_begin == 0 means the loaded model is not a cache-aware
-			// streaming model. The backend must surface that, not silently
-			// decode offline and fake a one-shot "stream".
-			savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
-			defer func() { CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang }()
-			CppStreamBeginLang = nil
-			CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
-
-			p := &ParakeetCpp{ctxPtr: 1}
-			results := make(chan *pb.TranscriptStreamResponse, 8)
-			err := p.AudioTranscriptionStream(context.Background(),
-				&pb.TranscriptRequest{Dst: "ignored.wav"}, results)
-			Expect(status.Code(err)).To(Equal(codes.Unimplemented))
-
-			// Honest signal: nothing was emitted — no faked batch result.
-			var emitted []*pb.TranscriptStreamResponse
-			for r := range results {
-				emitted = append(emitted, r)
-			}
-			Expect(emitted).To(BeEmpty())
-		})
-
 		It("streams deltas and a closing FinalResult from a cache-aware model", func() {
 			// Streaming needs a cache-aware streaming model (e.g.
 			// realtime_eou); the offline test model would fail stream_begin.
--- a/backend/go/parakeet-cpp/live.go
+++ b/backend/go/parakeet-cpp/live.go
@@ -1,186 +0,0 @@
-package main
-
-import (
-	"strings"
-	"time"
-
-	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/xlog"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-)
-
-// liveSampleRate is the only PCM rate the parakeet C streaming API accepts.
-const liveSampleRate = 16000
-
-// AudioTranscriptionLive drives one cache-aware streaming session over audio
-// fed incrementally by the caller (the realtime API's semantic_vad turn
-// detection). Contract:
-//
-//   - the first request must carry a Config; a Config mid-stream resets the
-//     decode session (free + begin) and drops accumulated transcript state;
-//   - a Ready ack is sent right after a successful stream_begin so callers
-//     can degrade synchronously when the model has no streaming support
-//     (LiveTranscriptionUnsupported, codes.Unimplemented);
-//   - every feed that produced output is forwarded as {delta, eou, words};
-//     the <EOU>/<EOB> flag is the model's own utterance boundary and the
-//     decoder auto-resets after it, so one session spans many utterances;
-//   - closing the send side finalizes: the held-back tail chunk is flushed
-//     (the last ~2 encoder frames of words only appear here) and a terminal
-//     FinalResult carries the full transcript Text only. Per-utterance
-//     segments, duration, and the terminal <EOU> flag are NOT produced here —
-//     the realtime core consumes the streamed per-feed tokens and the final
-//     Text; those batch fields are the file path's concern (see
-//     AudioTranscriptionStream).
-//
-// Engine access is serialized per C call (streamBegin/streamFeed*/streamFree
-// take engineMu internally), never for the session lifetime — unary
-// transcription keeps flowing between feeds.
-func (p *ParakeetCpp) AudioTranscriptionLive(in <-chan *pb.TranscriptLiveRequest, out chan<- *pb.TranscriptLiveResponse) error {
-	defer close(out)
-
-	if p.ctxPtr == 0 {
-		return grpcerrors.ModelNotLoaded("parakeet-cpp")
-	}
-
-	first, ok := <-in
-	if !ok {
-		return nil // caller closed without sending anything
-	}
-	cfg := first.GetConfig()
-	if cfg == nil {
-		return status.Error(codes.InvalidArgument, "parakeet-cpp: first live message must carry a config")
-	}
-	if err := validateLiveConfig(cfg); err != nil {
-		return err
-	}
-
-	stream, err := p.streamBegin(cfg.GetLanguage())
-	if err != nil {
-		return err
-	}
-	if stream == 0 {
-		return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
-			"loaded model is not a cache-aware streaming model")
-	}
-	// stream is reassigned on a mid-stream Config reset; free whatever is
-	// current when the RPC unwinds.
-	defer func() { p.streamFree(stream) }()
-
-	out <- &pb.TranscriptLiveResponse{Ready: true}
-
-	var (
-		full    strings.Builder
-		fedSecs float64
-
-		// behindSec accumulates how far decode wall time has fallen behind
-		// the audio it was fed. A live caller feeds in real time, so a
-		// persistent positive backlog means every downstream signal —
-		// including the <EOU> the turn detector waits on — arrives that many
-		// seconds late. Warned once per session; reset by a Config reset.
-		behindSec    float64
-		behindWarned bool
-	)
-
-	// emit forwards one decode increment: it streams the per-feed tokens the
-	// realtime turn detector consumes (delta/eou/eob/words) and accumulates the
-	// running transcript for the closing FinalResult. No segmentation or
-	// boundary latch here — the live consumer reads only the streamed tokens
-	// and the final Text; per-utterance segments and the terminal <EOU> flag
-	// are an offline-path concern (see AudioTranscriptionStream / boundary.go).
-	emit := func(r streamFeedResult) error {
-		if r.Delta != "" {
-			full.WriteString(r.Delta)
-		}
-		if r.Delta != "" || r.Eou || r.Eob || len(r.Words) > 0 {
-			out <- &pb.TranscriptLiveResponse{
-				Delta: r.Delta,
-				Eou:   r.Eou,
-				Eob:   r.Eob,
-				Words: liveWordsToProto(r.Words),
-			}
-		}
-		return nil
-	}
-
-	for req := range in {
-		switch payload := req.GetPayload().(type) {
-		case *pb.TranscriptLiveRequest_Config:
-			if err := validateLiveConfig(payload.Config); err != nil {
-				return err
-			}
-			// Reset: a fresh decode session, dropping accumulated state.
-			p.streamFree(stream)
-			stream, err = p.streamBegin(payload.Config.GetLanguage())
-			if err != nil {
-				return err
-			}
-			if stream == 0 {
-				return grpcerrors.LiveTranscriptionUnsupported("parakeet-cpp",
-					"loaded model is not a cache-aware streaming model")
-			}
-			full.Reset()
-			fedSecs = 0
-		case *pb.TranscriptLiveRequest_Audio:
-			pcm := payload.Audio.GetPcm()
-			audioSec := float64(len(pcm)) / liveSampleRate
-			fedSecs += audioSec
-			start := time.Now()
-			// nil ctx: a live session is bounded by this request channel, not a
-			// context — cancellation is the caller closing the stream.
-			if err := p.feedSlices(nil, stream, pcm, emit); err != nil {
-				return err
-			}
-			wallSec := time.Since(start).Seconds()
-			behindSec += wallSec - audioSec
-			if behindSec < 0 {
-				behindSec = 0
-			}
-			xlog.Debug("parakeet-cpp: live feed",
-				"audio_ms", int(audioSec*1000), "wall_ms", int(wallSec*1000),
-				"behind_ms", int(behindSec*1000), "fed_s", fedSecs)
-			if behindSec > 1 && !behindWarned {
-				behindWarned = true
-				xlog.Warn("parakeet-cpp: live decode is falling behind real time; "+
-					"end-of-utterance signals will arrive late",
-					"behind_s", behindSec, "fed_s", fedSecs)
-			}
-		}
-	}
-
-	// Send side closed: flush the streaming tail and emit the final transcript.
-	// The live FinalResult carries only Text — the authoritative full-turn
-	// transcript the realtime core commits. Per-utterance segments, duration,
-	// and the terminal <EOU> flag are not produced on the live path.
-	if err := p.flushTail(stream, emit); err != nil {
-		return err
-	}
-	out <- &pb.TranscriptLiveResponse{
-		FinalResult: &pb.TranscriptResult{Text: strings.TrimSpace(full.String())},
-	}
-	return nil
-}
-
-func validateLiveConfig(cfg *pb.TranscriptLiveConfig) error {
-	if sr := cfg.GetSampleRate(); sr != 0 && sr != liveSampleRate {
-		return status.Errorf(codes.InvalidArgument,
-			"parakeet-cpp: unsupported live sample_rate %d (only %d)", sr, liveSampleRate)
-	}
-	return nil
-}
-
-func liveWordsToProto(words []transcriptWord) []*pb.TranscriptWord {
-	if len(words) == 0 {
-		return nil
-	}
-	out := make([]*pb.TranscriptWord, len(words))
-	for i, w := range words {
-		out[i] = &pb.TranscriptWord{
-			Start: secondsToNanos(w.Start),
-			End:   secondsToNanos(w.End),
-			Text:  w.W,
-		}
-	}
-	return out
-}
--- a/backend/go/parakeet-cpp/live_test.go
+++ b/backend/go/parakeet-cpp/live_test.go
@@ -1,417 +0,0 @@
-package main
-
-import (
-	"sync"
-	"time"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-)
-
-// The live-RPC specs drive AudioTranscriptionLive entirely against stubbed
-// Cpp* package vars (the same seam batcher_test.go uses), so they run
-// without libparakeet.so.
-
-// liveCstrPool hands out NUL-terminated C-style strings backed by Go memory
-// and keeps them alive for the duration of a spec (goStringFromCPtr reads
-// through the raw pointer; Go's GC must not collect the backing array while
-// a stub's return value is in flight).
-type liveCstrPool struct {
-	mu   sync.Mutex
-	bufs [][]byte
-}
-
-func (p *liveCstrPool) cstr(s string) uintptr {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	b := append([]byte(s), 0)
-	p.bufs = append(p.bufs, b)
-	return uintptr(unsafe.Pointer(&b[0]))
-}
-
-// liveStubs swaps every C entry point the live path touches and returns a
-// restore func for AfterEach.
-func liveStubs() (restore func()) {
-	savedBegin, savedBeginLang := CppStreamBegin, CppStreamBeginLang
-	savedFeed, savedFeedJSON := CppStreamFeed, CppStreamFeedJSON
-	savedFinalize, savedFinalizeJSON := CppStreamFinalize, CppStreamFinalizeJSON
-	savedFree, savedLastError := CppStreamFree, CppLastError
-	savedFreeString := CppFreeString
-	return func() {
-		CppStreamBegin, CppStreamBeginLang = savedBegin, savedBeginLang
-		CppStreamFeed, CppStreamFeedJSON = savedFeed, savedFeedJSON
-		CppStreamFinalize, CppStreamFinalizeJSON = savedFinalize, savedFinalizeJSON
-		CppStreamFree, CppLastError = savedFree, savedLastError
-		CppFreeString = savedFreeString
-	}
-}
-
-// runLive starts the RPC on its own goroutine and returns the request
-// channel plus a collector for everything the backend emitted.
-func runLive(p *ParakeetCpp) (chan *pb.TranscriptLiveRequest, chan *pb.TranscriptLiveResponse, chan error) {
-	in := make(chan *pb.TranscriptLiveRequest)
-	out := make(chan *pb.TranscriptLiveResponse, 32)
-	errCh := make(chan error, 1)
-	go func() { errCh <- p.AudioTranscriptionLive(in, out) }()
-	return in, out, errCh
-}
-
-func liveConfig(lang string) *pb.TranscriptLiveRequest {
-	return &pb.TranscriptLiveRequest{
-		Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{Language: lang}},
-	}
-}
-
-func liveAudio(pcm []float32) *pb.TranscriptLiveRequest {
-	return &pb.TranscriptLiveRequest{
-		Payload: &pb.TranscriptLiveRequest_Audio{Audio: &pb.TranscriptLiveAudio{Pcm: pcm}},
-	}
-}
-
-func collectLive(out chan *pb.TranscriptLiveResponse) []*pb.TranscriptLiveResponse {
-	var got []*pb.TranscriptLiveResponse
-	for r := range out {
-		got = append(got, r)
-	}
-	return got
-}
-
-var _ = Describe("AudioTranscriptionLive (stubbed C API)", func() {
-	var (
-		pool    *liveCstrPool
-		restore func()
-		p       *ParakeetCpp
-	)
-
-	BeforeEach(func() {
-		pool = &liveCstrPool{}
-		restore = liveStubs()
-		p = &ParakeetCpp{ctxPtr: 1}
-
-		CppStreamBeginLang = nil
-		CppStreamBegin = func(ctx uintptr) uintptr { return 7 }
-		CppStreamFree = func(s uintptr) {}
-		CppFreeString = func(s uintptr) {}
-		CppLastError = func(ctx uintptr) string { return "stub error" }
-		CppStreamFeed = nil
-		CppStreamFeedJSON = nil
-		CppStreamFinalize = nil
-		CppStreamFinalizeJSON = nil
-	})
-
-	AfterEach(func() { restore() })
-
-	It("rejects a stream whose first message is not a config", func() {
-		in, out, errCh := runLive(p)
-		in <- liveAudio([]float32{0.1})
-		close(in)
-
-		err := <-errCh
-		Expect(status.Code(err)).To(Equal(codes.InvalidArgument))
-		Expect(collectLive(out)).To(BeEmpty())
-	})
-
-	It("rejects a non-16k sample rate", func() {
-		in, _, errCh := runLive(p)
-		in <- &pb.TranscriptLiveRequest{
-			Payload: &pb.TranscriptLiveRequest_Config{Config: &pb.TranscriptLiveConfig{SampleRate: 8000}},
-		}
-		close(in)
-		Expect(status.Code(<-errCh)).To(Equal(codes.InvalidArgument))
-	})
-
-	It("returns the typed Unimplemented signal for non-streaming models, before any ack", func() {
-		CppStreamBegin = func(ctx uintptr) uintptr { return 0 }
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		close(in)
-
-		err := <-errCh
-		Expect(grpcerrors.IsLiveTranscriptionUnsupported(err)).To(BeTrue())
-		Expect(collectLive(out)).To(BeEmpty())
-	})
-
-	It("streams deltas, eou flags and words on the JSON path and finalizes on close", func() {
-		var freed []uintptr
-		CppStreamFree = func(s uintptr) { freed = append(freed, s) }
-		feeds := 0
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			feeds++
-			switch feeds {
-			case 1:
-				return pool.cstr(`{"text":"hello ","eou":0,"frame_sec":0.08,` +
-					`"words":[{"w":"hello","start":0.1,"end":0.4,"conf":0.9}]}`)
-			default:
-				return pool.cstr(`{"text":"world","eou":1,"frame_sec":0.08,` +
-					`"words":[{"w":"world","start":0.5,"end":0.8,"conf":0.9}]}`)
-			}
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("en")
-		in <- liveAudio(make([]float32, 100))
-		in <- liveAudio(make([]float32, 200))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		Expect(got).To(HaveLen(4)) // ready, two deltas, final
-
-		Expect(got[0].Ready).To(BeTrue())
-
-		Expect(got[1].Delta).To(Equal("hello "))
-		Expect(got[1].Eou).To(BeFalse())
-		Expect(got[1].Words).To(HaveLen(1))
-		Expect(got[1].Words[0].Text).To(Equal("hello"))
-
-		Expect(got[2].Delta).To(Equal("world"))
-		Expect(got[2].Eou).To(BeTrue())
-
-		final := got[3].FinalResult
-		Expect(final).NotTo(BeNil())
-		Expect(final.Text).To(Equal("hello world"))
-		// The live FinalResult carries only Text. Per-utterance segments,
-		// duration and the terminal eou flag are an offline-path concern (see
-		// boundary.go / AudioTranscriptionStream); the realtime core reads the
-		// streamed per-feed tokens above plus this Text.
-		Expect(final.Eou).To(BeFalse())
-		Expect(final.Segments).To(BeEmpty())
-		Expect(final.Duration).To(BeZero())
-
-		Expect(freed).To(Equal([]uintptr{7}))
-	})
-
-	It("falls back to the text feed (eou out-param) when the JSON entry points are absent", func() {
-		feeds := 0
-		CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
-			feeds++
-			if feeds == 2 {
-				*(*int32)(eouOut) = 1
-				return pool.cstr("done")
-			}
-			return pool.cstr("first ")
-		}
-		CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		Expect(got).To(HaveLen(4))
-		Expect(got[1].Delta).To(Equal("first "))
-		Expect(got[1].Eou).To(BeFalse())
-		Expect(got[2].Delta).To(Equal("done"))
-		Expect(got[2].Eou).To(BeTrue())
-		Expect(got[3].FinalResult.Text).To(Equal("first done"))
-	})
-
-	It("forwards <EOB> as eob — a backchannel, never an eou (ABI v5 JSON)", func() {
-		feeds := 0
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			feeds++
-			if feeds == 1 {
-				return pool.cstr(`{"text":"uh-huh","eou":0,"eob":1,"frame_sec":0.08,` +
-					`"words":[{"w":"uh-huh","start":0.1,"end":0.3,"conf":0.9}]}`)
-			}
-			return pool.cstr(`{"text":"the turn","eou":1,"eob":0,"frame_sec":0.08,` +
-				`"words":[{"w":"the","start":0.5,"end":0.6,"conf":0.9},{"w":"turn","start":0.6,"end":0.8,"conf":0.9}]}`)
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"eob":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		Expect(got).To(HaveLen(4))
-		Expect(got[1].Eob).To(BeTrue())
-		Expect(got[1].Eou).To(BeFalse(), "a backchannel must not masquerade as a turn boundary")
-		Expect(got[2].Eou).To(BeTrue())
-	})
-
-	It("maps the v5 eou_out bitmask on the text path (bit0 <EOU>, bit1 <EOB>)", func() {
-		feeds := 0
-		CppStreamFeed = func(s uintptr, pcm []float32, n int32, eouOut unsafe.Pointer) uintptr {
-			feeds++
-			if feeds == 1 {
-				*(*int32)(eouOut) = 2 // <EOB> only
-				return pool.cstr("uh-huh")
-			}
-			*(*int32)(eouOut) = 1 // <EOU>
-			return pool.cstr(" done")
-		}
-		CppStreamFinalize = func(s uintptr) uintptr { return pool.cstr("") }
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		Expect(got).To(HaveLen(4))
-		Expect(got[1].Eob).To(BeTrue())
-		Expect(got[1].Eou).To(BeFalse())
-		Expect(got[2].Eou).To(BeTrue())
-		Expect(got[2].Eob).To(BeFalse())
-	})
-
-	It("accumulates trailing text after an EOU into the final transcript", func() {
-		feeds := 0
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			feeds++
-			if feeds == 1 {
-				return pool.cstr(`{"text":"turn one","eou":1,"frame_sec":0.08,"words":[]}`)
-			}
-			return pool.cstr(`{"text":" and more","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		final := got[len(got)-1].FinalResult
-		Expect(final.Text).To(Equal("turn one and more"))
-	})
-
-	It("resets the decode session on a mid-stream config", func() {
-		var begun, freed int
-		CppStreamBegin = func(ctx uintptr) uintptr { begun++; return uintptr(10 + begun) }
-		CppStreamFree = func(s uintptr) { freed++ }
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			return pool.cstr(`{"text":"x","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-		in <- liveConfig("") // reset
-		in <- liveAudio(make([]float32, 10))
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-
-		got := collectLive(out)
-		final := got[len(got)-1].FinalResult
-		Expect(final.Text).To(Equal("x"), "pre-reset transcript dropped")
-		Expect(begun).To(Equal(2))
-		Expect(freed).To(Equal(2), "old session freed on reset, new one on unwind")
-	})
-
-	It("does not hold engineMu between feeds (unary work interleaves with a live session)", func() {
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-		CppStreamFinalizeJSON = func(s uintptr) uintptr {
-			return pool.cstr(`{"text":"","eou":0,"frame_sec":0.08,"words":[]}`)
-		}
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-
-		// The session is open and idle between feeds: the engine lock must be
-		// acquirable, which is what lets batched unary transcription proceed
-		// mid-session. Under stream-lifetime locking this probe would block
-		// until the stream ended and the Eventually would time out.
-		locked := make(chan struct{})
-		go func() {
-			p.engineMu.Lock()
-			p.engineMu.Unlock() //nolint:staticcheck // probe: acquire-release proves availability
-			close(locked)
-		}()
-		Eventually(locked, time.Second).Should(BeClosed())
-
-		close(in)
-		Expect(<-errCh).NotTo(HaveOccurred())
-		collectLive(out)
-	})
-
-	It("errors out and reads last_error under the lock when a feed fails", func() {
-		CppStreamFeedJSON = func(s uintptr, pcm []float32, n int32) uintptr { return 0 }
-
-		in, out, errCh := runLive(p)
-		in <- liveConfig("")
-		in <- liveAudio(make([]float32, 10))
-
-		err := <-errCh
-		Expect(err).To(MatchError(ContainSubstring("stub error")))
-		got := collectLive(out)
-		Expect(got).To(HaveLen(1)) // just the ready ack
-		close(in)
-	})
-})
-
-var _ = Describe("stripEouMarker", func() {
-	It("strips a trailing <EOU> and reports it", func() {
-		text, eou := stripEouMarker("it is certainly very like the old portrait<EOU>")
-		Expect(text).To(Equal("it is certainly very like the old portrait"))
-		Expect(eou).To(BeTrue())
-	})
-
-	It("strips a trailing <EOB> WITHOUT reporting an utterance end", func() {
-		// A decode ending on a backchannel must not confirm the
-		// retranscribe gate — the user was acknowledging, not yielding.
-		text, eou := stripEouMarker("uh-huh<EOB>")
-		Expect(text).To(Equal("uh-huh"))
-		Expect(eou).To(BeFalse())
-	})
-
-	It("leaves marker-free text alone", func() {
-		text, eou := stripEouMarker("plain transcript")
-		Expect(text).To(Equal("plain transcript"))
-		Expect(eou).To(BeFalse())
-	})
-
-	It("does not strip a marker in the middle of the text", func() {
-		text, eou := stripEouMarker("a<EOU>b")
-		Expect(text).To(Equal("a<EOU>b"))
-		Expect(eou).To(BeFalse())
-	})
-})
-
-var _ = Describe("transcriptResultFromDoc EOU handling", func() {
-	It("strips the offline marker from text and sets the result flag", func() {
-		doc := transcriptJSON{Text: "the old portrait<EOU>"}
-		res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
-		Expect(res.Text).To(Equal("the old portrait"))
-		Expect(res.Eou).To(BeTrue())
-		Expect(res.Segments).To(HaveLen(1))
-		Expect(res.Segments[0].Text).To(Equal("the old portrait"))
-	})
-
-	It("reports eou=false for marker-free decodes", func() {
-		doc := transcriptJSON{Text: "no marker here"}
-		res := transcriptResultFromDoc(doc, &pb.TranscriptRequest{}, 0)
-		Expect(res.Text).To(Equal("no marker here"))
-		Expect(res.Eou).To(BeFalse())
-	})
-})
--- a/backend/go/parakeet-cpp/segments_test.go
+++ b/backend/go/parakeet-cpp/segments_test.go
@@ -106,7 +106,7 @@ var _ = Describe("transcriptResultFromDoc (multi-segment)", func() {
 var _ = Describe("streaming segment assembly", func() {
 	It("closes a segment with start/end from its words on EOU", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedResult{Delta: "hello world", Eou: true, Words: []transcriptWord{
+		acc.add(streamFeedJSON{Text: "hello world", Eou: 1, Words: []transcriptWord{
 			{W: "hello", Start: 0.0, End: 0.4}, {W: "world", Start: 0.4, End: 0.9},
 		}})
 		segs := acc.segments()
@@ -118,9 +118,9 @@ var _ = Describe("streaming segment assembly", func() {

 	It("buffers words across feeds until EOU", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedResult{Delta: "hi", Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
+		acc.add(streamFeedJSON{Text: "hi", Eou: 0, Words: []transcriptWord{{W: "hi", Start: 0, End: 0.3}}})
 		Expect(acc.segments()).To(BeEmpty())
-		acc.add(streamFeedResult{Delta: "there", Eou: true, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
+		acc.add(streamFeedJSON{Text: "there", Eou: 1, Words: []transcriptWord{{W: "there", Start: 0.3, End: 0.7}}})
 		Expect(acc.segments()).To(HaveLen(1))
 		Expect(acc.segments()[0].Text).To(Equal("hi there"))
 	})
@@ -129,7 +129,7 @@ var _ = Describe("streaming segment assembly", func() {
 	// field; a backchannel must still close the segment as it did in v4.
 	It("closes a segment on EOB (backchannel) too", func() {
 		acc := &streamSegmenter{}
-		acc.add(streamFeedResult{Delta: "uh huh", Eob: true, Words: []transcriptWord{
+		acc.add(streamFeedJSON{Text: "uh huh", Eou: 0, Eob: 1, Words: []transcriptWord{
 			{W: "uh", Start: 0.0, End: 0.2}, {W: "huh", Start: 0.2, End: 0.5},
 		}})
 		segs := acc.segments()
@@ -137,18 +137,4 @@ var _ = Describe("streaming segment assembly", func() {
 		Expect(segs[0].Text).To(Equal("uh huh"))
 		Expect(segs[0].End).To(Equal(secondsToNanos(0.5)))
 	})
-
-	// Older text-only libparakeet.so: no per-word timings, so a segment is cut
-	// from the delta text on each <EOU>/<EOB> (no timestamps), one per utterance.
-	It("falls back to text segments when the feed carries no words", func() {
-		acc := &streamSegmenter{}
-		acc.add(streamFeedResult{Delta: "first turn", Eou: true})
-		acc.add(streamFeedResult{Delta: "second turn", Eou: true})
-		segs := acc.segments()
-		Expect(segs).To(HaveLen(2))
-		Expect(segs[0].Text).To(Equal("first turn"))
-		Expect(segs[1].Text).To(Equal("second turn"))
-		Expect(segs[0].Start).To(Equal(int64(0)), "no per-word timing on the text path")
-		Expect(segs[0].End).To(Equal(int64(0)))
-	})
 })
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=2574f5936571645f784b77623e1f09bad97d948a
+STABLEDIFFUSION_GGML_VERSION?=8caa3f908ae6d4a4bef531e73b9a969f266a3d1f

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/stablediffusion-ggml/cpp/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/cpp/gosd.cpp
@@ -798,7 +798,6 @@ void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed) {
 int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count) {

    sd_image_t* results;
-    int num_results_out = 0;

    std::vector<int> skip_layers = {7, 8, 9};

@@ -995,14 +994,10 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
            sd_ctx_params_to_str(&ctx_params),
            sd_img_gen_params_to_str(p));

-    bool gen_ok = generate_image(sd_c, p, &results, &num_results_out);
+    results = generate_image(sd_c, p);

    std::free(p);

-    if (!gen_ok || num_results_out == 0) {
-        results = NULL;
-    }
-
    if (results == NULL) {
        fprintf (stderr, "NO results\n");
        if (input_image_buffer) free(input_image_buffer);
--- a/backend/go/voice-detect/.gitignore
+++ b/backend/go/voice-detect/.gitignore
@@ -1,18 +0,0 @@
-# Fetched upstream sources
-sources/
-
-# CMake build directories
-build*/
-
-# build artifacts staged in-tree by the Makefile (cp from sources/) or
-# symlinked for local dev; the real sources live in voice-detect.cpp upstream.
-*.so
-*.so.*
-voicedetect_capi.h
-compile_commands.json
-
-# Compiled backend binary
-voice-detect-grpc
-
-# Packaging output
-package/
--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -1,107 +0,0 @@
-# voice-detect backend Makefile.
-#
-# Upstream pin lives below as VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
-# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
-#
-# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
-# symlink the .so + header into this directory and skip the clone/cmake steps:
-#
-#   ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
-#   ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
-#   go build -o voice-detect-grpc .
-#
-# The default target below does the proper clone-at-pin + cmake build so CI does
-# not need a side-checkout.
-
-VOICEDETECT_VERSION?=1db1759572c90faef6f3a78c36b5941a096a9f89
-VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
-
-GOCMD?=go
-GO_TAGS?=
-JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
-
-BUILD_TYPE?=
-NATIVE?=false
-
-# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
-# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
-RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
-
-# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
-# self-contained: dlopen needs no libggml*.so alongside it, only system libs
-# (libstdc++/libgomp/libc) that the runtime image already provides.
-CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-
-ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
-endif
-
-# voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
-# does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
-# -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_*
-# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
-ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
-	# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, reaches
-	# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
-	# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
-	# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
-	ifeq ($(CUDA_MAJOR_VERSION),13)
-	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
-		CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
-	endif
-	endif
-else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
-else ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
-else ifeq ($(BUILD_TYPE),metal)
-	CMAKE_ARGS+=-DVOICEDETECT_GGML_METAL=ON
-endif
-
-.PHONY: voice-detect-grpc package build clean purge test all
-
-all: voice-detect-grpc
-
-# Clone the upstream voice-detect.cpp source at the pinned commit. Directory acts
-# as the target so make only re-clones when missing. After a VOICEDETECT_VERSION
-# bump, run 'make purge && make' to refetch.
-sources/voice-detect.cpp:
-	mkdir -p sources/voice-detect.cpp
-	cd sources/voice-detect.cpp && \
-	git init -q && \
-	git remote add origin $(VOICEDETECT_REPO) && \
-	git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
-	git checkout FETCH_HEAD && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-# Build the shared lib + header out-of-tree, then stage them next to the Go
-# sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
-# them up.
-libvoicedetect.so: sources/voice-detect.cpp
-	cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
-	cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
-	cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./ 2>/dev/null || true
-	cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
-
-voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
-	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .
-
-package: voice-detect-grpc
-	bash package.sh
-
-build: package
-
-# Test target. The embed/verify/analyze smoke specs are gated on
-# VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
-# heavy specs auto-skip and only the pure-Go parsing specs run.
-test:
-	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
-
-clean: purge
-	rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
-
-purge:
-	rm -rf sources/voice-detect.cpp
--- a/backend/go/voice-detect/govoicedetect.go
+++ b/backend/go/voice-detect/govoicedetect.go
@@ -1,273 +0,0 @@
-package main
-
-import (
-	"encoding/json"
-	"errors"
-	"fmt"
-	"math"
-	"os"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"time"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/xlog"
-)
-
-// purego-bound entry points from libvoicedetect.so. Names match
-// voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi`
-// is enough to spot drift.
-//
-// The opaque ctx and the malloc'd char*/float* return values are declared as
-// uintptr so we get the raw pointer back and can release it via the matching
-// capi free function. purego's native string/[]float32 returns would copy and
-// forget the original pointer, leaking the C-owned buffer on every call.
-var (
-	CppAbiVersion  func() int32
-	CppLoad        func(ggufPath string) uintptr
-	CppFree        func(ctx uintptr)
-	CppLastError   func(ctx uintptr) string
-	CppFreeString  func(s uintptr)
-	CppFreeVec     func(v uintptr)
-	CppEmbedPath   func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
-	CppEmbedPCM    func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
-	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
-	CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
-)
-
-// VoiceDetect implements the speaker-recognition voice subset of the Backend
-// gRPC service over libvoicedetect.so. The C side keeps a single loaded model
-// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
-// serializes every call.
-type VoiceDetect struct {
-	base.SingleThread
-	opts   loadOptions
-	ctxPtr uintptr
-}
-
-func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
-	model := opts.ModelFile
-	if model == "" {
-		model = opts.ModelPath
-	}
-	if !filepath.IsAbs(model) && opts.ModelPath != "" {
-		model = filepath.Join(opts.ModelPath, model)
-	}
-	if model == "" {
-		return errors.New("voice-detect: ModelFile is required")
-	}
-
-	v.opts = parseOptions(opts.Options)
-	if v.opts.modelName == "" {
-		v.opts.modelName = filepath.Base(model)
-	}
-
-	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
-	// one backend process per model and serves requests concurrently, so the
-	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
-	// VOICEDETECT_THREADS is read by the engine at backend construction, so it
-	// must be set before the capi load. A non-positive Threads means "unset":
-	// leave the env alone so the engine keeps its sane default.
-	threads := opts.Threads
-	if threads > 0 {
-		if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
-			return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err)
-		}
-		xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads)
-	}
-
-	xlog.Info("voice-detect: loading model", "model", model,
-		"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
-
-	ctx := CppLoad(model)
-	if ctx == 0 {
-		// The last-error buffer lives on the ctx that was never returned, so
-		// surface the path the operator tried to load instead.
-		return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
-	}
-	v.ctxPtr = ctx
-	return nil
-}
-
-// VoiceEmbed returns the L2-normalized speaker embedding for an audio clip.
-// The request carries a filesystem PATH; the HTTP layer materializes
-// base64/URL/data-URI inputs to a temp file before the gRPC call.
-func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
-	if v.ctxPtr == 0 {
-		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
-	}
-	if req.Audio == "" {
-		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
-	}
-	emb, err := v.embedPath(req.Audio)
-	if err != nil {
-		return pb.VoiceEmbedResponse{}, err
-	}
-	return pb.VoiceEmbedResponse{Embedding: emb, Model: v.opts.modelName}, nil
-}
-
-func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
-	var vec uintptr
-	var dim int32
-	rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
-	if rc != 0 || vec == 0 || dim <= 0 {
-		return nil, v.lastErr("embed", path)
-	}
-	defer CppFreeVec(vec)
-	// Copy out of the C-owned malloc'd buffer before freeing it. The
-	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
-	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
-	// nor moves this buffer and we copy immediately.
-	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
-	out := make([]float32, int(dim))
-	copy(out, src)
-	return out, nil
-}
-
-// VoiceVerify embeds two clips and reports whether they are the same speaker by
-// cosine distance against a threshold. A request threshold <= 0 falls back to
-// the model-configured default (verify_threshold option, 0.25 if unset).
-func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
-	if v.ctxPtr == 0 {
-		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
-	}
-	if req.Audio1 == "" || req.Audio2 == "" {
-		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
-	}
-
-	threshold := req.Threshold
-	if threshold <= 0 {
-		threshold = v.opts.verifyThreshold
-	}
-
-	started := time.Now()
-	var distance float32
-	var verified int32
-	rc := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, threshold,
-		unsafe.Pointer(&distance), unsafe.Pointer(&verified))
-	if rc != 0 {
-		return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
-	}
-	elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
-
-	// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
-	// matching the Python speaker-recognition backend's reporting.
-	confidence := float32(0)
-	if threshold > 0 {
-		confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
-	}
-
-	return pb.VoiceVerifyResponse{
-		Verified:         verified != 0,
-		Distance:         distance,
-		Threshold:        threshold,
-		Confidence:       confidence,
-		Model:            v.opts.modelName,
-		ProcessingTimeMs: elapsedMs,
-	}, nil
-}
-
-// VoiceAnalyze runs the age/gender/emotion heads on a single clip. The C-API
-// always evaluates every supported head, so the request's actions filter is
-// advisory and the full analysis is returned as a single segment (the engine
-// does not produce time-bounded segments).
-func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
-	if v.ctxPtr == 0 {
-		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
-	}
-	if req.Audio == "" {
-		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
-	}
-
-	ptr := CppAnalyzeJSON(v.ctxPtr, req.Audio)
-	if ptr == 0 {
-		return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
-	}
-	defer CppFreeString(ptr)
-
-	seg, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
-	if err != nil {
-		return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, err)
-	}
-	return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{seg}}, nil
-}
-
-// analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json:
-//
-//	{"age":42.0,
-//	 "gender":{"label":"female","female":0.88,"male":0.12},
-//	 "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
-//
-// gender is a mixed object (a "label" string plus per-class float scores), so
-// it is decoded into raw messages and split in parseAnalyzeJSON.
-type analyzeJSON struct {
-	Age     float32                    `json:"age"`
-	Gender  map[string]json.RawMessage `json:"gender"`
-	Emotion struct {
-		Label  string             `json:"label"`
-		Scores map[string]float32 `json:"scores"`
-	} `json:"emotion"`
-}
-
-// parseAnalyzeJSON maps the engine's analyze document onto a VoiceAnalysis.
-// start/end stay 0: the model emits a single whole-utterance result, not
-// time-bounded segments.
-func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
-	var a analyzeJSON
-	if err := json.Unmarshal([]byte(doc), &a); err != nil {
-		return nil, err
-	}
-
-	seg := &pb.VoiceAnalysis{
-		Age:             a.Age,
-		DominantEmotion: a.Emotion.Label,
-		Emotion:         a.Emotion.Scores,
-	}
-
-	if len(a.Gender) > 0 {
-		gender := make(map[string]float32, len(a.Gender))
-		for k, raw := range a.Gender {
-			if k == "label" {
-				_ = json.Unmarshal(raw, &seg.DominantGender)
-				continue
-			}
-			var score float32
-			if err := json.Unmarshal(raw, &score); err == nil {
-				gender[k] = score
-			}
-		}
-		seg.Gender = gender
-	}
-
-	return seg, nil
-}
-
-// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
-func (v *VoiceDetect) lastErr(op, subject string) error {
-	msg := strings.TrimSpace(CppLastError(v.ctxPtr))
-	if msg == "" {
-		msg = "no error detail"
-	}
-	return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, msg)
-}
-
-// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
-// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
-//
-// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
-// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
-// moves the buffer and we dereference it immediately to copy the bytes out.
-func goStringFromCPtr(cptr uintptr) string {
-	if cptr == 0 {
-		return ""
-	}
-	p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
-	n := 0
-	for *(*byte)(unsafe.Add(p, n)) != 0 {
-		n++
-	}
-	return string(unsafe.Slice((*byte)(p), n))
-}
--- a/backend/go/voice-detect/govoicedetect_test.go
+++ b/backend/go/voice-detect/govoicedetect_test.go
@@ -1,144 +0,0 @@
-package main
-
-import (
-	"os"
-	"sync"
-	"testing"
-
-	"github.com/ebitengine/purego"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func TestVoiceDetect(t *testing.T) {
-	RegisterFailHandler(Fail)
-	RunSpecs(t, "voice-detect Backend Suite")
-}
-
-var (
-	libLoadOnce sync.Once
-	libLoadErr  error
-)
-
-// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
-// bridge without spinning up the gRPC server. Records the error (the smoke
-// specs skip themselves) when libvoicedetect.so is not loadable from cwd
-// (LD_LIBRARY_PATH or a symlink in ./).
-func ensureLibLoaded() error {
-	libLoadOnce.Do(func() {
-		libName := os.Getenv("VOICEDETECT_LIBRARY")
-		if libName == "" {
-			libName = "libvoicedetect.so"
-		}
-		lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-		if err != nil {
-			libLoadErr = err
-			return
-		}
-		purego.RegisterLibFunc(&CppAbiVersion, lib, "voicedetect_capi_abi_version")
-		purego.RegisterLibFunc(&CppLoad, lib, "voicedetect_capi_load")
-		purego.RegisterLibFunc(&CppFree, lib, "voicedetect_capi_free")
-		purego.RegisterLibFunc(&CppLastError, lib, "voicedetect_capi_last_error")
-		purego.RegisterLibFunc(&CppFreeString, lib, "voicedetect_capi_free_string")
-		purego.RegisterLibFunc(&CppFreeVec, lib, "voicedetect_capi_free_vec")
-		purego.RegisterLibFunc(&CppEmbedPath, lib, "voicedetect_capi_embed_path")
-		purego.RegisterLibFunc(&CppEmbedPCM, lib, "voicedetect_capi_embed_pcm")
-		purego.RegisterLibFunc(&CppVerifyPaths, lib, "voicedetect_capi_verify_paths")
-		purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "voicedetect_capi_analyze_path_json")
-	})
-	return libLoadErr
-}
-
-var _ = Describe("parseOptions", func() {
-	It("defaults verify_threshold to 0.25", func() {
-		o := parseOptions(nil)
-		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
-		Expect(o.modelName).To(Equal(""))
-	})
-
-	It("parses verify_threshold, threshold alias and model_name", func() {
-		o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
-		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
-		Expect(o.modelName).To(Equal("ecapa"))
-
-		o2 := parseOptions([]string{"threshold:0.3"})
-		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
-	})
-
-	It("ignores non-positive thresholds and keeps the default", func() {
-		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
-		Expect(o.verifyThreshold).To(Equal(float32(0.25)))
-	})
-})
-
-var _ = Describe("parseAnalyzeJSON", func() {
-	It("maps age, gender label+scores and emotion label+scores", func() {
-		doc := `{"age":42.0,
-			"gender":{"label":"female","female":0.88,"male":0.12},
-			"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
-		seg, err := parseAnalyzeJSON(doc)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
-		Expect(seg.Start).To(Equal(float32(0)))
-		Expect(seg.End).To(Equal(float32(0)))
-
-		Expect(seg.DominantGender).To(Equal("female"))
-		Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
-		Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
-		// The "label" entry is consumed into DominantGender, not the score map.
-		Expect(seg.Gender).ToNot(HaveKey("label"))
-
-		Expect(seg.DominantEmotion).To(Equal("neutral"))
-		Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
-		Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
-	})
-
-	It("tolerates a missing gender block", func() {
-		seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(seg.DominantGender).To(Equal(""))
-		Expect(seg.DominantEmotion).To(Equal("happy"))
-	})
-
-	It("returns an error on malformed JSON", func() {
-		_, err := parseAnalyzeJSON(`{not-json`)
-		Expect(err).To(HaveOccurred())
-	})
-})
-
-// The specs below exercise the real C-API end to end. They run only when both a
-// model GGUF and a test WAV are provided, and skip cleanly otherwise so the
-// suite stays green without large assets.
-var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
-	var (
-		v         *VoiceDetect
-		modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
-		wavPath   = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
-	)
-
-	BeforeAll(func() {
-		if modelPath == "" || wavPath == "" {
-			Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
-		}
-		if err := ensureLibLoaded(); err != nil {
-			Skip("libvoicedetect.so not loadable: " + err.Error())
-		}
-		v = &VoiceDetect{}
-		Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
-	})
-
-	It("embeds an audio clip", func() {
-		resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Embedding).ToNot(BeEmpty())
-		Expect(resp.Model).ToNot(BeEmpty())
-	})
-
-	It("verifies a clip against itself as the same speaker", func() {
-		resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
-		Expect(err).ToNot(HaveOccurred())
-		Expect(resp.Verified).To(BeTrue())
-		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
-	})
-})
--- a/backend/go/voice-detect/main.go
+++ b/backend/go/voice-detect/main.go
@@ -1,64 +0,0 @@
-package main
-
-// Started internally by LocalAI - one gRPC server per loaded model.
-//
-// Loads libvoicedetect.so via purego and registers the flat C-API entry points
-// declared in voicedetect_capi.h. The library name can be overridden with
-// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
-// convention in the sibling backends); the default looks for the .so next to
-// this binary (resolved via LD_LIBRARY_PATH by run.sh).
-import (
-	"flag"
-	"fmt"
-	"os"
-
-	"github.com/ebitengine/purego"
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-type LibFuncs struct {
-	FuncPtr any
-	Name    string
-}
-
-func main() {
-	libName := os.Getenv("VOICEDETECT_LIBRARY")
-	if libName == "" {
-		libName = "libvoicedetect.so"
-	}
-
-	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
-	if err != nil {
-		panic(fmt.Errorf("voice-detect: dlopen %q: %w", libName, err))
-	}
-
-	// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
-	// uintptr so the raw pointer can be freed via the matching capi free fn.
-	libFuncs := []LibFuncs{
-		{&CppAbiVersion, "voicedetect_capi_abi_version"},
-		{&CppLoad, "voicedetect_capi_load"},
-		{&CppFree, "voicedetect_capi_free"},
-		{&CppLastError, "voicedetect_capi_last_error"},
-		{&CppFreeString, "voicedetect_capi_free_string"},
-		{&CppFreeVec, "voicedetect_capi_free_vec"},
-		{&CppEmbedPath, "voicedetect_capi_embed_path"},
-		{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
-		{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
-		{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
-	}
-	for _, lf := range libFuncs {
-		purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
-	}
-
-	fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
-
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &VoiceDetect{}); err != nil {
-		panic(err)
-	}
-}
--- a/backend/go/voice-detect/options.go
+++ b/backend/go/voice-detect/options.go
@@ -1,46 +0,0 @@
-package main
-
-import (
-	"strconv"
-	"strings"
-)
-
-// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
-// not set one. Matches the Python speaker-recognition backend's default so the
-// two implementations agree on verdicts out of the box.
-const defaultVerifyThreshold float32 = 0.25
-
-// loadOptions holds the parsed model-level options for voice-detect.
-type loadOptions struct {
-	verifyThreshold float32
-	modelName       string
-}
-
-func splitOption(o string) (key, value string, ok bool) {
-	i := strings.Index(o, ":")
-	if i < 0 {
-		return "", "", false
-	}
-	return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
-}
-
-// parseOptions reads the backend "key:value" option slice. Unknown keys are
-// ignored. Defaults: verify_threshold 0.25, model_name derived from the file.
-func parseOptions(opts []string) loadOptions {
-	o := loadOptions{verifyThreshold: defaultVerifyThreshold}
-	for _, oo := range opts {
-		key, value, ok := splitOption(oo)
-		if !ok {
-			continue
-		}
-		switch key {
-		case "verify_threshold", "threshold":
-			if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
-				o.verifyThreshold = float32(f)
-			}
-		case "model_name":
-			o.modelName = value
-		}
-	}
-	return o
-}
--- a/backend/go/voice-detect/package.sh
+++ b/backend/go/voice-detect/package.sh
@@ -1,68 +0,0 @@
-#!/bin/bash
-#
-# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs
-# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
-# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh;
-# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
-# is used instead of the host's.
-
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-REPO_ROOT="${CURDIR}/../../.."
-
-mkdir -p "$CURDIR/package/lib"
-
-cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
-cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
-
-# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
-# LD_LIBRARY_PATH, which run.sh points at lib/.
-cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
-	echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
-	exit 1
-}
-
-# Detect architecture and copy the core runtime libs libvoicedetect.so links
-# against, plus the matching dynamic loader as lib/ld.so.
-if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
-    echo "Detected x86_64 architecture, copying x86_64 libraries..."
-    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
-    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
-    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
-    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
-    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
-elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
-    echo "Detected ARM64 architecture, copying ARM64 libraries..."
-    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
-    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
-    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
-    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
-    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
-elif [ "$(uname -s)" = "Darwin" ]; then
-    echo "Detected Darwin"
-else
-    echo "Error: Could not detect architecture"
-    exit 1
-fi
-
-# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
-# BUILD_TYPE so the backend can reach the GPU without the runtime base image
-# shipping those drivers.
-GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
-if [ -f "$GPU_LIB_SCRIPT" ]; then
-    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
-    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
-    package_gpu_libs
-fi
-
-echo "Packaging completed successfully"
-ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/voice-detect/run.sh
+++ b/backend/go/voice-detect/run.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-
-export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
-
-# If a self-contained ld.so was packaged, route through it so the packaged
-# libc / libstdc++ are used instead of the host's (matches the whisper /
-# parakeet backends' runtime layout).
-if [ -f "$CURDIR/lib/ld.so" ]; then
-	echo "Using lib/ld.so"
-	exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
-fi
-
-exec "$CURDIR/voice-detect-grpc" "$@"
--- a/backend/go/voice-detect/test.sh
+++ b/backend/go/voice-detect/test.sh
@@ -1,14 +0,0 @@
-#!/bin/bash
-set -e
-
-CURDIR=$(dirname "$(realpath "$0")")
-cd "$CURDIR"
-
-echo "Running voice-detect backend tests..."
-
-# The pure-Go parsing specs always run. The embed/verify/analyze smoke specs run
-# only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
-# VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip.
-LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
-
-echo "voice-detect tests completed."
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=6fc7c33b4c3a2cec83e4b65abd5e96a890480375
+WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/whisper/run.sh
+++ b/backend/go/whisper/run.sh
@@ -13,14 +13,8 @@ if [ "$(uname)" != "Darwin" ]; then
 fi

 if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single fallback variant (Metal/Accelerate). The cmake build emits a
-	# Mach-O named .so, but tolerate .dylib too — pick whichever exists so the Go
-	# loader doesn't panic on a hardcoded name that isn't on disk.
-	if [ -e "$CURDIR/libgowhisper-fallback.dylib" ]; then
-		LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
-	else
-		LIBRARY="$CURDIR/libgowhisper-fallback.so"
-	fi
+	# macOS: single dylib variant (Metal or Accelerate)
+	LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
 	export DYLD_LIBRARY_PATH="$CURDIR"/lib:$DYLD_LIBRARY_PATH
 else
 	LIBRARY="$CURDIR/libgowhisper-fallback.so"
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -209,78 +209,6 @@
    nvidia-cuda-12: "cuda12-ced"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
- &voicedetect
-  name: "voice-detect"
-  alias: "voice-detect"
-  license: mit
-  icon: https://avatars.githubusercontent.com/u/95302084
-  description: |
-    voice-detect speaker recognition and voice analysis.
-    voice-detect.cpp is a C++/ggml engine that produces L2-normalised
-    speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker
-    ERes2Net, CAM++) for voice verification and 1:N identification, plus
-    a wav2vec2 age / gender / emotion analysis head. It replaces the
-    Python speaker-recognition backend and is exposed through the Voice*
-    gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA
-    CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
-  urls:
-    - https://github.com/mudler/voice-detect.cpp
-  tags:
-    - voice-recognition
-    - speaker-verification
-    - speaker-embedding
-    - CPU
-    - GPU
-    - CUDA
-    - HIP
-  capabilities:
-    default: "cpu-voice-detect"
-    nvidia: "cuda12-voice-detect"
-    intel: "intel-sycl-f16-voice-detect"
-    metal: "metal-voice-detect"
-    amd: "rocm-voice-detect"
-    vulkan: "vulkan-voice-detect"
-    nvidia-l4t: "nvidia-l4t-arm64-voice-detect"
-    nvidia-cuda-13: "cuda13-voice-detect"
-    nvidia-cuda-12: "cuda12-voice-detect"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect"
- &facedetect
-  name: "face-detect"
-  alias: "face-detect"
-  license: mit
-  icon: https://avatars.githubusercontent.com/u/95302084
-  description: |
-    face-detect face detection, embedding, verification and analysis.
-    face-detect.cpp is a C++/ggml engine that runs SCRFD / YuNet face
-    detection and ArcFace / SFace 512-d (or 128-d) L2-normalised face
-    embeddings for verification and 1:N identification, plus a landmark /
-    age / gender analysis head. It replaces the Python insightface backend
-    and is exposed through the Embedding, Detect and Face* gRPC rpcs and
-    the /v1/face/* REST endpoints. It runs on CPU, NVIDIA CUDA, AMD
-    ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
-  urls:
-    - https://github.com/mudler/face-detect.cpp
-  tags:
-    - face-recognition
-    - face-verification
-    - face-embedding
-    - CPU
-    - GPU
-    - CUDA
-    - HIP
-  capabilities:
-    default: "cpu-face-detect"
-    nvidia: "cuda12-face-detect"
-    intel: "intel-sycl-f16-face-detect"
-    metal: "metal-face-detect"
-    amd: "rocm-face-detect"
-    vulkan: "vulkan-face-detect"
-    nvidia-l4t: "nvidia-l4t-arm64-face-detect"
-    nvidia-cuda-13: "cuda13-face-detect"
-    nvidia-cuda-12: "cuda12-face-detect"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -1428,6 +1356,7 @@
    intel: "intel-fish-speech"
    amd: "rocm-fish-speech"
    nvidia-l4t: "nvidia-l4t-fish-speech"
+    metal: "metal-fish-speech"
    default: "cpu-fish-speech"
    nvidia-cuda-13: "cuda13-fish-speech"
    nvidia-cuda-12: "cuda12-fish-speech"
@@ -2899,236 +2828,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
-## voice-detect
- !!merge <<: *voicedetect
-  name: "voice-detect-development"
-  capabilities:
-    default: "cpu-voice-detect-development"
-    nvidia: "cuda12-voice-detect-development"
-    intel: "intel-sycl-f16-voice-detect-development"
-    metal: "metal-voice-detect-development"
-    amd: "rocm-voice-detect-development"
-    vulkan: "vulkan-voice-detect-development"
-    nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development"
-    nvidia-cuda-13: "cuda13-voice-detect-development"
-    nvidia-cuda-12: "cuda12-voice-detect-development"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development"
- !!merge <<: *voicedetect
-  name: "nvidia-l4t-arm64-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "nvidia-l4t-arm64-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda13-nvidia-l4t-arm64-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda13-nvidia-l4t-arm64-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "cpu-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-cpu-voice-detect
- !!merge <<: *voicedetect
-  name: "cpu-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-cpu-voice-detect
- !!merge <<: *voicedetect
-  name: "metal-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "metal-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda12-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda12-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect
- !!merge <<: *voicedetect
-  name: "rocm-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect
- !!merge <<: *voicedetect
-  name: "rocm-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-rocm-hipblas-voice-detect
- !!merge <<: *voicedetect
-  name: "intel-sycl-f32-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect
- !!merge <<: *voicedetect
-  name: "intel-sycl-f32-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect
- !!merge <<: *voicedetect
-  name: "intel-sycl-f16-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect
- !!merge <<: *voicedetect
-  name: "intel-sycl-f16-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect
- !!merge <<: *voicedetect
-  name: "vulkan-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-vulkan-voice-detect
- !!merge <<: *voicedetect
-  name: "vulkan-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-vulkan-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda13-voice-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect
- !!merge <<: *voicedetect
-  name: "cuda13-voice-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect
-## face-detect
- !!merge <<: *facedetect
-  name: "face-detect-development"
-  capabilities:
-    default: "cpu-face-detect-development"
-    nvidia: "cuda12-face-detect-development"
-    intel: "intel-sycl-f16-face-detect-development"
-    metal: "metal-face-detect-development"
-    amd: "rocm-face-detect-development"
-    vulkan: "vulkan-face-detect-development"
-    nvidia-l4t: "nvidia-l4t-arm64-face-detect-development"
-    nvidia-cuda-13: "cuda13-face-detect-development"
-    nvidia-cuda-12: "cuda12-face-detect-development"
-    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect-development"
-    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect-development"
- !!merge <<: *facedetect
-  name: "nvidia-l4t-arm64-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-arm64-face-detect
- !!merge <<: *facedetect
-  name: "nvidia-l4t-arm64-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-arm64-face-detect
- !!merge <<: *facedetect
-  name: "cuda13-nvidia-l4t-arm64-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect
- !!merge <<: *facedetect
-  name: "cuda13-nvidia-l4t-arm64-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect
- !!merge <<: *facedetect
-  name: "cpu-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-cpu-face-detect
- !!merge <<: *facedetect
-  name: "cpu-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-face-detect"
-  mirrors:
-    - localai/localai-backends:master-cpu-face-detect
- !!merge <<: *facedetect
-  name: "metal-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-face-detect
- !!merge <<: *facedetect
-  name: "metal-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-face-detect"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-face-detect
- !!merge <<: *facedetect
-  name: "cuda12-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-12-face-detect
- !!merge <<: *facedetect
-  name: "cuda12-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-12-face-detect
- !!merge <<: *facedetect
-  name: "rocm-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-rocm-hipblas-face-detect
- !!merge <<: *facedetect
-  name: "rocm-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-rocm-hipblas-face-detect
- !!merge <<: *facedetect
-  name: "intel-sycl-f32-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f32-face-detect
- !!merge <<: *facedetect
-  name: "intel-sycl-f32-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f32-face-detect
- !!merge <<: *facedetect
-  name: "intel-sycl-f16-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-intel-sycl-f16-face-detect
- !!merge <<: *facedetect
-  name: "intel-sycl-f16-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-intel-sycl-f16-face-detect
- !!merge <<: *facedetect
-  name: "vulkan-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-vulkan-face-detect
- !!merge <<: *facedetect
-  name: "vulkan-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-vulkan-face-detect
- !!merge <<: *facedetect
-  name: "cuda13-face-detect"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-face-detect"
-  mirrors:
-    - localai/localai-backends:latest-gpu-nvidia-cuda-13-face-detect
- !!merge <<: *facedetect
-  name: "cuda13-face-detect-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-face-detect"
-  mirrors:
-    - localai/localai-backends:master-gpu-nvidia-cuda-13-face-detect
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
@@ -5171,6 +4870,7 @@
    intel: "intel-fish-speech-development"
    amd: "rocm-fish-speech-development"
    nvidia-l4t: "nvidia-l4t-fish-speech-development"
+    metal: "metal-fish-speech-development"
    default: "cpu-fish-speech-development"
    nvidia-cuda-13: "cuda13-fish-speech-development"
    nvidia-cuda-12: "cuda12-fish-speech-development"
@@ -5246,6 +4946,16 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech
+- !!merge <<: *fish-speech
+  name: "metal-fish-speech"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-fish-speech"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-fish-speech
+- !!merge <<: *fish-speech
+  name: "metal-fish-speech-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-fish-speech"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-fish-speech
 ## faster-qwen3-tts
 - !!merge <<: *faster-qwen3-tts
  name: "faster-qwen3-tts-development"
--- a/backend/python/ace-step/requirements-hipblas.txt
+++ b/backend/python/ace-step/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm7.0
-torch==2.10.0+rocm7.0
+torch==2.12.0+cpu
 torchaudio
 torchvision

--- a/backend/python/common/grpc_auth.py
+++ b/backend/python/common/grpc_auth.py
@@ -11,8 +11,6 @@ import os

 import grpc

-from parent_watch import start_parent_death_watcher
-

 class _AbortHandler(grpc.RpcMethodHandler):
    """A method handler that immediately aborts with UNAUTHENTICATED."""
@@ -72,13 +70,6 @@ def get_auth_interceptors(*, aio: bool = False):

    Returns an empty list when LOCALAI_GRPC_AUTH_TOKEN is not set.
    """
-    # Arm the best-effort parent-death backstop here: this is the single helper
-    # every LocalAI Python backend invokes exactly once while building its gRPC
-    # server (mirroring how the Go watcher arms in pkg/grpc's shared serve path).
-    # start_parent_death_watcher() is idempotent and a no-op when disabled or on
-    # unsupported platforms — see parent_watch.py.
-    start_parent_death_watcher()
-
    token = os.environ.get("LOCALAI_GRPC_AUTH_TOKEN", "")
    if not token:
        return []
--- a/backend/python/common/mlx_utils.py
+++ b/backend/python/common/mlx_utils.py
@@ -20,15 +20,7 @@ def split_reasoning(text, think_start, think_end):
    Returns ``(reasoning_content, remaining_text)``. When ``think_start`` is
    empty or not found, returns ``("", text)`` unchanged.
    """
-    if not think_start or not text:
-        return "", text
-    if think_start not in text:
-        # Models like Qwen3.5 open assistant turns already INSIDE thinking, so
-        # the generated text carries only the closing tag. Everything before it
-        # is reasoning that would otherwise leak into the content.
-        if think_end and think_end in text:
-            head, _, tail = text.partition(think_end)
-            return head.strip(), tail.strip()
+    if not think_start or not text or think_start not in text:
        return "", text
    pattern = re.compile(
        re.escape(think_start) + r"(.*?)" + re.escape(think_end or ""),
--- a/backend/python/common/mlx_utils_test.py
+++ b/backend/python/common/mlx_utils_test.py
@@ -1,75 +0,0 @@
-"""Unit tests for the mlx/mlx-vlm shared helpers (mlx_utils.py).
-
-Run standalone (Python standard library only, no backend venv needed):
-    python3 -m unittest mlx_utils_test
-
-These mirror the server-less helper tests in backend/python/mlx/test.py
-(TestSharedHelpers), but live here so they run on any platform: the mlx
-test module imports grpc/backend_pb2 at import time and needs the MLX venv,
-whereas mlx_utils only needs the standard library.
-"""
-
-import types
-import unittest
-
-from mlx_utils import parse_tool_calls, split_reasoning
-
-
-class TestSplitReasoning(unittest.TestCase):
-    def test_both_tags(self):
-        r, c = split_reasoning(
-            "<think>step 1\nstep 2</think>The answer is 42.", "<think>", "</think>"
-        )
-        self.assertEqual(r, "step 1\nstep 2")
-        self.assertEqual(c, "The answer is 42.")
-
-    def test_implicit_opener_only_closing_tag(self):
-        # Qwen3.5 opens the assistant turn already inside thinking, so the
-        # output carries only the closing tag; everything before it is reasoning.
-        r, c = split_reasoning(
-            "The user is asking about the weather.\n</think>\n\nThe weather in Rome is sunny.",
-            "<think>",
-            "</think>",
-        )
-        self.assertEqual(r, "The user is asking about the weather.")
-        self.assertEqual(c, "The weather in Rome is sunny.")
-
-    def test_no_tags_at_all(self):
-        r, c = split_reasoning("just text", "<think>", "</think>")
-        self.assertEqual(r, "")
-        self.assertEqual(c, "just text")
-
-    def test_empty_think_end_and_no_opener_match(self):
-        # No think_end to anchor on, and the opener is absent → return unchanged.
-        r, c = split_reasoning("no opener here", "<think>", "")
-        self.assertEqual(r, "")
-        self.assertEqual(c, "no opener here")
-
-    def test_empty_text(self):
-        r, c = split_reasoning("", "<think>", "</think>")
-        self.assertEqual(r, "")
-        self.assertEqual(c, "")
-
-
-class TestParseToolCalls(unittest.TestCase):
-    def test_with_shim(self):
-        tm = types.SimpleNamespace(
-            tool_call_start="<tool_call>",
-            tool_call_end="</tool_call>",
-            parse_tool_call=lambda body, tools: {
-                "name": "get_weather",
-                "arguments": {"location": body.strip()},
-            },
-        )
-        calls, remaining = parse_tool_calls(
-            "Sure: <tool_call>Paris</tool_call>", tm, tools=None
-        )
-        self.assertEqual(len(calls), 1)
-        self.assertEqual(calls[0]["name"], "get_weather")
-        self.assertEqual(calls[0]["arguments"], '{"location": "Paris"}')
-        self.assertEqual(calls[0]["index"], 0)
-        self.assertNotIn("<tool_call>", remaining)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/backend/python/common/parent_watch.py
+++ b/backend/python/common/parent_watch.py
@@ -1,149 +0,0 @@
-"""Parent-death watcher (best-effort backstop) for LocalAI Python backends.
-
-LocalAI spawns each backend as a child process and, on a clean shutdown, tears
-it down itself (SIGTERM -> grace -> SIGKILL). That graceful path only runs when
-LocalAI receives a catchable signal and lives long enough to run its handlers.
-If LocalAI is SIGKILLed (e.g. a supervising process's grace period elapses
-first), that teardown never runs and this backend would be reparented to init
-and linger, holding GPU/VRAM and its listen port.
-
-The watcher here is a best-effort backstop for exactly that case: it does NOT
-replace the graceful teardown, it only covers the "parent vanished without
-cleaning up" path. It detects reparenting: when the process that spawned this
-backend dies, the kernel reparents us to the nearest sub-reaper or to init
-(PID 1), so os.getppid() stops matching the value captured at startup. This
-getppid() approach is portable across Linux/macOS (unlike the Linux-only
-PR_SET_PDEATHSIG), which is why it is used here, mirroring the Go backends'
-pkg/grpc/parentwatch.go and the C++ backends' parent_watch.h. It is disabled on
-Windows, which has no equivalent orphan-reparenting semantics.
-
-Env vars (shared verbatim across the Go, C++ and Python backends):
-  LOCALAI_BACKEND_PARENT_WATCH           enabled by default; a falsey value
-                                         ("false"/"0"/"no"/"off", case-insensitive)
-                                         disables it.
-  LOCALAI_BACKEND_PARENT_WATCH_INTERVAL  poll interval as a Go-style duration
-                                         string ("500ms", "2s", "1m") or a bare
-                                         number of seconds. Defaults to 2s.
-"""
-
-import os
-import sys
-import threading
-
-ENV_PARENT_WATCH = "LOCALAI_BACKEND_PARENT_WATCH"
-ENV_PARENT_WATCH_INTERVAL = "LOCALAI_BACKEND_PARENT_WATCH_INTERVAL"
-
-_DEFAULT_INTERVAL_SECONDS = 2.0
-
-# Guard so repeated calls (e.g. get_auth_interceptors invoked more than once)
-# only ever arm a single watcher thread per process.
-_started = False
-_started_lock = threading.Lock()
-
-
-def _enabled():
-    """Report whether the watcher should run in this process."""
-    # Windows does not reparent orphans to a well-known init PID, so the
-    # getppid() heuristic used here doesn't apply there.
-    if os.name == "nt" or sys.platform.startswith("win"):
-        return False
-    val = os.environ.get(ENV_PARENT_WATCH, "").strip().lower()
-    if val in ("false", "0", "no", "off"):
-        return False
-    return True
-
-
-def _interval_seconds():
-    """Return the configured poll interval in seconds, or the default.
-
-    Accepts Go-style duration strings ("500ms", "2s", "1m") for cross-language
-    parity, or a bare number interpreted as seconds.
-    """
-    raw = os.environ.get(ENV_PARENT_WATCH_INTERVAL, "").strip()
-    if not raw:
-        return _DEFAULT_INTERVAL_SECONDS
-    # Split numeric prefix from unit suffix.
-    i = 0
-    while i < len(raw) and (raw[i].isdigit() or raw[i] == "." or (i == 0 and raw[i] in "+-")):
-        i += 1
-    if i == 0:
-        return _DEFAULT_INTERVAL_SECONDS
-    try:
-        num = float(raw[:i])
-    except ValueError:
-        return _DEFAULT_INTERVAL_SECONDS
-    unit = raw[i:].lower()
-    if unit == "ms":
-        seconds = num / 1000.0
-    elif unit in ("s", ""):
-        seconds = num
-    elif unit == "m":
-        seconds = num * 60.0
-    else:
-        return _DEFAULT_INTERVAL_SECONDS
-    return seconds if seconds > 0 else _DEFAULT_INTERVAL_SECONDS
-
-
-def _parent_died(orig_ppid):
-    """Report whether this process has been reparented away from orig_ppid.
-
-    Reparenting is the standard POSIX signal that the original parent (here, the
-    LocalAI process that spawned this backend) has exited: the orphan is handed
-    to the nearest sub-reaper or to init (PID 1), so os.getppid() no longer
-    matches the value captured at startup.
-    """
-    ppid = os.getppid()
-    return ppid != orig_ppid or ppid == 1
-
-
-def _watch(orig_ppid, interval, on_death):
-    """Poll until _parent_died reports the original parent is gone, then call
-    on_death. Blocks, so run it on its own (daemon) thread."""
-    import time
-
-    while True:
-        time.sleep(interval)
-        if _parent_died(orig_ppid):
-            on_death()
-            return
-
-
-def start_parent_death_watcher():
-    """Install the best-effort safety net described in this module's docstring.
-
-    No-op when disabled, on Windows, when already orphaned at startup
-    (os.getppid() <= 1), or if already started. This is a backstop alongside —
-    never a replacement for — LocalAI's graceful teardown.
-    """
-    global _started
-    if not _enabled():
-        return
-    with _started_lock:
-        if _started:
-            return
-        orig_ppid = os.getppid()
-        # A parent of 1 (or less) at startup means we were already orphaned (or
-        # launched directly under init) — there is no original parent to watch.
-        if orig_ppid <= 1:
-            return
-        interval = _interval_seconds()
-
-        def on_death():
-            print(
-                "backend parent process (pid {}) exited without stopping this "
-                "backend; self-terminating to avoid orphaning".format(orig_ppid),
-                file=sys.stderr,
-                flush=True,
-            )
-            # Immediate, non-cleanup exit: this is a shutdown safety net and the
-            # normal graceful path is already gone.
-            os._exit(1)
-
-        thread = threading.Thread(
-            target=_watch,
-            args=(orig_ppid, interval, on_death),
-            name="parent-death-watcher",
-            daemon=True,
-        )
-        thread.start()
-        _started = True
--- a/backend/python/common/parent_watch_test.py
+++ b/backend/python/common/parent_watch_test.py
@@ -1,150 +0,0 @@
-"""Unit tests for the parent-death watcher (parent_watch.py).
-
-Run standalone (Python standard library only, no backend venv needed):
-    python3 -m unittest parent_watch_test
-
-The core test (test_detects_reparent) builds a genuine two-level process tree
-(test -> middle -> grandchild) with os.fork, lets the middle process die, and
-asserts the grandchild's parent_watch._watch detects the reparenting and
-self-terminates — mirroring the Go test in pkg/grpc/parentwatch_test.go and the
-C++ test in backend/cpp/llama-cpp/parent_watch_test.cpp.
-"""
-
-import os
-import sys
-import tempfile
-import threading
-import time
-import unittest
-
-import parent_watch
-
-
-class TestParentWatchEnvParsing(unittest.TestCase):
-    def setUp(self):
-        self._saved = {
-            k: os.environ.get(k)
-            for k in (parent_watch.ENV_PARENT_WATCH, parent_watch.ENV_PARENT_WATCH_INTERVAL)
-        }
-        for k in self._saved:
-            os.environ.pop(k, None)
-
-    def tearDown(self):
-        for k, v in self._saved.items():
-            if v is None:
-                os.environ.pop(k, None)
-            else:
-                os.environ[k] = v
-
-    def test_interval_default(self):
-        self.assertEqual(parent_watch._interval_seconds(), 2.0)
-
-    def test_interval_units(self):
-        cases = {"500ms": 0.5, "2s": 2.0, "1m": 60.0, "3": 3.0, "0.5s": 0.5}
-        for raw, expected in cases.items():
-            os.environ[parent_watch.ENV_PARENT_WATCH_INTERVAL] = raw
-            self.assertAlmostEqual(parent_watch._interval_seconds(), expected, msg=raw)
-
-    def test_interval_garbage_falls_back(self):
-        os.environ[parent_watch.ENV_PARENT_WATCH_INTERVAL] = "garbage"
-        self.assertEqual(parent_watch._interval_seconds(), 2.0)
-
-    @unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only")
-    def test_enabled_default(self):
-        self.assertTrue(parent_watch._enabled())
-
-    @unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only")
-    def test_disabled_by_falsey(self):
-        for val in ("false", "0", "no", "off", "OFF", " False "):
-            os.environ[parent_watch.ENV_PARENT_WATCH] = val
-            self.assertFalse(parent_watch._enabled(), msg=val)
-
-    @unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only")
-    def test_enabled_by_truthy(self):
-        for val in ("true", "1", "yes", "on"):
-            os.environ[parent_watch.ENV_PARENT_WATCH] = val
-            self.assertTrue(parent_watch._enabled(), msg=val)
-
-
-@unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "fork/reparent is POSIX only")
-class TestParentWatchReparent(unittest.TestCase):
-    def _wait_for_file(self, path, timeout=10.0):
-        deadline = time.time() + timeout
-        while time.time() < deadline:
-            if os.path.exists(path):
-                return True
-            time.sleep(0.02)
-        return False
-
-    def test_detects_reparent(self):
-        tmpdir = tempfile.mkdtemp(prefix="parentwatch_test_")
-        ready_file = os.path.join(tmpdir, "ready")
-        exited_file = os.path.join(tmpdir, "exited")
-
-        middle = os.fork()
-        if middle == 0:
-            # ---- middle process ----
-            grandchild = os.fork()
-            if grandchild == 0:
-                # ---- grandchild process: arm the REAL watcher against middle ----
-                orig_ppid = os.getppid()
-
-                def on_death():
-                    with open(exited_file, "w") as f:
-                        f.write("1")
-                    os._exit(7)
-
-                threading.Thread(
-                    target=parent_watch._watch,
-                    args=(orig_ppid, 0.05, on_death),
-                    daemon=True,
-                ).start()
-
-                # Safety valve: never linger if something goes wrong.
-                def bail():
-                    time.sleep(30)
-                    os._exit(2)
-
-                threading.Thread(target=bail, daemon=True).start()
-
-                # Signal readiness only after the watcher captured orig_ppid.
-                with open(ready_file, "w") as f:
-                    f.write(str(os.getpid()))
-                while True:
-                    time.sleep(1)
-            else:
-                # middle: wait until grandchild is ready, then exit to orphan it.
-                if not self._wait_for_file(ready_file):
-                    os._exit(5)
-                os._exit(0)
-
-        # ---- test (top) process ----
-        os.waitpid(middle, 0)  # reap middle only; grandchild is orphaned
-
-        self.assertTrue(os.path.exists(ready_file), "grandchild never signaled readiness")
-        self.assertTrue(
-            self._wait_for_file(exited_file),
-            "watcher did not detect parent death within timeout",
-        )
-
-        # Best-effort cleanup: kill the grandchild if it somehow survived.
-        try:
-            with open(ready_file) as f:
-                pid = int(f.read().strip())
-            if pid > 1:
-                os.kill(pid, 9)
-        except (OSError, ValueError):
-            pass
-        for p in (ready_file, exited_file):
-            try:
-                os.remove(p)
-            except OSError:
-                pass
-        try:
-            os.rmdir(tmpdir)
-        except OSError:
-            pass
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/backend/python/common/python_utils.py
+++ b/backend/python/common/python_utils.py
@@ -58,18 +58,7 @@ def messages_to_dicts(proto_messages):
            d["reasoning_content"] = msg.reasoning_content
        if msg.tool_calls:
            try:
-                tool_calls = json.loads(msg.tool_calls)
-                # Chat templates (e.g. Qwen) iterate function.arguments as a
-                # mapping, but the OpenAI wire format carries it as a JSON
-                # string — decode it back so the template's .items() works.
-                for tc in tool_calls:
-                    fn = tc.get("function") if isinstance(tc, dict) else None
-                    if isinstance(fn, dict) and isinstance(fn.get("arguments"), str):
-                        try:
-                            fn["arguments"] = json.loads(fn["arguments"])
-                        except json.JSONDecodeError:
-                            pass
-                d["tool_calls"] = tool_calls
+                d["tool_calls"] = json.loads(msg.tool_calls)
            except json.JSONDecodeError:
                pass
        result.append(d)
--- a/backend/python/common/python_utils_test.py
+++ b/backend/python/common/python_utils_test.py
@@ -1,122 +0,0 @@
-"""Unit tests for the shared python backend helpers (python_utils.py).
-
-Run standalone (Python standard library only, no backend venv needed):
-    python3 -m unittest python_utils_test
-
-These mirror the server-less helper tests in backend/python/mlx/test.py
-(TestSharedHelpers), but live here so they run on any platform: the mlx
-test module imports grpc/backend_pb2 at import time and needs the MLX venv,
-whereas python_utils has no third-party dependency. Proto Message objects
-are faked with types.SimpleNamespace (real proto fields default to "").
-"""
-
-import json
-import types
-import unittest
-
-from python_utils import messages_to_dicts, parse_options
-
-
-def _msg(**fields):
-    """Fake a proto Message: every unset field is the empty string, as protobuf."""
-    defaults = {
-        "role": "",
-        "content": "",
-        "name": "",
-        "tool_call_id": "",
-        "reasoning_content": "",
-        "tool_calls": "",
-    }
-    defaults.update(fields)
-    return types.SimpleNamespace(**defaults)
-
-
-class TestParseOptions(unittest.TestCase):
-    def test_type_inference(self):
-        opts = parse_options(
-            ["temperature:0.7", "max_tokens:128", "trust:true", "name:hello", "no_colon_skipped"]
-        )
-        self.assertEqual(opts["temperature"], 0.7)
-        self.assertEqual(opts["max_tokens"], 128)
-        self.assertIs(opts["trust"], True)
-        self.assertEqual(opts["name"], "hello")
-        self.assertNotIn("no_colon_skipped", opts)
-
-
-class TestMessagesToDicts(unittest.TestCase):
-    def test_basic_fields(self):
-        out = messages_to_dicts(
-            [
-                _msg(role="user", content="hi"),
-                _msg(role="tool", content="42", tool_call_id="call_1", name="f"),
-            ]
-        )
-        self.assertEqual(out[0], {"role": "user", "content": "hi"})
-        self.assertEqual(out[1]["tool_call_id"], "call_1")
-        self.assertEqual(out[1]["name"], "f")
-
-    def test_tool_call_arguments_string_decoded_to_mapping(self):
-        # OpenAI wire format ships function.arguments as a JSON *string*; chat
-        # templates iterate it as a mapping, so it must come back as a dict.
-        out = messages_to_dicts(
-            [
-                _msg(
-                    role="assistant",
-                    tool_calls=json.dumps(
-                        [
-                            {
-                                "id": "call_1",
-                                "type": "function",
-                                "function": {
-                                    "name": "get_weather",
-                                    "arguments": '{"location": "Rome"}',
-                                },
-                            }
-                        ]
-                    ),
-                )
-            ]
-        )
-        args = out[0]["tool_calls"][0]["function"]["arguments"]
-        self.assertEqual(args, {"location": "Rome"})
-        self.assertEqual(dict(args.items()), {"location": "Rome"})
-
-    def test_tool_call_arguments_already_mapping_is_idempotent(self):
-        out = messages_to_dicts(
-            [
-                _msg(
-                    role="assistant",
-                    tool_calls=json.dumps(
-                        [{"function": {"name": "f", "arguments": {"a": 1}}}]
-                    ),
-                )
-            ]
-        )
-        self.assertEqual(out[0]["tool_calls"][0]["function"]["arguments"], {"a": 1})
-
-    def test_tool_call_arguments_invalid_json_left_as_string(self):
-        out = messages_to_dicts(
-            [
-                _msg(
-                    role="assistant",
-                    tool_calls=json.dumps(
-                        [{"function": {"name": "f", "arguments": "not-json"}}]
-                    ),
-                )
-            ]
-        )
-        self.assertEqual(out[0]["tool_calls"][0]["function"]["arguments"], "not-json")
-
-    def test_tool_call_without_function_key(self):
-        out = messages_to_dicts(
-            [_msg(role="assistant", tool_calls=json.dumps([{"id": "call_1"}]))]
-        )
-        self.assertEqual(out[0]["tool_calls"], [{"id": "call_1"}])
-
-    def test_tool_calls_invalid_json_dropped(self):
-        out = messages_to_dicts([_msg(role="assistant", tool_calls="{not json")])
-        self.assertNotIn("tool_calls", out[0])
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/backend/python/fish-speech/install.sh
+++ b/backend/python/fish-speech/install.sh
@@ -13,17 +13,6 @@ fi
 # fish-speech uses pyrootutils which requires a .project-root marker
 touch "${backend_dir}/.project-root"

-# On darwin arm64 the transitive `tokenizers` dep compiles its Rust extension
-# from source (Linux uses prebuilt manylinux wheels, so it never compiles
-# there). The pinned tokenizers crate that fish-speech's stack resolves to
-# contains a `&T` -> `&mut T` cast that trips the now-deny-by-default
-# `invalid_reference_casting` lint in the macOS runner's newer Rust toolchain,
-# breaking the build (seen in the v4.5.5 release CI fish-speech darwin/metal
-# job). Allow that lint so the unchanged third-party crate compiles as before.
-# Append rather than clobber any pre-existing RUSTFLAGS; harmless on Linux
-# where no Rust compile happens.
-export RUSTFLAGS="${RUSTFLAGS:-} -A invalid_reference_casting"
-
 installRequirements

 # Clone fish-speech source (the pip package doesn't include inference modules)
--- a/backend/python/fish-speech/requirements-mps.txt
+++ b/backend/python/fish-speech/requirements-mps.txt
@@ -0,0 +1,2 @@
+torch
+torchaudio
--- a/backend/python/kokoro/requirements.txt
+++ b/backend/python/kokoro/requirements.txt
@@ -3,5 +3,4 @@ protobuf
 certifi
 packaging==24.1
 pip
-chardet
-click
+chardet
--- a/backend/python/llama-cpp-quantization/requirements-cpu.txt
+++ b/backend/python/llama-cpp-quantization/requirements-cpu.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.10.0
+torch==2.12.0+cpu
 transformers>=4.56.2
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/llama-cpp-quantization/requirements-mps.txt
+++ b/backend/python/llama-cpp-quantization/requirements-mps.txt
@@ -1,4 +1,4 @@
-torch==2.10.0
+torch==2.12.0+cpu
 transformers>=4.56.2
 huggingface-hub>=1.3.0
 sentencepiece
--- a/backend/python/sglang/backend.py
+++ b/backend/python/sglang/backend.py
@@ -147,25 +147,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                d["reasoning_content"] = msg.reasoning_content
            if msg.tool_calls:
                try:
-                    tool_calls = json.loads(msg.tool_calls)
+                    d["tool_calls"] = json.loads(msg.tool_calls)
                except json.JSONDecodeError:
                    pass
-                else:
-                    # OpenAI wire format carries function.arguments as a
-                    # JSON-encoded string, but chat templates (e.g. Qwen3)
-                    # iterate over it as a mapping. The vllm backend
-                    # already parses arguments before applying the chat
-                    # template (PR #10256); mirror that here so the
-                    # sglang backend works with the same wire format.
-                    if isinstance(tool_calls, list):
-                        for tc in tool_calls:
-                            func = tc.get("function") if isinstance(tc, dict) else None
-                            if isinstance(func, dict) and isinstance(func.get("arguments"), str):
-                                try:
-                                    func["arguments"] = json.loads(func["arguments"])
-                                except json.JSONDecodeError:
-                                    pass
-                    d["tool_calls"] = tool_calls
            result.append(d)
        return result

--- a/backend/python/sglang/requirements-cpu.txt
+++ b/backend/python/sglang/requirements-cpu.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.9.0
+torch==2.12.0+cpu
 torchvision
 torchaudio
 transformers
--- a/backend/python/sglang/requirements-cublas12.txt
+++ b/backend/python/sglang/requirements-cublas12.txt
@@ -6,7 +6,7 @@
 # for cublas12 so uv consults this index alongside PyPI.
 --extra-index-url https://download.pytorch.org/whl/cu128
 accelerate
-torch==2.9.1
+torch==2.12.0+cpu
 torchvision
 torchaudio
 transformers
--- a/backend/python/vllm-omni/requirements-cublas12.txt
+++ b/backend/python/vllm-omni/requirements-cublas12.txt
@@ -1,4 +1,4 @@
 accelerate
-torch==2.7.0
+torch==2.12.0+cu130
 transformers
 bitsandbytes
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -748,12 +748,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # When (A) native streaming ran cleanly, per-delta yields above already
        # delivered everything — do NOT extract again on the full text or we'd
        # duplicate content/tool_calls into the final chunk.
-        # NOTE: `native_streaming` is a capability flag ("streaming parser is
-        # available"), not a state flag ("streaming actually ran"). For
-        # non-streaming requests it is still True but the per-delta loop was
-        # never entered, so we MUST still run extract_tool_calls here. Hence
-        # the explicit `streaming and …` guard on both branches.
-        if has_tool_parser and not (streaming and native_streaming and not native_streaming_error):
+        if has_tool_parser and not (native_streaming and not native_streaming_error):
            try:
                tp = tp_instance
                if tp is None:
@@ -775,7 +770,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        ))
            except Exception as e:
                print(f"Tool parser error: {e}", file=sys.stderr)
-        elif streaming and native_streaming and not native_streaming_error:
+        elif native_streaming and not native_streaming_error:
            # Per-delta path already emitted content + tool_calls; the final
            # chat_delta should carry only metadata (token counts, logprobs).
            content = ""
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -35,21 +35,6 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi

-# AMD ROCm: vLLM ships prebuilt ROCm wheels, but on a DEDICATED index
-# (https://wheels.vllm.ai/rocm/), NOT PyPI, and ONLY for CPython 3.12. On any
-# other Python the installer silently falls back to the CUDA-only PyPI wheel,
-# which is unusable on an AMD GPU (import fails, so the backend never finds the
-# vllm module). Force Python 3.12 before the venv is created (matches the
-# intel/l4t13 cp312 bump); the hipblas branch below pulls vllm from the ROCm
-# wheel index. unsafe-best-match lets uv consult that index and PyPI together.
-# https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html?device=rocm
-if [ "x${BUILD_TYPE}" == "xhipblas" ]; then
-    PYTHON_VERSION="3.12"
-    PYTHON_PATCH="12"
-    PY_STANDALONE_TAG="20251120"
-    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
-fi
-
 # cublas13 pulls the vLLM wheel from a per-tag cu130 index (PyPI's vllm wheel
 # is built against CUDA 12 and won't load on cu130). uv's default per-package
 # first-match strategy would still pick the PyPI wheel, so allow it to consult
@@ -119,7 +104,7 @@ if [ "$(uname -s)" = "Darwin" ]; then
    # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
    # vllm pin (requirements-cublas13-after.txt, bumped independently against
    # vllm/vllm) until vllm-metal supports a newer vLLM.
-    VLLM_METAL_VERSION="v0.3.0.dev20260701212152"
+    VLLM_METAL_VERSION="v0.3.0.dev20260622062346"

    # The coupled vLLM source version is whatever this vllm-metal release builds
    # against -- it declares it in its own installer as `vllm_v=`. Derive it from
@@ -209,22 +194,6 @@ elif [ "x${BUILD_TYPE}" == "xintel" ]; then
        export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH:-}"
        VLLM_TARGET_DEVICE=xpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
    popd
-# AMD ROCm: install vllm from its dedicated ROCm wheel index instead of the
-# CUDA-only PyPI wheel. installRequirements brings the base ROCm
-# torch/transformers (requirements-hipblas.txt), then we pull vllm (plus the
-# matching ROCm torch, via --upgrade) from wheels.vllm.ai/rocm. This is the
-# method upstream prescribes for AMD; the Python-3.12 pin is set above.
-# There is intentionally no requirements-hipblas-after.txt: a bare `vllm`
-# there would resolve to the CUDA wheel, and installRequirements never loads
-# a ${BUILD_TYPE}-after file for hipblas anyway (BUILD_TYPE == BUILD_PROFILE).
-# https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html?device=rocm
-elif [ "x${BUILD_TYPE}" == "xhipblas" ]; then
-    installRequirements
-
-    # --upgrade reconciles the base ROCm torch to whatever the vllm ROCm wheel
-    # pins; --extra-index-url adds the ROCm wheel repository on top of PyPI.
-    uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} \
-        --extra-index-url https://wheels.vllm.ai/rocm/ --upgrade vllm
 # FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
 # requirements-cpu-after.txt and compiles vllm locally against the host's
 # actual CPU. Not used by default because it takes ~30-40 minutes, but
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -3,8 +3,8 @@
 # on a cu130 host. Pull the cu130-flavoured wheel from vLLM's per-tag index
 # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
 # so uv consults this index alongside PyPI.
--extra-index-url https://wheels.vllm.ai/0.24.0/cu130
+--extra-index-url https://wheels.vllm.ai/0.23.0/cu130
 # VERSION COUPLING: darwin/Apple-Silicon builds use vllm-metal (see install.sh),
 # which pins this exact vLLM version. Bumping vllm here means coordinating with a
 # vllm-metal release that supports the new version, or macOS/Metal builds break.
-vllm==0.24.0
+vllm==0.23.0
--- a/backend/python/vllm/requirements-hipblas-after.txt
+++ b/backend/python/vllm/requirements-hipblas-after.txt
@@ -0,0 +1 @@
+vllm
--- a/backend/rust/kokoros/src/service.rs
+++ b/backend/rust/kokoros/src/service.rs
@@ -351,16 +351,6 @@ impl Backend for KokorosService {
        Err(Status::unimplemented("Not supported"))
    }

-    type AudioTranscriptionLiveStream =
-        ReceiverStream<Result<backend::TranscriptLiveResponse, Status>>;
-
-    async fn audio_transcription_live(
-        &self,
-        _: Request<tonic::Streaming<backend::TranscriptLiveRequest>>,
-    ) -> Result<Response<Self::AudioTranscriptionLiveStream>, Status> {
-        Err(Status::unimplemented("Not supported"))
-    }
-
    async fn diarize(
        &self,
        _: Request<backend::DiarizeRequest>,
--- a/cmd/launcher/internal/launcher.go
+++ b/cmd/launcher/internal/launcher.go
@@ -207,20 +207,12 @@ func (l *Launcher) StartLocalAI() error {
 	}

 	// Build command arguments
-	dataPath := l.GetDataPath()
 	args := []string{
 		"run",
 		"--models-path", l.config.ModelsPath,
 		"--backends-path", l.config.BackendsPath,
 		"--address", l.config.Address,
 		"--log-level", l.config.LogLevel,
-		// Keep persistent data and dynamic config under the launcher's data
-		// directory (~/.localai) rather than letting the server resolve them
-		// to ${basepath}/{data,configuration}. ${basepath} expands to the
-		// launcher process's CWD (often the user's home root), which puts
-		// ~/data and ~/configuration outside ~/.localai. See #10610.
-		"--data-path", filepath.Join(dataPath, "data"),
-		"--localai-config-dir", filepath.Join(dataPath, "configuration"),
 	}

 	l.localaiCmd = exec.CommandContext(l.ctx, binaryPath, args...)
@@ -437,7 +429,7 @@ func (l *Launcher) CheckForUpdates() (bool, string, error) {
 }

 // DownloadUpdate downloads the latest version
-func (l *Launcher) DownloadUpdate(version string, progressCallback func(downloaded, total int64)) error {
+func (l *Launcher) DownloadUpdate(version string, progressCallback func(float64)) error {
 	return l.releaseManager.DownloadRelease(version, progressCallback)
 }

@@ -494,6 +486,7 @@ func (l *Launcher) showDownloadLocalAIDialog() {
 	fyne.DoAndWait(func() {
 		// Create a standalone window for the download dialog
 		dialogWindow := l.app.NewWindow("LocalAI Installation Required")
+		dialogWindow.Resize(fyne.NewSize(500, 350))
 		dialogWindow.CenterOnScreen()
 		dialogWindow.SetCloseIntercept(func() {
 			dialogWindow.Close()
@@ -555,7 +548,6 @@ func (l *Launcher) showDownloadLocalAIDialog() {
 		)

 		dialogWindow.SetContent(content)
-		resizeToContent(dialogWindow, content)
 		dialogWindow.Show()
 	})
 }
@@ -629,134 +621,88 @@ func (l *Launcher) showDownloadError(title, message string) {
 }

 // showDownloadProgress shows a standalone progress window for downloading LocalAI
-// after a fresh install (no LocalAI binary present yet).
 func (l *Launcher) showDownloadProgress(version, title string) {
-	l.showDownloadProgressWindow(version, title, func(win fyne.Window) {
-		dialog.ShowConfirm("Installation Complete",
-			"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
-			func(bool) {
-				win.Close()
-				l.updateStatus("LocalAI installed successfully")
-				if l.systray != nil {
-					l.systray.recreateMenu()
-				}
-			}, win)
-	})
-}
-
-// showDownloadProgressWindow renders the download progress popup shared by every
-// "download/upgrade LocalAI" entry point. It owns the progress bar, the
-// human-readable byte readout, resume-aware retry, and content-fit window
-// sizing so the behaviour stays identical everywhere. onSuccess runs (on the UI
-// goroutine) once the download verifies, and is responsible for the success
-// dialog and any follow-up; the window is passed in so it can be parented/closed.
-func (l *Launcher) showDownloadProgressWindow(version, title string, onSuccess func(win fyne.Window)) {
 	fyne.DoAndWait(func() {
+		// Create progress window
 		progressWindow := l.app.NewWindow("Downloading LocalAI")
+		progressWindow.Resize(fyne.NewSize(400, 250))
 		progressWindow.CenterOnScreen()
 		progressWindow.SetCloseIntercept(func() {
 			progressWindow.Close()
 		})

+		// Progress bar
 		progressBar := widget.NewProgressBar()
 		progressBar.SetValue(0)

 		// Status label. Truncate with an ellipsis so a long "Download failed:
 		// <url>" message can't stretch the window (and progress bar) to fit the
-		// whole error on one line.
+		// whole error on one line; the full error is shown in the dialog below.
 		statusLabel := widget.NewLabel("Preparing download...")
 		statusLabel.Truncation = fyne.TextTruncateEllipsis

+		// Release notes button
 		releaseNotesButton := widget.NewButton("View Release Notes", func() {
 			releaseNotesURL, err := l.githubReleaseNotesURL(version)
 			if err != nil {
 				log.Printf("Failed to parse URL: %v", err)
 				return
 			}
+
 			l.app.OpenURL(releaseNotesURL)
 		})

-		// Retry button: hidden until a download fails. GitHub downloads are
-		// flaky, and the underlying download resumes from the partial file, so
-		// a retry continues where it left off rather than starting over.
-		retryButton := widget.NewButton("Retry", nil)
-		retryButton.Importance = widget.HighImportance
-		retryButton.Hide()
-
-		buttonRow := container.NewHBox(releaseNotesButton, retryButton)
-		content := container.NewVBox(
+		// Progress container
+		progressContainer := container.NewVBox(
 			widget.NewLabel(title),
 			progressBar,
 			statusLabel,
 			widget.NewSeparator(),
-			buttonRow,
+			releaseNotesButton,
 		)
-		progressWindow.SetContent(content)
-		resizeToContent(progressWindow, content)
-
-		var startDownload func()
-		startDownload = func() {
-			retryButton.Hide()
-			progressBar.SetValue(0)
-			statusLabel.SetText("Preparing download...")
-			resizeToContent(progressWindow, content)
-
-			go func() {
-				err := l.DownloadUpdate(version, func(downloaded, total int64) {
-					fyne.Do(func() {
-						if total > 0 {
-							progressBar.SetValue(float64(downloaded) / float64(total))
-							statusLabel.SetText(fmt.Sprintf("Downloading… %s / %s", formatBytes(downloaded), formatBytes(total)))
-						} else {
-							statusLabel.SetText(fmt.Sprintf("Downloading… %s", formatBytes(downloaded)))
-						}
-					})
-				})
-
-				fyne.Do(func() {
-					if err != nil {
-						statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
-						retryButton.Show()
-						resizeToContent(progressWindow, content)
-						return
-					}
-					progressBar.SetValue(1.0)
-					statusLabel.SetText("Download complete")
-					onSuccess(progressWindow)
-				})
-			}()
-		}
-		retryButton.OnTapped = startDownload

+		progressWindow.SetContent(progressContainer)
 		progressWindow.Show()
-		startDownload()
+
+		// Start download in background
+		go func() {
+			err := l.DownloadUpdate(version, func(progress float64) {
+				// Update progress bar
+				fyne.Do(func() {
+					progressBar.SetValue(progress)
+					percentage := int(progress * 100)
+					statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
+				})
+			})
+
+			// Handle completion
+			fyne.Do(func() {
+				if err != nil {
+					statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
+					// Show error dialog
+					dialog.ShowError(err, progressWindow)
+				} else {
+					statusLabel.SetText("Download completed successfully!")
+					progressBar.SetValue(1.0)
+
+					// Show success dialog
+					dialog.ShowConfirm("Installation Complete",
+						"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
+						func(close bool) {
+							progressWindow.Close()
+							// Update status and refresh systray menu
+							l.updateStatus("LocalAI installed successfully")
+
+							if l.systray != nil {
+								l.systray.recreateMenu()
+							}
+						}, progressWindow)
+				}
+			})
+		}()
 	})
 }

-// resizeToContent sizes a window to fit its content (with a sane minimum width)
-// so the dialog doesn't show a large blank gap below the last widget.
-func resizeToContent(w fyne.Window, content fyne.CanvasObject) {
-	size := content.MinSize()
-	if size.Width < 400 {
-		size.Width = 400
-	}
-	w.Resize(size)
-}
-
-// formatBytes renders a byte count as a human-readable size (e.g. "12.3 MB").
-func formatBytes(b int64) string {
-	const unit = 1024
-	if b < unit {
-		return fmt.Sprintf("%d B", b)
-	}
-	div, exp := int64(unit), 0
-	for n := b / unit; n >= unit; n /= unit {
-		div *= unit
-		exp++
-	}
-	return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp])
-}
-
 // monitorLogs monitors the output of LocalAI and adds it to the log buffer
 func (l *Launcher) monitorLogs(reader io.Reader, prefix string) {
 	scanner := bufio.NewScanner(reader)
--- a/cmd/launcher/internal/release_manager.go
+++ b/cmd/launcher/internal/release_manager.go
@@ -11,7 +11,6 @@ import (
 	"net/http"
 	"os"
 	"os/exec"
-	"path"
 	"path/filepath"
 	"runtime"
 	"strings"
@@ -51,12 +50,6 @@ type ReleaseManager struct {
 	ChecksumsPath string
 	// MetadataPath is where version metadata is stored
 	MetadataPath string
-	// BaseDownloadURL is the base URL release assets are downloaded from
-	// (defaults to https://github.com; overridable for testing)
-	BaseDownloadURL string
-	// RetryBackoff is the base wait between download attempts; the Nth retry
-	// waits N*RetryBackoff (defaults to 1s; lowered in tests)
-	RetryBackoff time.Duration
 	// HTTPClient is the HTTP client used for downloads
 	HTTPClient *http.Client
 }
@@ -69,94 +62,28 @@ func NewReleaseManager() *ReleaseManager {
 	metadataPath := filepath.Join(homeDir, ".localai", "metadata")

 	return &ReleaseManager{
-		GitHubOwner:     "mudler",
-		GitHubRepo:      "LocalAI",
-		BinaryPath:      binaryPath,
-		CurrentVersion:  internal.PrintableVersion(),
-		ChecksumsPath:   checksumsPath,
-		MetadataPath:    metadataPath,
-		BaseDownloadURL: "https://github.com",
-		RetryBackoff:    1 * time.Second,
-		HTTPClient:      httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
+		GitHubOwner:    "mudler",
+		GitHubRepo:     "LocalAI",
+		BinaryPath:     binaryPath,
+		CurrentVersion: internal.PrintableVersion(),
+		ChecksumsPath:  checksumsPath,
+		MetadataPath:   metadataPath,
+		HTTPClient:     httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
 	}
 }

-// GetLatestRelease resolves the latest LocalAI release.
-//
-// It first follows the github.com "releases/latest" redirect, which reveals the
-// latest tag in the final URL and—crucially—is NOT subject to the
-// 60-requests/hour unauthenticated rate limit of api.github.com. That limit is
-// per-IP, so on shared/NAT/CGNAT/cloud addresses the API returns 403 almost
-// immediately (e.g. on a fresh install with no LocalAI present yet). The
-// redirect avoids that entirely. The richer JSON API is kept only as a fallback.
-//
-// Only the version is consumed by callers, so the redirect's tag is sufficient.
+// GetLatestRelease fetches the latest release information from GitHub
 func (rm *ReleaseManager) GetLatestRelease() (*Release, error) {
-	version, redirectErr := rm.latestVersionFromRedirect()
-	if redirectErr == nil {
-		return &Release{Version: version}, nil
-	}
-	log.Printf("Could not resolve latest version via release redirect (%v); falling back to GitHub API", redirectErr)
-
-	release, apiErr := rm.latestReleaseFromAPI()
-	if apiErr != nil {
-		// Surface both failures so a rate-limited API doesn't mask the (usually
-		// more relevant) redirect error.
-		return nil, fmt.Errorf("failed to fetch latest release: %v (redirect: %v)", apiErr, redirectErr)
-	}
-	return release, nil
-}
-
-// latestVersionFromRedirect returns the latest tag by following the github.com
-// "releases/latest" redirect to ".../releases/tag/<tag>".
-func (rm *ReleaseManager) latestVersionFromRedirect() (string, error) {
-	url := fmt.Sprintf("%s/%s/%s/releases/latest", rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo)
-
-	resp, err := rm.HTTPClient.Get(url)
-	if err != nil {
-		return "", err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		return "", fmt.Errorf("unexpected status %s", resp.Status)
-	}
-
-	// After the redirect is followed, the final request URL is the tag page.
-	version := path.Base(resp.Request.URL.Path)
-	if version == "" || version == "." || version == "latest" {
-		return "", fmt.Errorf("could not determine version from %s", resp.Request.URL.String())
-	}
-	return version, nil
-}
-
-// latestReleaseFromAPI fetches the latest release JSON from api.github.com. This
-// is the fallback path; it is rate-limited unless GITHUB_TOKEN is set.
-func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
 	url := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", rm.GitHubOwner, rm.GitHubRepo)

-	req, err := http.NewRequest(http.MethodGet, url, nil)
-	if err != nil {
-		return nil, err
-	}
-	req.Header.Set("Accept", "application/vnd.github+json")
-	// An optional token lifts the unauthenticated 60/hour limit to 5000/hour.
-	if token := os.Getenv("GITHUB_TOKEN"); token != "" {
-		req.Header.Set("Authorization", "Bearer "+token)
-	}
-
-	resp, err := rm.HTTPClient.Do(req)
+	resp, err := rm.HTTPClient.Get(url)
 	if err != nil {
 		return nil, fmt.Errorf("failed to fetch latest release: %w", err)
 	}
 	defer resp.Body.Close()

 	if resp.StatusCode != http.StatusOK {
-		if (resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusTooManyRequests) &&
-			resp.Header.Get("X-RateLimit-Remaining") == "0" {
-			return nil, fmt.Errorf("GitHub API rate limit exceeded (status %d); retry later or set GITHUB_TOKEN to raise the limit", resp.StatusCode)
-		}
-		return nil, fmt.Errorf("status %d", resp.StatusCode)
+		return nil, fmt.Errorf("failed to fetch latest release: status %d", resp.StatusCode)
 	}

 	// Parse the JSON response properly
@@ -179,7 +106,7 @@ func (rm *ReleaseManager) latestReleaseFromAPI() (*Release, error) {
 }

 // DownloadRelease downloads a specific version of LocalAI
-func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(downloaded, total int64)) error {
+func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(float64)) error {
 	// Ensure the binary directory exists
 	if err := os.MkdirAll(rm.BinaryPath, 0755); err != nil {
 		return fmt.Errorf("failed to create binary directory: %w", err)
@@ -190,16 +117,16 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
 	localPath := filepath.Join(rm.BinaryPath, "local-ai")

 	// Download the binary
-	downloadURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/%s",
-		rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, binaryName)
+	downloadURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/%s",
+		rm.GitHubOwner, rm.GitHubRepo, version, binaryName)

 	if err := rm.downloadFile(downloadURL, localPath, progressCallback); err != nil {
 		return fmt.Errorf("failed to download binary: %w", err)
 	}

 	// Download and verify checksums
-	checksumURL := fmt.Sprintf("%s/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
-		rm.BaseDownloadURL, rm.GitHubOwner, rm.GitHubRepo, version, version)
+	checksumURL := fmt.Sprintf("https://github.com/%s/%s/releases/download/%s/LocalAI-%s-checksums.txt",
+		rm.GitHubOwner, rm.GitHubRepo, version, version)

 	checksumPath := filepath.Join(rm.BinaryPath, "checksums.txt")
 	manualChecksumPath := filepath.Join(rm.ChecksumsPath, fmt.Sprintf("checksums-%s.txt", version))
@@ -227,10 +154,6 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
 	// Verify the checksum if we have a checksum file
 	if _, err := os.Stat(checksumPath); err == nil {
 		if err := rm.VerifyChecksum(localPath, checksumPath, binaryName); err != nil {
-			// Discard the corrupt binary (and any leftover partial) so the next
-			// retry starts from a clean slate rather than resuming corruption.
-			os.Remove(localPath)
-			os.Remove(localPath + ".part")
 			return fmt.Errorf("checksum verification failed: %w", err)
 		}
 		log.Printf("Checksum verification successful")
@@ -273,88 +196,44 @@ func (rm *ReleaseManager) GetBinaryName(version string) string {
 }

 // downloadFile downloads a file from a URL to a local path with optional progress callback
-func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(downloaded, total int64)) error {
+func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(float64)) error {
 	return rm.downloadFileWithRetry(url, filepath, progressCallback, 3)
 }

-// downloadFileWithRetry downloads a file with retry and HTTP Range resume.
-//
-// The body is streamed to "<dest>.part" and only renamed to dest on success, so
-// a dropped connection leaves a partial file that the next attempt continues via
-// a "Range: bytes=N-" request instead of restarting from zero. This matters for
-// GitHub release downloads, which are large and flaky.
-func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallback func(downloaded, total int64), maxRetries int) error {
-	partPath := dest + ".part"
+// downloadFileWithRetry downloads a file from a URL with retry logic
+func (rm *ReleaseManager) downloadFileWithRetry(url, filepath string, progressCallback func(float64), maxRetries int) error {
 	var lastErr error

 	for attempt := 1; attempt <= maxRetries; attempt++ {
 		if attempt > 1 {
 			log.Printf("Retrying download (attempt %d/%d): %s", attempt, maxRetries, url)
-			time.Sleep(time.Duration(attempt) * rm.RetryBackoff)
+			time.Sleep(time.Duration(attempt) * time.Second)
 		}

-		// Resume from however much we already have on disk.
-		var offset int64
-		if fi, err := os.Stat(partPath); err == nil {
-			offset = fi.Size()
-		}
-
-		req, err := http.NewRequest(http.MethodGet, url, nil)
-		if err != nil {
-			return err
-		}
-		if offset > 0 {
-			req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))
-		}
-
-		resp, err := rm.HTTPClient.Do(req)
+		resp, err := rm.HTTPClient.Get(url)
 		if err != nil {
 			lastErr = err
 			continue
 		}

-		switch resp.StatusCode {
-		case http.StatusOK:
-			// Server ignored the Range (or we had nothing): start fresh.
-			offset = 0
-		case http.StatusPartialContent:
-			// Resume: append to the existing partial file.
-		case http.StatusRequestedRangeNotSatisfiable:
-			// Stale or already-complete partial: discard and restart fresh.
-			resp.Body.Close()
-			os.Remove(partPath)
-			lastErr = fmt.Errorf("partial download no longer valid (status %s), restarting", resp.Status)
-			continue
-		default:
+		if resp.StatusCode != http.StatusOK {
 			resp.Body.Close()
 			lastErr = fmt.Errorf("bad status: %s", resp.Status)
 			continue
 		}

-		var out *os.File
-		if offset > 0 {
-			out, err = os.OpenFile(partPath, os.O_WRONLY|os.O_APPEND, 0644)
-		} else {
-			out, err = os.Create(partPath)
-		}
+		out, err := os.Create(filepath)
 		if err != nil {
 			resp.Body.Close()
 			return err
 		}

-		// On a 206 the Content-Length is the remaining bytes, so the full size
-		// is what we already have plus what's still to come.
-		total := resp.ContentLength
-		if offset > 0 && total > 0 {
-			total += offset
-		}
-
+		// Create a progress reader if callback is provided
 		var reader io.Reader = resp.Body
-		if progressCallback != nil && total > 0 {
+		if progressCallback != nil && resp.ContentLength > 0 {
 			reader = &progressReader{
 				Reader:   resp.Body,
-				Total:    total,
-				Current:  offset,
+				Total:    resp.ContentLength,
 				Callback: progressCallback,
 			}
 		}
@@ -364,14 +243,11 @@ func (rm *ReleaseManager) downloadFileWithRetry(url, dest string, progressCallba
 		out.Close()

 		if err != nil {
-			// Keep the partial file so the next attempt can resume from it.
 			lastErr = err
+			os.Remove(filepath)
 			continue
 		}

-		if err := os.Rename(partPath, dest); err != nil {
-			return err
-		}
 		return nil
 	}

@@ -446,21 +322,20 @@ func (rm *ReleaseManager) saveVersionMetadata(version string) error {
 	return nil
 }

-// progressReader wraps an io.Reader to provide download progress as a
-// (downloaded, total) byte count so callers can render both a progress bar and
-// a human-readable size.
+// progressReader wraps an io.Reader to provide download progress
 type progressReader struct {
 	io.Reader
 	Total    int64
 	Current  int64
-	Callback func(downloaded, total int64)
+	Callback func(float64)
 }

 func (pr *progressReader) Read(p []byte) (int, error) {
 	n, err := pr.Reader.Read(p)
 	pr.Current += int64(n)
 	if pr.Callback != nil {
-		pr.Callback(pr.Current, pr.Total)
+		progress := float64(pr.Current) / float64(pr.Total)
+		pr.Callback(progress)
 	}
 	return n, err
 }
--- a/cmd/launcher/internal/release_manager_test.go
+++ b/cmd/launcher/internal/release_manager_test.go
@@ -1,17 +1,9 @@
 package launcher_test

 import (
-	"crypto/sha256"
-	"encoding/hex"
-	"fmt"
-	"net/http"
-	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"runtime"
-	"strconv"
-	"strings"
-	"sync"
 	"time"

 	. "github.com/onsi/ginkgo/v2"
@@ -186,221 +178,4 @@ var _ = Describe("ReleaseManager", func() {
 			Expect(err.Error()).To(ContainSubstring("checksum not found"))
 		})
 	})
-
-	Describe("DownloadRelease resume and retry", func() {
-		var (
-			version    string
-			binaryName string
-			content    []byte
-			checksums  string
-			finalPath  string
-			partPath   string
-		)
-
-		BeforeEach(func() {
-			version = "v9.9.9"
-			binaryName = rm.GetBinaryName(version)
-
-			// Deterministic, non-trivial content so resume/append bugs surface.
-			content = make([]byte, 4096)
-			for i := range content {
-				content[i] = byte(i % 251)
-			}
-			sum := sha256.Sum256(content)
-			checksums = fmt.Sprintf("%s  %s\n", hex.EncodeToString(sum[:]), binaryName)
-
-			finalPath = filepath.Join(tempDir, "local-ai")
-			partPath = finalPath + ".part"
-
-			// Isolate the persistent checksum/metadata dirs to the temp dir so
-			// the test never touches the real ~/.localai and existing checksum
-			// files don't short-circuit the download.
-			rm.ChecksumsPath = filepath.Join(tempDir, "checksums")
-			rm.MetadataPath = filepath.Join(tempDir, "metadata")
-			rm.GitHubOwner = "owner"
-			rm.GitHubRepo = "repo"
-			rm.RetryBackoff = time.Millisecond
-
-			Expect(os.MkdirAll(tempDir, 0755)).To(Succeed())
-		})
-
-		It("resumes from a partial .part file using a Range request", func() {
-			Expect(os.WriteFile(partPath, content[:1024], 0644)).To(Succeed())
-
-			var mu sync.Mutex
-			sawRange := false
-			binBytesServed := 0
-
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				if rangeHdr := r.Header.Get("Range"); rangeHdr != "" {
-					var start int
-					_, _ = fmt.Sscanf(rangeHdr, "bytes=%d-", &start)
-					mu.Lock()
-					sawRange = true
-					mu.Unlock()
-					w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, len(content)-1, len(content)))
-					w.WriteHeader(http.StatusPartialContent)
-					n, _ := w.Write(content[start:])
-					mu.Lock()
-					binBytesServed += n
-					mu.Unlock()
-					return
-				}
-				w.WriteHeader(http.StatusOK)
-				n, _ := w.Write(content)
-				mu.Lock()
-				binBytesServed += n
-				mu.Unlock()
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			err := rm.DownloadRelease(version, nil)
-			Expect(err).ToNot(HaveOccurred())
-
-			got, err := os.ReadFile(finalPath)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(got).To(Equal(content))
-			Expect(sawRange).To(BeTrue(), "expected the download to resume with a Range request")
-			Expect(binBytesServed).To(Equal(len(content)-1024), "expected only the remaining bytes to be served")
-			Expect(partPath).ToNot(BeAnExistingFile())
-		})
-
-		It("starts fresh when the server ignores the Range header (200)", func() {
-			// A stale/garbage partial that must NOT be appended to.
-			Expect(os.WriteFile(partPath, []byte("garbage-garbage-garbage"), 0644)).To(Succeed())
-
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				// Ignore any Range and always serve the full body.
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write(content)
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			err := rm.DownloadRelease(version, nil)
-			Expect(err).ToNot(HaveOccurred())
-
-			got, err := os.ReadFile(finalPath)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(got).To(Equal(content))
-		})
-
-		It("restarts the download when the partial is stale (416)", func() {
-			// Oversized partial -> requested Range start is beyond the content.
-			Expect(os.WriteFile(partPath, make([]byte, len(content)+10), 0644)).To(Succeed())
-
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				if rangeHdr := r.Header.Get("Range"); rangeHdr != "" {
-					var start int
-					_, _ = fmt.Sscanf(rangeHdr, "bytes=%d-", &start)
-					if start >= len(content) {
-						w.WriteHeader(http.StatusRequestedRangeNotSatisfiable)
-						return
-					}
-					w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, len(content)-1, len(content)))
-					w.WriteHeader(http.StatusPartialContent)
-					_, _ = w.Write(content[start:])
-					return
-				}
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write(content)
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			err := rm.DownloadRelease(version, nil)
-			Expect(err).ToNot(HaveOccurred())
-
-			got, err := os.ReadFile(finalPath)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(got).To(Equal(content))
-		})
-
-		It("removes the downloaded file when checksum verification fails", func() {
-			bad := []byte("this is definitely not the expected binary content")
-
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					// Checksums are for `content`, but we serve `bad`.
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write(bad)
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			err := rm.DownloadRelease(version, nil)
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("checksum"))
-			Expect(finalPath).ToNot(BeAnExistingFile())
-			Expect(partPath).ToNot(BeAnExistingFile())
-		})
-
-		It("reports progress as downloaded and total byte counts", func() {
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if strings.HasSuffix(r.URL.Path, "checksums.txt") {
-					_, _ = w.Write([]byte(checksums))
-					return
-				}
-				w.Header().Set("Content-Length", strconv.Itoa(len(content)))
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write(content)
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-
-			var mu sync.Mutex
-			var lastDownloaded, lastTotal int64
-			err := rm.DownloadRelease(version, func(downloaded, total int64) {
-				mu.Lock()
-				lastDownloaded = downloaded
-				lastTotal = total
-				mu.Unlock()
-			})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(lastTotal).To(Equal(int64(len(content))))
-			Expect(lastDownloaded).To(Equal(int64(len(content))))
-		})
-	})
-
-	Describe("GetLatestRelease", func() {
-		It("resolves the latest version from the releases/latest redirect", func() {
-			// The github.com redirect path must be preferred over the
-			// rate-limited api.github.com, so a working redirect yields the tag
-			// without ever needing the API.
-			srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				switch {
-				case strings.HasSuffix(r.URL.Path, "/releases/latest"):
-					http.Redirect(w, r, "/owner/repo/releases/tag/v9.9.9", http.StatusFound)
-				case strings.HasSuffix(r.URL.Path, "/releases/tag/v9.9.9"):
-					w.WriteHeader(http.StatusOK)
-				default:
-					w.WriteHeader(http.StatusNotFound)
-				}
-			}))
-			defer srv.Close()
-			rm.BaseDownloadURL = srv.URL
-			rm.GitHubOwner = "owner"
-			rm.GitHubRepo = "repo"
-
-			release, err := rm.GetLatestRelease()
-			Expect(err).ToNot(HaveOccurred())
-			Expect(release.Version).To(Equal("v9.9.9"))
-		})
-	})
 })
--- a/cmd/launcher/internal/systray_manager.go
+++ b/cmd/launcher/internal/systray_manager.go
@@ -443,23 +443,84 @@ func (sm *SystrayManager) showStartupErrorDialog(err error) {
 	})
 }

-// showDownloadProgress shows a progress window for downloading updates. The
-// progress UI (byte readout, resume-aware retry, sizing) is shared with the
-// other download entry points via the launcher; only the post-success behaviour
-// (restart prompt + systray refresh) is specific to the update flow.
+// showDownloadProgress shows a progress window for downloading updates
 func (sm *SystrayManager) showDownloadProgress(version string) {
-	sm.launcher.showDownloadProgressWindow(version, fmt.Sprintf("Downloading LocalAI version %s", version), func(win fyne.Window) {
-		dialog.ShowConfirm("Update Downloaded",
-			"LocalAI has been updated successfully. Please restart the launcher to use the new version.",
-			func(restart bool) {
-				if restart {
-					sm.app.Quit()
-				}
-				win.Close()
-			}, win)
+	// Create a new window for download progress
+	progressWindow := sm.app.NewWindow("Downloading LocalAI Update")
+	progressWindow.Resize(fyne.NewSize(400, 250))
+	progressWindow.CenterOnScreen()

-		sm.hasUpdateAvailable = false
-		sm.latestVersion = ""
-		sm.recreateMenu()
+	// Progress bar
+	progressBar := widget.NewProgressBar()
+	progressBar.SetValue(0)
+
+	// Status label. Truncate with an ellipsis so a long "Download failed:
+	// <url>" message can't stretch the window (and progress bar) to fit the
+	// whole error on one line; the full error is shown in the dialog below.
+	statusLabel := widget.NewLabel("Preparing download...")
+	statusLabel.Truncation = fyne.TextTruncateEllipsis
+
+	// Release notes button
+	releaseNotesButton := widget.NewButton("View Release Notes", func() {
+		releaseNotesURL, err := sm.launcher.githubReleaseNotesURL(version)
+		if err != nil {
+			log.Printf("Failed to parse URL: %v", err)
+			return
+		}
+
+		sm.app.OpenURL(releaseNotesURL)
 	})
+
+	// Progress container
+	progressContainer := container.NewVBox(
+		widget.NewLabel(fmt.Sprintf("Downloading LocalAI version %s", version)),
+		progressBar,
+		statusLabel,
+		widget.NewSeparator(),
+		releaseNotesButton,
+	)
+
+	progressWindow.SetContent(progressContainer)
+	progressWindow.Show()
+
+	// Start download in background
+	go func() {
+		err := sm.launcher.DownloadUpdate(version, func(progress float64) {
+			// Update progress bar
+			fyne.Do(func() {
+				progressBar.SetValue(progress)
+				percentage := int(progress * 100)
+				statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
+			})
+		})
+
+		// Handle completion
+		fyne.Do(func() {
+			if err != nil {
+				statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
+				// Show error dialog
+				dialog.ShowError(err, progressWindow)
+			} else {
+				statusLabel.SetText("Download completed successfully!")
+				progressBar.SetValue(1.0)
+
+				// Show restart dialog
+				dialog.ShowConfirm("Update Downloaded",
+					"LocalAI has been updated successfully. Please restart the launcher to use the new version.",
+					func(restart bool) {
+						if restart {
+							sm.app.Quit()
+						}
+						progressWindow.Close()
+					}, progressWindow)
+			}
+		})
+
+		// Update systray menu
+		if err == nil {
+			sm.hasUpdateAvailable = false
+			sm.latestVersion = ""
+			sm.recreateMenu()
+		}
+	}()
 }
--- a/cmd/launcher/internal/ui.go
+++ b/cmd/launcher/internal/ui.go
@@ -490,19 +490,14 @@ func (ui *LauncherUI) downloadUpdate() {
 	ui.UpdateStatus("Downloading update " + version + "...")

 	go func() {
-		err := ui.launcher.DownloadUpdate(version, func(downloaded, total int64) {
+		err := ui.launcher.DownloadUpdate(version, func(progress float64) {
+			// Update progress bar
 			fyne.Do(func() {
-				if total > 0 {
-					ui.progressBar.SetValue(float64(downloaded) / float64(total))
-				}
+				ui.progressBar.SetValue(progress)
 			})
-			// The progress bar already shows the percentage, so report the
-			// human-readable size here instead of repeating the percent.
-			if total > 0 {
-				ui.UpdateStatus(fmt.Sprintf("Downloading update %s… %s / %s", version, formatBytes(downloaded), formatBytes(total)))
-			} else {
-				ui.UpdateStatus(fmt.Sprintf("Downloading update %s… %s", version, formatBytes(downloaded)))
-			}
+			// Update status with percentage
+			percentage := int(progress * 100)
+			ui.UpdateStatus(fmt.Sprintf("Downloading update %s... %d%%", version, percentage))
 		})

 		fyne.Do(func() {
@@ -603,6 +598,82 @@ func (ui *LauncherUI) LoadConfiguration() {
 	log.Printf("UI LoadConfiguration: configuration loaded successfully")
 }

+// showDownloadProgress shows a progress window for downloading LocalAI
+func (ui *LauncherUI) showDownloadProgress(version, title string) {
+	fyne.DoAndWait(func() {
+		// Create progress window using the launcher's app
+		progressWindow := ui.launcher.app.NewWindow("Downloading LocalAI")
+		progressWindow.Resize(fyne.NewSize(400, 250))
+		progressWindow.CenterOnScreen()
+
+		// Progress bar
+		progressBar := widget.NewProgressBar()
+		progressBar.SetValue(0)
+
+		// Status label. Truncate with an ellipsis so a long "Download failed:
+		// <url>" message can't stretch the window (and progress bar) to fit the
+		// whole error on one line; the full error is shown in the dialog below.
+		statusLabel := widget.NewLabel("Preparing download...")
+		statusLabel.Truncation = fyne.TextTruncateEllipsis
+
+		// Release notes button
+		releaseNotesButton := widget.NewButton("View Release Notes", func() {
+			releaseNotesURL, err := ui.launcher.githubReleaseNotesURL(version)
+			if err != nil {
+				log.Printf("Failed to parse URL: %v", err)
+				return
+			}
+
+			ui.launcher.app.OpenURL(releaseNotesURL)
+		})
+
+		// Progress container
+		progressContainer := container.NewVBox(
+			widget.NewLabel(title),
+			progressBar,
+			statusLabel,
+			widget.NewSeparator(),
+			releaseNotesButton,
+		)
+
+		progressWindow.SetContent(progressContainer)
+		progressWindow.Show()
+
+		// Start download in background
+		go func() {
+			err := ui.launcher.DownloadUpdate(version, func(progress float64) {
+				// Update progress bar
+				fyne.Do(func() {
+					progressBar.SetValue(progress)
+					percentage := int(progress * 100)
+					statusLabel.SetText(fmt.Sprintf("Downloading... %d%%", percentage))
+				})
+			})
+
+			// Handle completion
+			fyne.Do(func() {
+				if err != nil {
+					statusLabel.SetText(fmt.Sprintf("Download failed: %v", err))
+					// Show error dialog
+					dialog.ShowError(err, progressWindow)
+				} else {
+					statusLabel.SetText("Download completed successfully!")
+					progressBar.SetValue(1.0)
+
+					// Show success dialog
+					dialog.ShowConfirm("Installation Complete",
+						"LocalAI has been downloaded and installed successfully. You can now start LocalAI from the launcher.",
+						func(close bool) {
+							progressWindow.Close()
+							// Update status
+							ui.UpdateStatus("LocalAI installed successfully")
+						}, progressWindow)
+				}
+			})
+		}()
+	})
+}
+
 // UpdateRunningState updates UI based on LocalAI running state
 func (ui *LauncherUI) UpdateRunningState(isRunning bool) {
 	fyne.Do(func() {
--- a/contrib/macos/sign-and-notarize.sh
+++ b/contrib/macos/sign-and-notarize.sh
@@ -71,42 +71,13 @@ cmd_notarize() {
  echo "[notarize] notarized and stapled $dmg"
 }

-# Notarize and staple the .app bundle itself. Stapling the dmg alone is not
-# enough: an app with no embedded ticket has no local proof of notarization, so
-# Gatekeeper falls back to an online check — and the app then fails to launch on
-# a machine that is offline / behind a firewall, or once it has been copied out
-# of the dmg. Stapling the bundle makes it verify offline. notarytool needs an
-# archive for a bundle, so we zip it first.
-cmd_notarize_app() {
-  local app="$1"
-  if [ -z "${MACOS_NOTARY_KEY:-}" ]; then
-    echo "[notarize] MACOS_NOTARY_KEY unset: skipping notarization of $app"
-    return 0
-  fi
-  local keyfile zip
-  keyfile="$(mktemp).p8"
-  zip="$(mktemp).zip"
-  echo "$MACOS_NOTARY_KEY" | base64 --decode > "$keyfile"
-  ditto -c -k --keepParent "$app" "$zip"
-  xcrun notarytool submit "$zip" \
-    --key "$keyfile" \
-    --key-id "${MACOS_NOTARY_KEY_ID:?}" \
-    --issuer "${MACOS_NOTARY_ISSUER_ID:?}" \
-    --wait
-  rm -f "$keyfile" "$zip"
-  xcrun stapler staple "$app"
-  xcrun stapler validate "$app"
-  echo "[notarize] notarized and stapled $app"
-}
-
 main() {
  local sub="${1:-}"; shift || true
  case "$sub" in
-    import-cert)  cmd_import_cert ;;
-    sign)         cmd_sign "$@" ;;
-    notarize)     cmd_notarize "$@" ;;
-    notarize-app) cmd_notarize_app "$@" ;;
-    *) echo "usage: $0 {import-cert|sign <path>|notarize <dmg>|notarize-app <app>}" >&2; exit 2 ;;
+    import-cert) cmd_import_cert ;;
+    sign)        cmd_sign "$@" ;;
+    notarize)    cmd_notarize "$@" ;;
+    *) echo "usage: $0 {import-cert|sign <path>|notarize <dmg>}" >&2; exit 2 ;;
  esac
 }

--- a/core/application/agent_jobs.go
+++ b/core/application/agent_jobs.go
@@ -37,8 +37,6 @@ func (a *Application) RestartAgentJobService() error {
 		if d.JobStore != nil {
 			agentJobService.SetDistributedJobStore(d.JobStore)
 		}
-		// Keep agent tasks consistent across replicas (same client the dispatcher uses).
-		agentJobService.SetTaskSyncNATS(d.Nats)
 	}

 	// Start the service
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -103,11 +103,6 @@ func newApplication(appConfig *config.ApplicationConfig) *Application {
 		mcpTools.CloseMCPSessions(modelName)
 	})

-	// Record a model_load backend trace for every real backend load, so the
-	// Traces UI shows which backend runtime served each model and how long
-	// the load took. Load failures are traced by the modality wrappers.
-	ml.SetLoadObserver(corebackend.ModelLoadTraceObserver(appConfig))
-
 	app := &Application{
 		backendLoader:      config.NewModelConfigLoader(appConfig.SystemState.Model.ModelsPath),
 		modelLoader:        ml,
@@ -609,10 +604,6 @@ func (a *Application) StartAgentPool() {
 			usm.SetJobDBStore(s)
 		}
 	}
-	// Keep per-user agent tasks consistent across replicas (nil in standalone).
-	if d := a.Distributed(); d != nil {
-		usm.SetJobSyncNATS(d.Nats)
-	}
 	aps.SetUserServicesManager(usm)

 	a.agentPoolService.Store(aps)
--- a/core/application/config_file_watcher.go
+++ b/core/application/config_file_watcher.go
@@ -197,7 +197,6 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 		envWatchdogBusy := appConfig.WatchDogBusy == startupAppConfig.WatchDogBusy
 		envWatchdogIdleTimeout := appConfig.WatchDogIdleTimeout == startupAppConfig.WatchDogIdleTimeout
 		envWatchdogBusyTimeout := appConfig.WatchDogBusyTimeout == startupAppConfig.WatchDogBusyTimeout
-		envWatchdogInterval := appConfig.WatchDogInterval == startupAppConfig.WatchDogInterval
 		envSingleBackend := appConfig.SingleBackend == startupAppConfig.SingleBackend
 		envMaxActiveBackends := appConfig.MaxActiveBackends == startupAppConfig.MaxActiveBackends
 		envMemoryReclaimerEnabled := appConfig.MemoryReclaimerEnabled == startupAppConfig.MemoryReclaimerEnabled
@@ -258,14 +257,6 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
 					xlog.Warn("invalid watchdog busy timeout in runtime_settings.json", "error", err, "timeout", *settings.WatchdogBusyTimeout)
 				}
 			}
-			if settings.WatchdogInterval != nil && !envWatchdogInterval {
-				dur, err := time.ParseDuration(*settings.WatchdogInterval)
-				if err == nil {
-					appConfig.WatchDogInterval = dur
-				} else {
-					xlog.Warn("invalid watchdog interval in runtime_settings.json", "error", err, "interval", *settings.WatchdogInterval)
-				}
-			}
 			// Handle MaxActiveBackends (new) and SingleBackend (deprecated)
 			if settings.MaxActiveBackends != nil && !envMaxActiveBackends {
 				appConfig.MaxActiveBackends = *settings.MaxActiveBackends
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -355,13 +355,6 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		PrefixProvider:   prefixProvider,
 		PrefixConfig:     prefixCfg,
 		Pressure:         pressure,
-		SharedModels:     cfg.Distributed.SharedModels,
-		// Cap how long a cold load may hold the per-model advisory lock: the
-		// configured backend.install deadline plus a margin for file staging and
-		// the remote LoadModel. Derived from the install timeout so raising it
-		// (for slow links pulling multi-GB images) widens the ceiling too,
-		// instead of letting the static default cut a legitimately slow load.
-		ModelLoadCeiling: cfg.Distributed.BackendInstallTimeoutOrDefault() + 10*time.Minute,
 	})

 	// Wire staging-progress broadcasting so file-staging shows up on every
--- a/core/application/runtime_settings_branding_test.go
+++ b/core/application/runtime_settings_branding_test.go
@@ -87,31 +87,6 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() {
 		})
 	})

-	// Watchdog check interval (issue #10601). Unlike the idle/busy timeouts
-	// (which default to 0), NewApplicationConfig baseline-defaults the
-	// interval to 500ms. The loader's "apply file value only if still at the
-	// zero default" env-detection therefore never fired for the interval, so
-	// a UI-saved Check Interval silently reverted to 500ms on every restart
-	// while the idle/busy timeouts persisted. These specs construct the
-	// config the same way boot does (NewApplicationConfig) so they observe
-	// the real default the loader sees.
-	Describe("watchdog interval", func() {
-		It("loads a UI-saved watchdog_interval on the next startup", func() {
-			cfg := config.NewApplicationConfig()
-			cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`)
-			loadRuntimeSettingsFromFile(cfg)
-			Expect(cfg.WatchDogInterval).To(Equal(2 * time.Second))
-		})
-
-		It("does not override an explicit env/CLI interval", func() {
-			cfg := config.NewApplicationConfig()
-			cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`)
-			cfg.WatchDogInterval = 1 * time.Second // simulate SetWatchDogInterval from env
-			loadRuntimeSettingsFromFile(cfg)
-			Expect(cfg.WatchDogInterval).To(Equal(1*time.Second), "env/CLI interval must win over the persisted file value")
-		})
-	})
-
 	// MITM listener address. The file is the only source — no env var
 	// exists — so a regression here means an admin who configured the
 	// listener via /api/settings loses it after a reboot, even though
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -280,9 +280,6 @@ func New(opts ...config.AppOption) (*Application, error) {
 		if application.agentJobService != nil {
 			application.agentJobService.SetDistributedBackends(distSvc.Dispatcher)
 			application.agentJobService.SetDistributedJobStore(distSvc.JobStore)
-			// Keep agent tasks consistent across replicas (jobs already sync via the
-			// dispatcher + DB read-through). Same NATS client the dispatcher uses.
-			application.agentJobService.SetTaskSyncNATS(distSvc.Nats)
 		}
 		// Wire skill store into AgentPoolService (wired at pool start time via closure)
 		// The actual wiring happens in StartAgentPool since the pool doesn't exist yet.
@@ -369,7 +366,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 	}

 	for _, backend := range options.ExternalBackends {
-		if err := galleryop.InstallExternalBackend(options.Context, options.BackendGalleries, options.SystemState, application.ModelLoader(), nil, backend, "", "", false, options.RequireBackendIntegrity); err != nil {
+		if err := galleryop.InstallExternalBackend(options.Context, options.BackendGalleries, options.SystemState, application.ModelLoader(), nil, backend, "", "", options.RequireBackendIntegrity); err != nil {
 			xlog.Error("error installing external backend", "error", err)
 		}
 	}
--- a/core/backend/model_load_trace_test.go
+++ b/core/backend/model_load_trace_test.go
@@ -1,72 +0,0 @@
-package backend_test
-
-import (
-	"errors"
-	"time"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/trace"
-	"github.com/mudler/LocalAI/pkg/model"
-)
-
-// ModelLoadTraceObserver is what makes successful loads visible on the
-// Traces page: one model_load row per real backend load, carrying the
-// resolved backend runtime. Failures must NOT be recorded here — the
-// modality wrappers own those — and the observer must respect the runtime
-// tracing toggle.
-var _ = Describe("ModelLoadTraceObserver", func() {
-	var appConfig *config.ApplicationConfig
-
-	successEvent := model.BackendLoadEvent{
-		ModelID:    "parakeet-cpp-realtime_eou_120m-v1",
-		ModelName:  "realtime_eou_120m.gguf",
-		Backend:    "parakeet-cpp",
-		BackendURI: "/backends/intel-sycl-f16-parakeet-cpp-development/run.sh",
-		Duration:   1500 * time.Millisecond,
-	}
-
-	BeforeEach(func() {
-		appConfig = &config.ApplicationConfig{
-			EnableTracing:   true,
-			TracingMaxItems: 64,
-		}
-		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
-		trace.ClearBackendTraces()
-	})
-
-	It("records a model_load trace with the backend runtime on success", func() {
-		backend.ModelLoadTraceObserver(appConfig)(successEvent)
-
-		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
-		got := trace.GetBackendTraces()[0]
-		Expect(got.Type).To(Equal(trace.BackendTraceModelLoad))
-		Expect(got.Summary).To(Equal("Model loaded"))
-		Expect(got.ModelName).To(Equal("parakeet-cpp-realtime_eou_120m-v1"))
-		Expect(got.Backend).To(Equal("parakeet-cpp"))
-		Expect(got.Duration).To(Equal(1500 * time.Millisecond))
-		Expect(got.Data["backend_runtime"]).To(Equal("/backends/intel-sycl-f16-parakeet-cpp-development/run.sh"))
-		Expect(got.Data["model_file"]).To(Equal("realtime_eou_120m.gguf"))
-		Expect(got.Error).To(BeEmpty())
-	})
-
-	It("skips failed loads — the modality wrappers trace those with request context", func() {
-		failed := successEvent
-		failed.Err = errors.New("grpc service not ready")
-
-		backend.ModelLoadTraceObserver(appConfig)(failed)
-
-		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
-	})
-
-	It("records nothing when tracing is disabled", func() {
-		appConfig.EnableTracing = false
-
-		backend.ModelLoadTraceObserver(appConfig)(successEvent)
-
-		Consistently(trace.GetBackendTraces, "100ms", "20ms").Should(BeEmpty())
-	})
-})
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -19,39 +19,6 @@ import (
 	"github.com/mudler/xlog"
 )

-// ModelLoadTraceObserver returns the ModelLoader load observer that records
-// a model_load backend trace for every successful real load (backend process
-// spawn + LoadModel RPC; cache hits never reach the observer). Failures are
-// deliberately skipped here: the modality wrappers already record them via
-// recordModelLoadFailure with request context, and the backend auto-discovery
-// scan probes several backends before one succeeds — tracing every probe
-// failure would bury the buffer in noise.
-//
-// The traced data includes the resolved backend runtime (the installed
-// backend's launcher path, which names the variant directory) — that is what
-// identifies WHICH build served the load. A stale installed backend is
-// invisible in the model config but obvious here.
-func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.BackendLoadEvent) {
-	return func(ev model.BackendLoadEvent) {
-		if ev.Err != nil || !appConfig.EnableTracing {
-			return
-		}
-		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes)
-		trace.RecordBackendTrace(trace.BackendTrace{
-			Timestamp: time.Now(),
-			Duration:  ev.Duration,
-			Type:      trace.BackendTraceModelLoad,
-			ModelName: ev.ModelID,
-			Backend:   ev.Backend,
-			Summary:   "Model loaded",
-			Data: map[string]any{
-				"model_file":      ev.ModelName,
-				"backend_runtime": ev.BackendURI,
-			},
-		})
-	}
-}
-
 // recordModelLoadFailure records a backend trace when model loading fails.
 func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
 	if !appConfig.EnableTracing {
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -181,7 +181,6 @@ func transcriptResultFromProto(r *proto.TranscriptResult) *schema.TranscriptionR
 		Text:     r.Text,
 		Language: r.Language,
 		Duration: float64(r.Duration),
-		Eou:      r.Eou,
 	}

 	for _, s := range r.Segments {
--- a/Show More
+++ b/Show More