chore: ⬆️ Update mudler/parakeet.cpp to 30a307553f1965ceb38a1a922069a71e7dd67bf3 (#10092)

⬆️ Update mudler/parakeet.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
chore: ⬆️ Update antirez/ds4 to e16ead1e29c81a67bbb64e5b001117679cf9ce6e (#10076)
2026-06-07 16:27:09 -04:00 · 2026-05-30 22:48:09 +02:00 · 2026-05-30 22:08:30 +02:00 · 2026-05-30 14:46:10 +02:00 · 2026-05-30 12:04:10 +02:00 · 2026-05-30 08:34:00 +02:00
76 changed files with 3331 additions and 324 deletions
--- a/.agents/coding-style.md
+++ b/.agents/coding-style.md
@@ -50,6 +50,17 @@ Do not mix styles within a package. If you are extending tests in a package that

 This is enforced by `golangci-lint` via the `forbidigo` linter (see `.golangci.yml`); calls like `t.Errorf` / `t.Fatalf` / `t.Run` / `t.Skip` / `t.Logf` are flagged. Run `make lint` locally before submitting; the same check runs in CI (`.github/workflows/lint.yml`).

+## Outbound HTTP
+
+All outbound HTTP must go through `github.com/mudler/LocalAI/pkg/httpclient` rather than the standard library's default client. Use `httpclient.New(...)` (no body deadline — safe for streaming/SSE) or `httpclient.NewWithTimeout(d, ...)` (simple request/response). Both **refuse redirects by default** and set a TLS 1.2 floor.
+
+The reason is GHSA-3mj3-57v2-4636: the std default client follows redirects, and on a *cross-host* redirect Go forwards custom credential headers (e.g. Anthropic's `x-api-key`) to the redirect target, leaking the secret. `httpclient` fails closed instead.
+
+- Need to follow redirects (download CDNs, registry blobs, GitHub asset URLs)? Pass `httpclient.WithFollowRedirects()` — it still strips credential headers on any cross-host hop.
+- Have a custom transport (IP-pinned dialer, HTTP/2 tuning, a credential-injecting `RoundTripper`)? Pass `httpclient.WithTransport(rt)`, basing the transport on `httpclient.HardenedTransport()` to keep the TLS floor. Handed a `*http.Client` by a library? `httpclient.Harden(c)` applies the policy in place.
+
+This is enforced by `forbidigo` (see `.golangci.yml`): `http.DefaultClient` and `http.Get`/`Post`/`PostForm`/`Head` are flagged. The `&http.Client{}` composite literal can't be matched precisely by forbidigo without also flagging legitimate `*http.Client` type references, so that form is caught by review — don't construct raw clients.
+
 ## Documentation

 The project documentation is located in `docs/content`. When adding new features or changing existing functionality, it is crucial to update the documentation to reflect these changes. This helps users understand how to use the new capabilities and ensures the documentation stays relevant.
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -716,6 +716,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "8"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-12-parakeet-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
@@ -1556,6 +1569,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-13-parakeet-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -1569,6 +1595,19 @@ include:
    backend: "whisper"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-cuda-13-arm64-parakeet-cpp'
+    base-image: "ubuntu:24.04"
+    ubuntu-version: '2404'
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -2944,6 +2983,115 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  # parakeet-cpp
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-parakeet-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-parakeet-cpp'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f32'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f32-parakeet-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f16'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f16-parakeet-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-parakeet-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-parakeet-cpp'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-arm64-parakeet-cpp'
+    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2204'
+  - build-type: 'hipblas'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-rocm-hipblas-parakeet-cpp'
+    base-image: "rocm/dev-ubuntu-24.04:7.2.1"
+    runs-on: 'ubuntu-latest'
+    skip-drivers: 'false'
+    backend: "parakeet-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  # acestep-cpp
  - build-type: ''
    cuda-major-version: ""
@@ -3976,6 +4124,10 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-whisper"
    build-type: "metal"
    lang: "go"
+  - backend: "parakeet-cpp"
+    tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
+    build-type: "metal"
+    lang: "go"
  - backend: "acestep-cpp"
    tag-suffix: "-metal-darwin-arm64-acestep-cpp"
    build-type: "metal"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -30,6 +30,10 @@ jobs:
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
            file: "backend/go/whisper/Makefile"
+          - repository: "mudler/parakeet.cpp"
+            variable: "PARAKEET_VERSION"
+            branch: "master"
+            file: "backend/go/parakeet-cpp/Makefile"
          - repository: "leejet/stable-diffusion.cpp"
            variable: "STABLEDIFFUSION_GGML_VERSION"
            branch: "master"
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -46,6 +46,7 @@ jobs:
      speaker-recognition: ${{ steps.detect.outputs.speaker-recognition }}
      sherpa-onnx: ${{ steps.detect.outputs.sherpa-onnx }}
      whisper: ${{ steps.detect.outputs.whisper }}
+      parakeet-cpp: ${{ steps.detect.outputs.parakeet-cpp }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
@@ -633,6 +634,26 @@ jobs:
      - name: Build whisper backend image and run transcription gRPC e2e tests
        run: |
          make test-extra-backend-whisper-transcription
+  # Parakeet ASR via the parakeet-cpp backend (C++/ggml port of NeMo
+  # Parakeet). Drives AudioTranscription (offline, with word timestamps) on
+  # tdt_ctc-110m + the JFK 11s clip.
+  tests-parakeet-cpp-grpc-transcription:
+    needs: detect-changes
+    if: needs.detect-changes.outputs.parakeet-cpp == 'true' || needs.detect-changes.outputs.run-all == 'true'
+    runs-on: ubuntu-latest
+    timeout-minutes: 90
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          submodules: true
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.25.4'
+      - name: Build parakeet-cpp backend image and run transcription gRPC e2e tests
+        run: |
+          make test-extra-backend-parakeet-cpp-transcription
  # VITS TTS via the sherpa-onnx backend. Drives both TTS (file write) and
  # TTSStream (PCM chunks) on the e2e-backends harness.
  tests-sherpa-onnx-grpc-tts:
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -56,6 +56,20 @@ linters:
        # are exempt — see linters.exclusions.rules below.
        - pattern: '^os\.(Getenv|LookupEnv|Environ)$'
          msg: 'Plumb config through ApplicationConfig (or the relevant CLI struct) instead of reading env directly. CLI entry points (core/cli/) bind env vars via kong''s `env:` tag — that is the only sanctioned env→struct boundary. See .agents/coding-style.md.'
+        # Outbound HTTP must go through pkg/httpclient, which refuses redirects
+        # by default and sets a TLS floor. The std-library default client and
+        # the http.Get/Post/... convenience helpers follow redirects (up to 10)
+        # and, on a cross-host redirect, forward custom credential headers such
+        # as Anthropic's x-api-key to the redirect target — leaking the secret
+        # (GHSA-3mj3-57v2-4636). forbidigo can't precisely match the
+        # `&http.Client{}` composite literal without also flagging legitimate
+        # `*http.Client` type references, so that form is enforced by
+        # convention + review; these two patterns catch the implicit-default
+        # client, which is the common footgun.
+        - pattern: '^http\.DefaultClient$'
+          msg: 'Use pkg/httpclient (httpclient.New / NewWithTimeout) instead of http.DefaultClient — the std client follows redirects and leaks credential headers cross-host (GHSA-3mj3-57v2-4636). See .agents/coding-style.md.'
+        - pattern: '^http\.(Get|Post|PostForm|Head)$'
+          msg: 'Use pkg/httpclient (httpclient.New / NewWithTimeout) instead of http.Get/Post/PostForm/Head — these use http.DefaultClient, which follows redirects and leaks credential headers cross-host (GHSA-3mj3-57v2-4636). See .agents/coding-style.md.'
  exclusions:
    paths:
      # Upstream whisper.cpp source tree fetched by the whisper backend Makefile.
@@ -95,3 +109,18 @@ linters:
      - path: _test\.go$
        text: 'os\.(Getenv|LookupEnv|Environ)'
        linters: [forbidigo]
+      # pkg/httpclient is the sanctioned home for outbound HTTP clients; it
+      # necessarily references net/http directly.
+      - path: ^pkg/httpclient/
+        text: 'http\.(DefaultClient|Get|Post|PostForm|Head)'
+        linters: [forbidigo]
+      # Tests drive local httptest servers where redirect/TLS hardening is
+      # irrelevant; the std client is fine there.
+      - path: _test\.go$
+        text: 'http\.(DefaultClient|Get|Post|PostForm|Head)'
+        linters: [forbidigo]
+      # Vendored upstream whisper.cpp Go bindings are a separate module and
+      # cannot import pkg/httpclient.
+      - path: ^backend/go/whisper/sources/
+        text: 'http\.(DefaultClient|Get|Post|PostForm|Head)'
+        linters: [forbidigo]
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio

 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -991,6 +991,19 @@ test-extra-backend-whisper-transcription: docker-build-whisper
 	BACKEND_TEST_CAPS=health,load,transcription \
 	$(MAKE) test-extra-backend

+## Audio transcription wrapper for the parakeet-cpp (parakeet.cpp ggml port)
+## backend. Mirrors test-extra-backend-whisper-transcription: drives the
+## AudioTranscription / AudioTranscriptionStream RPCs against a published
+## Parakeet GGUF using the JFK 11s clip from whisper.cpp's CI samples. Not
+## part of the default test suite - run explicitly once the pinned model URL
+## is reachable.
+test-extra-backend-parakeet-cpp-transcription: docker-build-parakeet-cpp
+	BACKEND_IMAGE=local-ai-backend:parakeet-cpp \
+	BACKEND_TEST_MODEL_URL=https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf \
+	BACKEND_TEST_AUDIO_URL=https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav \
+	BACKEND_TEST_CAPS=health,load,transcription \
+	$(MAKE) test-extra-backend
+
 ## LocalVQE audio transform (joint AEC + noise suppression + dereverb).
 ## Exercises the audio_transform capability end-to-end: batch transform
 ## of a real WAV fixture and bidi streaming of synthetic silent frames.
@@ -1149,6 +1162,7 @@ BACKEND_HUGGINGFACE = huggingface|golang|.|false|true
 BACKEND_SILERO_VAD = silero-vad|golang|.|false|true
 BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|true
 BACKEND_WHISPER = whisper|golang|.|false|true
+BACKEND_PARAKEET_CPP = parakeet-cpp|golang|.|false|true
 BACKEND_VOXTRAL = voxtral|golang|.|false|true
 BACKEND_ACESTEP_CPP = acestep-cpp|golang|.|false|true
 BACKEND_QWEN3_TTS_CPP = qwen3-tts-cpp|golang|.|false|true
@@ -1236,6 +1250,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_SILERO_VAD)))
 $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
 $(eval $(call generate-docker-build-target,$(BACKEND_WHISPER)))
+$(eval $(call generate-docker-build-target,$(BACKEND_PARAKEET_CPP)))
 $(eval $(call generate-docker-build-target,$(BACKEND_VOXTRAL)))
 $(eval $(call generate-docker-build-target,$(BACKEND_OPUS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_RERANKERS)))
--- a/backend/cpp/ds4/CMakeLists.txt
+++ b/backend/cpp/ds4/CMakeLists.txt
@@ -60,6 +60,11 @@ elseif(DS4_GPU STREQUAL "cpu")
    set(DS4_OBJS "${DS4_DIR}/ds4_cpu.o")
 endif()

+# ds4.c now references ds4_distributed.c (distributed inference was split into
+# its own translation unit upstream). It is a single GPU-agnostic object shared
+# by every GPU mode, so link it in regardless of DS4_GPU.
+list(APPEND DS4_OBJS "${DS4_DIR}/ds4_distributed.o")
+
 add_executable(${TARGET}
    grpc-server.cpp
    dsml_parser.cpp
--- a/backend/cpp/ds4/Makefile
+++ b/backend/cpp/ds4/Makefile
@@ -1,10 +1,10 @@
 # ds4 backend Makefile.
 #
-# Upstream pin lives below as DS4_VERSION?=072bc0feb187be5f374c08b16d0045e1ad7bc9bc
+# Upstream pin lives below as DS4_VERSION?=e16ead1e29c81a67bbb64e5b001117679cf9ce6e
 # (.github/bump_deps.sh) can find and update it - matches the
 # llama-cpp / ik-llama-cpp / turboquant convention.

-DS4_VERSION?=072bc0feb187be5f374c08b16d0045e1ad7bc9bc
+DS4_VERSION?=e16ead1e29c81a67bbb64e5b001117679cf9ce6e
 DS4_REPO?=https://github.com/antirez/ds4

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
@@ -18,16 +18,19 @@ UNAME_S := $(shell uname -s)

 CMAKE_ARGS ?= -DCMAKE_BUILD_TYPE=Release

+# ds4_distributed.o is a GPU-agnostic translation unit that ds4.c/ds4_cpu.o now
+# reference (upstream split distributed inference into its own .c). The same
+# object is shared by every GPU mode, so it is appended unconditionally below.
 ifeq ($(BUILD_TYPE),cublas)
    CMAKE_ARGS += -DDS4_GPU=cuda
-    DS4_OBJ_TARGET := ds4.o ds4_cuda.o
+    DS4_OBJ_TARGET := ds4.o ds4_cuda.o ds4_distributed.o
 else ifeq ($(UNAME_S),Darwin)
    CMAKE_ARGS += -DDS4_GPU=metal
-    DS4_OBJ_TARGET := ds4.o ds4_metal.o
+    DS4_OBJ_TARGET := ds4.o ds4_metal.o ds4_distributed.o
 else
    # CPU reference path (Linux only - macOS CPU path is broken by VM bug per ds4 README).
    CMAKE_ARGS += -DDS4_GPU=cpu
-    DS4_OBJ_TARGET := ds4_cpu.o
+    DS4_OBJ_TARGET := ds4_cpu.o ds4_distributed.o
 endif

 ifneq ($(NATIVE),true)
@@ -52,11 +55,11 @@ ds4:
 # the right per-platform compile flags (Objective-C/Metal on Darwin, nvcc on Linux+CUDA).
 ds4/ds4.o: ds4
 ifeq ($(BUILD_TYPE),cublas)
-	+$(MAKE) -C ds4 ds4.o ds4_cuda.o
+	+$(MAKE) -C ds4 ds4.o ds4_cuda.o ds4_distributed.o
 else ifeq ($(UNAME_S),Darwin)
-	+$(MAKE) -C ds4 ds4.o ds4_metal.o
+	+$(MAKE) -C ds4 ds4.o ds4_metal.o ds4_distributed.o
 else
-	+$(MAKE) -C ds4 ds4_cpu.o
+	+$(MAKE) -C ds4 ds4_cpu.o ds4_distributed.o
 endif

 grpc-server: ds4/ds4.o
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=3bf7e836c2c5a895e8d12d3eb7e398ae7ab2f9ce
+IK_LLAMA_VERSION?=8960c5ba5ee9db30ba838304373aa4dbec9f7cbd
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=aa50b2c2ae91326d5aad956ceeb015d1d48e626b
+LLAMA_VERSION?=22d66b567eef11cf2e9832f04db64ee0323a0fd0
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/go/cloud-proxy/passthrough_edge_test.go
+++ b/backend/go/cloud-proxy/passthrough_edge_test.go
@@ -192,6 +192,61 @@ var _ = Describe("Forward", func() {
 		Expect(<-gotAuth).To(Equal("Bearer sk-real"), "caller-supplied Basic header must be replaced")
 	})

+	It("refuses to follow upstream redirects and never leaks the key to the redirect target", func() {
+		// A 3xx from the configured upstream means misconfiguration or a
+		// hijacked/spoofed host. Following it would replay the request —
+		// and the injected API key — to the Location host. Anthropic's
+		// x-api-key is NOT stripped by Go on cross-host redirects, so this
+		// would be a credential leak. The proxy must refuse the redirect.
+		sinkHit := make(chan string, 1)
+		sink := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			sinkHit <- r.Header.Get("x-api-key")
+			w.WriteHeader(http.StatusOK)
+		}))
+		defer sink.Close()
+
+		redirector := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			http.Redirect(w, r, sink.URL, http.StatusFound)
+		}))
+		defer redirector.Close()
+
+		GinkgoT().Setenv("CLOUD_PROXY_REDIRECT_KEY", "ant-secret")
+
+		cp := NewCloudProxy()
+		Expect(cp.Load(&pb.ModelOptions{
+			Proxy: &pb.ProxyOptions{
+				UpstreamUrl: redirector.URL,
+				Mode:        modePassthrough,
+				Provider:    providerAnthropic,
+				ApiKeyEnv:   "CLOUD_PROXY_REDIRECT_KEY",
+			},
+		})).To(Succeed())
+
+		addr := "test://forward-no-redirect"
+		grpc.Provide(addr, cp)
+		c := grpc.NewClient(addr, true, nil, false)
+		stream, err := c.Forward(context.Background())
+		Expect(err).NotTo(HaveOccurred())
+		Expect(stream.Send(&pb.ForwardRequest{
+			Path:   "/v1/messages",
+			Method: "POST",
+		})).To(Succeed())
+		Expect(stream.CloseSend()).To(Succeed())
+
+		// Drain the stream; a refused redirect surfaces as a non-EOF error.
+		var streamErr error
+		for {
+			if _, err := stream.Recv(); err != nil {
+				if !errors.Is(err, io.EOF) {
+					streamErr = err
+				}
+				break
+			}
+		}
+		Expect(streamErr).To(HaveOccurred(), "refused redirect must surface as an error")
+		Expect(sinkHit).NotTo(Receive(), "the redirect target must never be contacted")
+	})
+
 	It("handles concurrent calls without interference", func() {
 		// CloudProxy explicitly omits base.SingleThread — independent
 		// Forward streams must not block each other or leak state.
--- a/backend/go/cloud-proxy/proxy.go
+++ b/backend/go/cloud-proxy/proxy.go
@@ -11,9 +11,11 @@ import (
 	"strings"
 	"sync/atomic"

+	"github.com/mudler/xlog"
+
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/xlog"
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // Mirror of core/config.Proxy{Mode,Provider}* — backends don't
@@ -48,10 +50,15 @@ type proxyConfig struct {
 }

 func NewCloudProxy() *CloudProxy {
-	// No Client-level Timeout — that would bound streaming SSE
-	// responses too, which can legitimately last minutes. Per-request
-	// deadlines come from the gRPC stream context.
-	return &CloudProxy{client: &http.Client{}}
+	// httpclient.New refuses redirects outright: the proxy talks to a
+	// single configured upstream API (OpenAI/Anthropic/...) that answers
+	// directly, so a 3xx means misconfiguration, a hijacked upstream, or
+	// DNS trickery — never normal operation. Following it would replay the
+	// request, including the operator's x-api-key (which Go does NOT strip
+	// on cross-host redirects), to an unvetted host and leak the key
+	// (GHSA-3mj3-57v2-4636). It also imposes no body deadline, so streaming
+	// SSE responses that legitimately last minutes are not truncated.
+	return &CloudProxy{client: httpclient.New()}
 }

 func (c *CloudProxy) Load(opts *pb.ModelOptions) error {
@@ -426,4 +433,3 @@ func isHopByHopHeader(name string) bool {
 	}
 	return false
 }
-
--- a/backend/go/parakeet-cpp/.gitignore
+++ b/backend/go/parakeet-cpp/.gitignore
@@ -0,0 +1,11 @@
+.cache/
+sources/
+build/
+package/
+parakeet-cpp-grpc
+# build artifacts staged in-tree by the Makefile (cp from sources/) or
+# symlinked for local dev; the real sources live in parakeet.cpp upstream.
+*.so
+*.so.*
+parakeet_capi.h
+compile_commands.json
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -0,0 +1,89 @@
+# parakeet-cpp backend Makefile.
+#
+# Upstream pin lives below as PARAKEET_VERSION?=30a307553f1965ceb38a1a922069a71e7dd67bf3
+# so that .github/bump_deps.sh can find and update it - matches the
+# whisper.cpp / ds4 / vibevoice-cpp convention.
+#
+# Local dev shortcut: if you already have an out-of-tree parakeet.cpp
+# build, you can symlink the .so + header into this directory and skip
+# the clone/cmake steps entirely, e.g.:
+#
+#   ln -sf /path/to/parakeet.cpp/build-shared/libparakeet.so .
+#   ln -sf /path/to/parakeet.cpp/include/parakeet_capi.h .
+#   go build -o parakeet-cpp-grpc .
+#
+# That's what the L0 smoke test uses. The default target below does the
+# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
+
+PARAKEET_VERSION?=30a307553f1965ceb38a1a922069a71e7dd67bf3
+PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+BUILD_TYPE?=
+NATIVE?=false
+
+# Build ggml statically into libparakeet.so (PIC) so the shared lib is
+# self-contained: dlopen needs no libggml*.so alongside it, only system libs
+# (libstdc++/libgomp/libc) that the runtime image already provides.
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DPARAKEET_SHARED=ON -DPARAKEET_BUILD_CLI=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DGGML_CUDA=ON
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DGGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DGGML_VULKAN=ON
+endif
+
+.PHONY: parakeet-cpp-grpc package build clean purge test all
+
+all: parakeet-cpp-grpc
+
+# Clone the upstream parakeet.cpp source at the pinned commit. Directory
+# acts as the target so make only re-clones when missing. After a
+# PARAKEET_VERSION bump, run 'make purge && make' to refetch.
+sources/parakeet.cpp:
+	mkdir -p sources/parakeet.cpp
+	cd sources/parakeet.cpp && \
+	git init -q && \
+	git remote add origin $(PARAKEET_REPO) && \
+	git fetch --depth 1 origin $(PARAKEET_VERSION) && \
+	git checkout FETCH_HEAD && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+# Build the shared lib + header out-of-tree, then stage them next to the
+# Go sources so purego.Dlopen("libparakeet.so") and the cgo-less build
+# both pick them up.
+libparakeet.so: sources/parakeet.cpp
+	cmake -B sources/parakeet.cpp/build-shared -S sources/parakeet.cpp $(CMAKE_ARGS)
+	cmake --build sources/parakeet.cpp/build-shared --config Release -j$(JOBS)
+	cp -fv sources/parakeet.cpp/build-shared/libparakeet.so* ./ 2>/dev/null || true
+	cp -fv sources/parakeet.cpp/include/parakeet_capi.h ./
+
+parakeet-cpp-grpc: libparakeet.so main.go goparakeetcpp.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o parakeet-cpp-grpc .
+
+package: parakeet-cpp-grpc
+	bash package.sh
+
+build: package
+
+# Test target. Smoke test is gated on PARAKEET_BACKEND_TEST_MODEL +
+# PARAKEET_BACKEND_TEST_WAV; without them the spec auto-skips.
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+clean: purge
+	rm -rf libparakeet.so* parakeet_capi.h package parakeet-cpp-grpc
+
+purge:
+	rm -rf sources/parakeet.cpp
--- a/backend/go/parakeet-cpp/goparakeetcpp.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp.go
@@ -0,0 +1,393 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"unsafe"
+
+	"github.com/go-audio/wav"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/utils"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// purego-bound entry points from libparakeet.so. Names match
+// parakeet_capi.h exactly so a `nm libparakeet.so | grep parakeet_capi`
+// is enough to spot drift.
+//
+// Functions that return char* are declared as uintptr so we can call
+// parakeet_capi_free_string on the same pointer after copying, the
+// C-API contract is "caller owns and must free the returned buffer".
+//
+// CppLastError is the exception: it is declared with a Go string return, so
+// the message is copied and the C pointer is never passed to CppFreeString.
+// NOTE(review): this presumes the last-error buffer stays owned by the ctx;
+// confirm against parakeet_capi.h.
+//
+// All of these variables are nil until they have been registered against a
+// loaded library (done in main and in the test bootstrap).
+var (
+	CppAbiVersion         func() int32
+	CppLoad               func(ggufPath string) uintptr
+	CppFree               func(ctx uintptr)
+	CppTranscribePath     func(ctx uintptr, wavPath string, decoder int32) uintptr
+	CppTranscribePathJSON func(ctx uintptr, wavPath string, decoder int32) uintptr
+	CppFreeString         func(s uintptr)
+	CppLastError          func(ctx uintptr) string
+
+	// Cache-aware streaming (RNN-T) entry points. stream_begin returns 0 for
+	// non-streaming models. feed/finalize return a malloc'd char* (uintptr,
+	// freed via CppFreeString); feed writes 1 to *eouOut on an <EOU>/<EOB>.
+	CppStreamBegin    func(ctx uintptr) uintptr
+	CppStreamFeed     func(s uintptr, pcm []float32, nSamples int32, eouOut unsafe.Pointer) uintptr
+	CppStreamFinalize func(s uintptr) uintptr
+	CppStreamFree     func(s uintptr)
+)
+
+// streamChunkSamples is how much 16 kHz mono PCM we hand to stream_feed per
+// call (1 s). The session buffers internally and decodes once a full
+// cache-aware encoder chunk is available, so this only bounds how often we
+// poll for newly-finalized text, not the model's actual chunk size.
+const streamChunkSamples = 16000
+
+// transcriptJSON mirrors the document returned by
+// parakeet_capi_transcribe_path_json (see parakeet_capi.h):
+//
+//	{"text":"...",
+//	 "words":[{"w":"...","start":0.480,"end":0.640,"conf":0.9100}, ...],
+//	 "tokens":[{"id":123,"t":0.480,"conf":0.9100}, ...]}
+//
+// "start"/"end"/"t" are seconds; "conf" is confidence in (0,1].
+type transcriptJSON struct {
+	Text   string            `json:"text"`
+	Words  []transcriptWord  `json:"words"`
+	Tokens []transcriptToken `json:"tokens"`
+}
+
+// transcriptWord is one entry of the "words" array in the transcript JSON:
+// the word text, its start/end offsets in seconds, and its confidence.
+type transcriptWord struct {
+	W     string  `json:"w"`
+	Start float64 `json:"start"`
+	End   float64 `json:"end"`
+	Conf  float64 `json:"conf"`
+}
+
+// transcriptToken is one entry of the "tokens" array in the transcript JSON:
+// the token id, its timestamp in seconds, and its confidence.
+type transcriptToken struct {
+	ID   int32   `json:"id"`
+	T    float64 `json:"t"`
+	Conf float64 `json:"conf"`
+}
+
+// ParakeetCpp owns a single loaded parakeet_ctx. The C engine is a
+// thread-unsafe singleton (mirrors whisper.cpp / vibevoice.cpp), so we
+// serialize calls through base.SingleThread.
+type ParakeetCpp struct {
+	base.SingleThread
+	// ctxPtr is the opaque context handle returned by CppLoad. Zero means no
+	// model is loaded; Free resets it to zero after releasing the context.
+	ctxPtr uintptr
+}
+
+// Load implements the LoadModel gRPC call. The GGUF at opts.ModelFile is
+// handed to parakeet_capi_load and the opaque context it returns is kept on
+// the receiver for later transcription calls.
+//
+// It returns an error when no model file is given or when the C-API hands
+// back a null context.
+func (p *ParakeetCpp) Load(opts *pb.ModelOptions) error {
+	modelFile := opts.ModelFile
+	if modelFile == "" {
+		return errors.New("parakeet-cpp: ModelFile is required")
+	}
+
+	handle := CppLoad(modelFile)
+	if handle != 0 {
+		p.ctxPtr = handle
+		return nil
+	}
+
+	// The last-error buffer belongs to the context, and a failed load never
+	// returns one, so the model path is the only detail available to report.
+	return fmt.Errorf("parakeet-cpp: parakeet_capi_load failed for %q", modelFile)
+}
+
+// AudioTranscription transcribes the audio file at opts.Dst in one shot via
+// parakeet_capi_transcribe_path_json, always passing decoder 0 (the C-API's
+// default, which picks the head matching the model architecture).
+//
+// The C-API reports words and tokens but no segments, so the result carries
+// exactly one segment covering the whole clip: it starts at the first word's
+// start and ends at the last word's end (both 0 when no words were produced).
+// Token ids are always attached to that segment. Per-word timings are only
+// attached when opts.TimestampGranularities contains "word", mirroring the
+// OpenAI API where segment-level output is the default.
+//
+// Every other request option (translate, diarize, prompt, temperature,
+// language, threads) is ignored. Incremental output is provided separately by
+// AudioTranscriptionStream.
+//
+// Errors are returned when no model is loaded, when opts.Dst is empty, when
+// the C call fails (with the C-API's last error message if there is one), or
+// when the returned JSON cannot be decoded.
+func (p *ParakeetCpp) AudioTranscription(_ context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
+	switch {
+	case p.ctxPtr == 0:
+		return pb.TranscriptResult{}, errors.New("parakeet-cpp: model not loaded")
+	case opts.Dst == "":
+		return pb.TranscriptResult{}, errors.New("parakeet-cpp: TranscriptRequest.dst (audio path) is required")
+	}
+
+	jsonPtr := CppTranscribePathJSON(p.ctxPtr, opts.Dst, 0)
+	if jsonPtr == 0 {
+		reason := CppLastError(p.ctxPtr)
+		if reason == "" {
+			reason = "unknown error"
+		}
+		return pb.TranscriptResult{}, fmt.Errorf("parakeet-cpp: transcribe_path_json failed: %s", reason)
+	}
+
+	// Copy the document into Go memory first, then hand the C buffer back.
+	payload := []byte(goStringFromCPtr(jsonPtr))
+	CppFreeString(jsonPtr)
+
+	var doc transcriptJSON
+	if err := json.Unmarshal(payload, &doc); err != nil {
+		return pb.TranscriptResult{}, fmt.Errorf("parakeet-cpp: decode transcript json: %w", err)
+	}
+
+	tokenIDs := make([]int32, len(doc.Tokens))
+	for i := range doc.Tokens {
+		tokenIDs[i] = doc.Tokens[i].ID
+	}
+
+	wordTimings := make([]*pb.TranscriptWord, len(doc.Words))
+	for i := range doc.Words {
+		src := doc.Words[i]
+		wordTimings[i] = &pb.TranscriptWord{
+			Start: secondsToNanos(src.Start),
+			End:   secondsToNanos(src.End),
+			Text:  src.W,
+		}
+	}
+
+	segment := &pb.TranscriptSegment{
+		Id:     0,
+		Text:   strings.TrimSpace(doc.Text),
+		Tokens: tokenIDs,
+	}
+	// Start/End stay at zero for a clip that yielded no words.
+	if n := len(wordTimings); n > 0 {
+		segment.Start = wordTimings[0].Start
+		segment.End = wordTimings[n-1].End
+	}
+	if wordsRequested(opts.TimestampGranularities) {
+		segment.Words = wordTimings
+	}
+
+	return pb.TranscriptResult{
+		Text:     segment.Text,
+		Segments: []*pb.TranscriptSegment{segment},
+	}, nil
+}
+
+// wordsRequested reports whether any entry of granularities is "word",
+// ignoring case and surrounding whitespace. This follows the OpenAI
+// transcription API, where word timings are only produced when
+// timestamp_granularities[] includes "word" and segment-level output is the
+// default. A nil or empty slice yields false.
+func wordsRequested(granularities []string) bool {
+	for i := range granularities {
+		candidate := strings.TrimSpace(granularities[i])
+		if !strings.EqualFold(candidate, "word") {
+			continue
+		}
+		return true
+	}
+	return false
+}
+
+// secondsToNanos converts the C-API's fractional-second timestamps into the
+// int64 nanoseconds LocalAI carries on TranscriptSegment/TranscriptWord, the
+// same nanosecond convention the whisper backend uses.
+func secondsToNanos(sec float64) int64 {
+	return int64(sec * 1e9)
+}
+
+// AudioTranscriptionStream drives the cache-aware streaming RNN-T over the
+// audio at opts.Dst: it decodes the file to 16 kHz mono PCM, feeds it in
+// chunks to parakeet_capi_stream_feed, and emits each newly-finalized text
+// run as a TranscriptStreamResponse delta. <EOU>/<EOB> events close the
+// current segment; a closing FinalResult carries the full transcript and the
+// per-utterance segments.
+//
+// stream_begin returns 0 for models that are not cache-aware streaming models
+// (only e.g. nvidia/parakeet_realtime_eou_120m-v1 qualifies). For those we fall
+// back to a single offline transcription emitted as one delta plus a closing
+// FinalResult, matching LocalAI's non-streaming streaming contract (and the
+// whisper backend), so the streaming endpoint works for every model.
+//
+// results is always closed before the function returns. Every send on
+// results also watches ctx, so a cancelled request whose consumer has stopped
+// reading ends with a Canceled status instead of blocking forever.
+//
+// Errors are returned when no model is loaded, when opts.Dst is empty, when
+// ctx is cancelled, when the audio cannot be decoded, or when a feed/finalize
+// call returns a null pointer.
+func (p *ParakeetCpp) AudioTranscriptionStream(ctx context.Context, opts *pb.TranscriptRequest, results chan *pb.TranscriptStreamResponse) error {
+	defer close(results)
+
+	if p.ctxPtr == 0 {
+		return errors.New("parakeet-cpp: model not loaded")
+	}
+	if opts.Dst == "" {
+		return errors.New("parakeet-cpp: TranscriptRequest.dst (audio path) is required")
+	}
+	if err := ctx.Err(); err != nil {
+		return status.Error(codes.Canceled, "transcription cancelled")
+	}
+
+	// send delivers one response, giving up with a Canceled status if the
+	// request is cancelled while the consumer is not receiving.
+	send := func(resp *pb.TranscriptStreamResponse) error {
+		select {
+		case results <- resp:
+			return nil
+		case <-ctx.Done():
+			return status.Error(codes.Canceled, "transcription cancelled")
+		}
+	}
+
+	stream := CppStreamBegin(p.ctxPtr)
+	if stream == 0 {
+		// Not a cache-aware streaming model: run a normal offline
+		// transcription and emit it as one delta + a closing final result.
+		res, err := p.AudioTranscription(ctx, opts)
+		if err != nil {
+			return err
+		}
+		if t := strings.TrimSpace(res.Text); t != "" {
+			if err := send(&pb.TranscriptStreamResponse{Delta: t}); err != nil {
+				return err
+			}
+		}
+		return send(&pb.TranscriptStreamResponse{FinalResult: &res})
+	}
+	defer CppStreamFree(stream)
+
+	data, duration, err := decodeWavMono16k(opts.Dst)
+	if err != nil {
+		return err
+	}
+
+	var (
+		full     strings.Builder
+		segText  strings.Builder
+		segments []*pb.TranscriptSegment
+		segID    int32
+	)
+
+	// flushSegment closes the segment accumulated so far; whitespace-only
+	// segments are dropped without consuming a segment id.
+	flushSegment := func() {
+		t := strings.TrimSpace(segText.String())
+		segText.Reset()
+		if t == "" {
+			return
+		}
+		segments = append(segments, &pb.TranscriptSegment{Id: segID, Text: t})
+		segID++
+	}
+
+	// emitDelta consumes the malloc'd char* returned by feed/finalize: frees
+	// it, accumulates the text, and sends a delta when non-empty. A 0 return
+	// is an error (vs the "" empty-but-non-NULL no-new-text case).
+	emitDelta := func(ret uintptr) error {
+		if ret == 0 {
+			msg := CppLastError(p.ctxPtr)
+			if msg == "" {
+				msg = "unknown error"
+			}
+			return fmt.Errorf("parakeet-cpp: stream feed/finalize failed: %s", msg)
+		}
+		delta := goStringFromCPtr(ret)
+		CppFreeString(ret)
+		if delta == "" {
+			return nil
+		}
+		full.WriteString(delta)
+		segText.WriteString(delta)
+		return send(&pb.TranscriptStreamResponse{Delta: delta})
+	}
+
+	for off := 0; off < len(data); off += streamChunkSamples {
+		if err := ctx.Err(); err != nil {
+			return status.Error(codes.Canceled, "transcription cancelled")
+		}
+		end := min(off+streamChunkSamples, len(data))
+		chunk := data[off:end]
+
+		// eou is written by the C side; non-zero marks an end of utterance.
+		var eou int32
+		ret := CppStreamFeed(stream, chunk, int32(len(chunk)), unsafe.Pointer(&eou))
+		if err := emitDelta(ret); err != nil {
+			return err
+		}
+		if eou != 0 {
+			flushSegment()
+		}
+	}
+
+	// Flush the streaming tail (final encoder chunk).
+	if err := emitDelta(CppStreamFinalize(stream)); err != nil {
+		return err
+	}
+	flushSegment()
+
+	text := strings.TrimSpace(full.String())
+	if len(segments) == 0 && text != "" {
+		segments = append(segments, &pb.TranscriptSegment{Id: 0, Text: text})
+	}
+	return send(&pb.TranscriptStreamResponse{
+		FinalResult: &pb.TranscriptResult{
+			Text:     text,
+			Segments: segments,
+			Duration: duration,
+		},
+	})
+}
+
+// decodeWavMono16k converts the audio at path to a WAV file in a temporary
+// directory and returns its samples as float32 plus the clip duration in
+// seconds. Mirrors the whisper backend: utils.AudioToWav (ffmpeg) normalises
+// rate/channels, go-audio decodes the PCM.
+//
+// The duration is the sample count divided by the decoded sample rate, and is
+// left at 0 when the decoder reports no format or a non-positive rate.
+// NOTE(review): this treats every sample as one frame, i.e. it relies on
+// utils.AudioToWav producing mono output; confirm against that helper.
+//
+// Each failure is wrapped with the step that produced it; the underlying
+// error stays reachable through errors.Is / errors.As.
+func decodeWavMono16k(path string) ([]float32, float32, error) {
+	dir, err := os.MkdirTemp("", "parakeet")
+	if err != nil {
+		return nil, 0, fmt.Errorf("parakeet-cpp: create temp dir: %w", err)
+	}
+	// Best-effort cleanup: a leftover temp dir is not worth failing over.
+	defer func() { _ = os.RemoveAll(dir) }()
+
+	converted := filepath.Join(dir, "converted.wav")
+	if err := utils.AudioToWav(path, converted); err != nil {
+		return nil, 0, fmt.Errorf("parakeet-cpp: convert %q to wav: %w", path, err)
+	}
+
+	fh, err := os.Open(converted)
+	if err != nil {
+		return nil, 0, fmt.Errorf("parakeet-cpp: open converted wav: %w", err)
+	}
+	// The file is only read, so a Close error carries no information.
+	defer func() { _ = fh.Close() }()
+
+	buf, err := wav.NewDecoder(fh).FullPCMBuffer()
+	if err != nil {
+		return nil, 0, fmt.Errorf("parakeet-cpp: decode wav pcm: %w", err)
+	}
+	data := buf.AsFloat32Buffer().Data
+	var duration float32
+	if buf.Format != nil && buf.Format.SampleRate > 0 {
+		duration = float32(len(data)) / float32(buf.Format.SampleRate)
+	}
+	return data, duration, nil
+}
+
+// Free releases the loaded parakeet_ctx, if any, and clears the stored
+// handle so a repeated call is a no-op. LocalAI calls it when the model is
+// unloaded. It always returns nil.
+func (p *ParakeetCpp) Free() error {
+	if p.ctxPtr == 0 {
+		return nil
+	}
+	CppFree(p.ctxPtr)
+	p.ctxPtr = 0
+	return nil
+}
+
+// goStringFromCPtr returns a Go copy of the NUL-terminated C string at cptr,
+// or "" when cptr is 0. The terminator is not part of the result.
+//
+// cptr is the raw pointer purego hands back from the C-API (a malloc'd
+// buffer owned by the caller). This function never frees it: callers release
+// it through CppFreeString once the copy has been made.
+//
+// Converting uintptr to unsafe.Pointer trips go vet's unsafeptr check, which
+// cannot tell a C-owned heap pointer from Go-managed memory. It is safe
+// here: the buffer is not tracked or moved by the Go GC and it is read
+// immediately, the same pattern (and the same tolerated warning) as the
+// whisper backend's unsafe.Slice over segsPtr.
+func goStringFromCPtr(cptr uintptr) string {
+	if cptr == 0 {
+		return ""
+	}
+	base := (*byte)(unsafe.Pointer(cptr)) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
+	// Walk forward one byte at a time until the terminating NUL.
+	length := 0
+	for cur := base; *cur != 0; cur = (*byte)(unsafe.Add(unsafe.Pointer(cur), 1)) {
+		length++
+	}
+	return string(unsafe.Slice(base, length))
+}
--- a/backend/go/parakeet-cpp/goparakeetcpp_test.go
+++ b/backend/go/parakeet-cpp/goparakeetcpp_test.go
@@ -0,0 +1,164 @@
+package main
+
+import (
+	"context"
+	"os"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/ebitengine/purego"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestParakeetCpp is the go test entry point: it routes Gomega failures to
+// Ginkgo and runs every spec registered in this package.
+func TestParakeetCpp(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "parakeet-cpp Backend Suite")
+}
+
+var (
+	libLoadOnce sync.Once
+	libLoadErr  error
+)
+
+// ensureLibLoaded performs the same bootstrap as main.go so specs can call
+// the C-API bridge directly, without starting the gRPC server. The library
+// named by PARAKEET_LIBRARY (default "libparakeet.so") is opened once per
+// test binary and every Cpp* entry point is bound to it. When the library
+// cannot be opened, the calling spec is skipped; the failure is remembered,
+// so later specs skip too instead of retrying.
+func ensureLibLoaded() {
+	libLoadOnce.Do(func() {
+		path := "libparakeet.so"
+		if override := os.Getenv("PARAKEET_LIBRARY"); override != "" {
+			path = override
+		}
+
+		handle, err := purego.Dlopen(path, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+		if err != nil {
+			libLoadErr = err
+			return
+		}
+
+		bindings := []struct {
+			target any
+			symbol string
+		}{
+			{&CppAbiVersion, "parakeet_capi_abi_version"},
+			{&CppLoad, "parakeet_capi_load"},
+			{&CppFree, "parakeet_capi_free"},
+			{&CppTranscribePath, "parakeet_capi_transcribe_path"},
+			{&CppTranscribePathJSON, "parakeet_capi_transcribe_path_json"},
+			{&CppStreamBegin, "parakeet_capi_stream_begin"},
+			{&CppStreamFeed, "parakeet_capi_stream_feed"},
+			{&CppStreamFinalize, "parakeet_capi_stream_finalize"},
+			{&CppStreamFree, "parakeet_capi_stream_free"},
+			{&CppFreeString, "parakeet_capi_free_string"},
+			{&CppLastError, "parakeet_capi_last_error"},
+		}
+		for _, b := range bindings {
+			purego.RegisterLibFunc(b.target, handle, b.symbol)
+		}
+	})
+
+	if libLoadErr == nil {
+		return
+	}
+	Skip("libparakeet.so not loadable: " + libLoadErr.Error())
+}
+
+// fixturesOrSkip reads the model path from PARAKEET_BACKEND_TEST_MODEL and
+// the audio path from PARAKEET_BACKEND_TEST_WAV and returns them in that
+// order. If either variable is unset or empty the current spec is skipped,
+// which keeps the smoke test out of default CI: it needs a real parakeet
+// GGUF and a 16 kHz mono WAV on disk.
+func fixturesOrSkip() (string, string) {
+	model := os.Getenv("PARAKEET_BACKEND_TEST_MODEL")
+	audio := os.Getenv("PARAKEET_BACKEND_TEST_WAV")
+	if model != "" && audio != "" {
+		return model, audio
+	}
+	Skip("set PARAKEET_BACKEND_TEST_MODEL and PARAKEET_BACKEND_TEST_WAV to run this spec")
+	return model, audio
+}
+
+var _ = Describe("ParakeetCpp", func() {
+	Context("AudioTranscription", func() {
+		It("transcribes a WAV via the parakeet C-API", func() {
+			modelPath, audioPath := fixturesOrSkip()
+			ensureLibLoaded()
+
+			p := &ParakeetCpp{}
+			Expect(p.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
+			defer func() { _ = p.Free() }()
+
+			res, err := p.AudioTranscription(context.Background(), &pb.TranscriptRequest{
+				Dst: audioPath,
+			})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(strings.TrimSpace(res.Text)).ToNot(BeEmpty(),
+				"expected non-empty transcript for %s", audioPath)
+			Expect(res.Segments).To(HaveLen(1),
+				"synthesises a single whole-clip segment")
+			Expect(res.Segments[0].Text).To(Equal(res.Text),
+				"single segment text must equal the top-level text")
+			// Default (no granularities) is segment-level: no per-word timings.
+			Expect(res.Segments[0].Words).To(BeEmpty(),
+				"word timings are opt-in via timestamp_granularities")
+		})
+
+		It("emits word-level timestamps when granularity=word", func() {
+			modelPath, audioPath := fixturesOrSkip()
+			ensureLibLoaded()
+
+			p := &ParakeetCpp{}
+			Expect(p.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
+			defer func() { _ = p.Free() }()
+
+			res, err := p.AudioTranscription(context.Background(), &pb.TranscriptRequest{
+				Dst:                    audioPath,
+				TimestampGranularities: []string{"word"},
+			})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res.Segments).To(HaveLen(1))
+			seg := res.Segments[0]
+			Expect(seg.Words).ToNot(BeEmpty(),
+				"expected per-word timestamps with granularity=word")
+			// Monotonic, non-negative timings spanning the segment.
+			Expect(seg.Words[0].Start).To(BeNumerically(">=", int64(0)))
+			Expect(seg.End).To(BeNumerically(">=", seg.Start))
+			Expect(seg.Words[len(seg.Words)-1].End).To(Equal(seg.End),
+				"segment end tracks the last word")
+		})
+	})
+
+	Context("AudioTranscriptionStream", func() {
+		It("streams deltas and a closing FinalResult from a cache-aware model", func() {
+			// Streaming needs a cache-aware streaming model (e.g.
+			// realtime_eou). With an offline model stream_begin returns 0
+			// and the backend takes its single-shot fallback, so the
+			// incremental feed/finalize path would go untested.
+			modelPath := os.Getenv("PARAKEET_BACKEND_TEST_STREAM_MODEL")
+			audioPath := os.Getenv("PARAKEET_BACKEND_TEST_WAV")
+			if modelPath == "" || audioPath == "" {
+				Skip("set PARAKEET_BACKEND_TEST_STREAM_MODEL (cache-aware streaming model) and PARAKEET_BACKEND_TEST_WAV")
+			}
+			ensureLibLoaded()
+
+			p := &ParakeetCpp{}
+			Expect(p.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
+			defer func() { _ = p.Free() }()
+
+			// The producer runs in its own goroutine and closes results when
+			// it returns, which ends the range loop below; its error is
+			// collected afterwards through errCh.
+			results := make(chan *pb.TranscriptStreamResponse, 64)
+			errCh := make(chan error, 1)
+			go func() {
+				errCh <- p.AudioTranscriptionStream(context.Background(),
+					&pb.TranscriptRequest{Dst: audioPath}, results)
+			}()
+
+			var deltas []string
+			var final *pb.TranscriptResult
+			for r := range results {
+				if r.Delta != "" {
+					deltas = append(deltas, r.Delta)
+				}
+				if r.FinalResult != nil {
+					final = r.FinalResult
+				}
+			}
+			Expect(<-errCh).ToNot(HaveOccurred())
+
+			Expect(final).ToNot(BeNil(), "expected a closing FinalResult")
+			Expect(strings.TrimSpace(final.Text)).ToNot(BeEmpty(),
+				"expected a non-empty streamed transcript")
+			Expect(final.Segments).ToNot(BeEmpty(),
+				"FinalResult always carries at least one segment")
+			// The concatenated deltas reconstruct the final transcript.
+			Expect(strings.TrimSpace(strings.Join(deltas, ""))).To(Equal(strings.TrimSpace(final.Text)),
+				"deltas should reconstruct the final text")
+		})
+	})
+})
--- a/backend/go/parakeet-cpp/main.go
+++ b/backend/go/parakeet-cpp/main.go
@@ -0,0 +1,68 @@
+package main
+
+// Started internally by LocalAI - one gRPC server per loaded model.
+//
+// Loads libparakeet.so via purego and registers the flat C-API entry
+// points declared in parakeet_capi.h. The library name can be overridden
+// with PARAKEET_LIBRARY (mirrors the WHISPER_LIBRARY / VIBEVOICECPP_LIBRARY
+// convention in the sibling backends); the default looks for the .so next
+// to this binary.
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+// LibFuncs pairs a pointer to one of the Cpp* function variables with the
+// name of the C symbol that purego binds to it.
+type LibFuncs struct {
+	FuncPtr any
+	Name    string
+}
+
+// main opens the parakeet shared library (PARAKEET_LIBRARY, defaulting to
+// "libparakeet.so"), binds every C-API entry point, logs the library's ABI
+// version to stderr, and then serves the backend over gRPC on -addr. A
+// library that fails to open or a server that fails to start aborts the
+// process with a panic.
+func main() {
+	library := "libparakeet.so"
+	if fromEnv := os.Getenv("PARAKEET_LIBRARY"); fromEnv != "" {
+		library = fromEnv
+	}
+
+	handle, err := purego.Dlopen(library, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(fmt.Errorf("parakeet-cpp: dlopen %q: %w", library, err))
+	}
+
+	// One binding per function in parakeet_capi.h. The transcribe_* calls
+	// return malloc'd char* buffers, so they are bound with a uintptr result:
+	// that keeps the raw pointer available for parakeet_capi_free_string,
+	// whereas a purego string result would copy the text and drop the
+	// pointer, leaking the buffer on every call.
+	for _, binding := range []LibFuncs{
+		{&CppAbiVersion, "parakeet_capi_abi_version"},
+		{&CppLoad, "parakeet_capi_load"},
+		{&CppFree, "parakeet_capi_free"},
+		{&CppTranscribePath, "parakeet_capi_transcribe_path"},
+		{&CppTranscribePathJSON, "parakeet_capi_transcribe_path_json"},
+		{&CppStreamBegin, "parakeet_capi_stream_begin"},
+		{&CppStreamFeed, "parakeet_capi_stream_feed"},
+		{&CppStreamFinalize, "parakeet_capi_stream_finalize"},
+		{&CppStreamFree, "parakeet_capi_stream_free"},
+		{&CppFreeString, "parakeet_capi_free_string"},
+		{&CppLastError, "parakeet_capi_last_error"},
+	} {
+		purego.RegisterLibFunc(binding.FuncPtr, handle, binding.Name)
+	}
+
+	fmt.Fprintf(os.Stderr, "[parakeet-cpp] ABI=%d\n", CppAbiVersion())
+
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &ParakeetCpp{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/parakeet-cpp/package.sh
+++ b/backend/go/parakeet-cpp/package.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#
+# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
+# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
+# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
+#
+# Output layout, relative to this script's directory:
+#   package/parakeet-cpp-grpc   the backend binary
+#   package/run.sh              the launcher
+#   package/lib/libparakeet.so* the shared library and its soname symlinks
+
+# Abort on the first failing command.
+set -e
+
+# Resolve the script's own directory so it works from any cwd.
+CURDIR=$(dirname "$(realpath "$0")")
+
+mkdir -p "$CURDIR/package/lib"
+
+cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
+cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
+
+# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
+# cp's own error output is discarded; a failed copy prints the hint below and
+# exits with status 1.
+cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
+	exit 1
+}
+
+echo "L0 package layout (full ldd walk lands in L3):"
+ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/parakeet-cpp/run.sh
+++ b/backend/go/parakeet-cpp/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Launcher for the packaged parakeet-cpp gRPC backend; all arguments are
+# forwarded unchanged to the binary.
+set -e
+
+# Resolve the script's own directory so it works from any cwd.
+CURDIR=$(dirname "$(realpath "$0")")
+
+# Search the packaged lib/ directory first, then the script directory, then
+# whatever the caller already had in LD_LIBRARY_PATH.
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
+
+# If a self-contained ld.so was packaged, route through it so the
+# packaged libc / libstdc++ are used instead of the host's (matches the
+# whisper backend's runtime layout).
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/parakeet-cpp-grpc" "$@"
+fi
+
+# No bundled loader: run the binary directly with the host's loader.
+exec "$CURDIR/parakeet-cpp-grpc" "$@"
--- a/backend/go/rfdetr-cpp/Makefile
+++ b/backend/go/rfdetr-cpp/Makefile
@@ -11,7 +11,7 @@ JOBS?=$(shell nproc --ignore=1)
 # build; leaving this on `master` always picks up the latest C-API surface
 # (incl. the per-detection accessor functions used by gorfdetrcpp.go).
 RFDETR_REPO?=https://github.com/mudler/rf-detr.cpp.git
-RFDETR_VERSION?=main
+RFDETR_VERSION?=65c0ffcc9a9bc9dae38252f63d0417c9845a6cf7

 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=29ab511fc75f89fbab148665eab1a8e10a139a72
+STABLEDIFFUSION_GGML_VERSION?=0e4ee04488159b81d95a9ffcd983a077fd5dcb77

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=6dcdd6536456158667747f724d6bd3a2ceaa8d88
+WHISPER_CPP_VERSION?=f24588a272ae8e23280d9c220536437164e6ed28
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -122,6 +122,35 @@
    nvidia-cuda-12: "cuda12-whisper"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-whisper"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-whisper"
+- &parakeetcpp
+  name: "parakeet-cpp"
+  alias: "parakeet-cpp"
+  license: mit
+  icon: https://avatars.githubusercontent.com/u/95302084
+  description: |
+    parakeet.cpp is a C++/ggml port of NVIDIA NeMo Parakeet automatic speech recognition (ASR) models.
+    It supports the tdt, ctc, rnnt and hybrid decoder families as well as cache-aware streaming transcription,
+    and runs on CPU, NVIDIA CUDA, AMD ROCm/HIP, Intel SYCL and NVIDIA Jetson (L4T) targets.
+  urls:
+    - https://github.com/mudler/parakeet.cpp
+  tags:
+    - audio-transcription
+    - CPU
+    - GPU
+    - CUDA
+    - HIP
+  capabilities:
+    default: "cpu-parakeet-cpp"
+    nvidia: "cuda12-parakeet-cpp"
+    intel: "intel-sycl-f16-parakeet-cpp"
+    metal: "metal-parakeet-cpp"
+    amd: "rocm-parakeet-cpp"
+    vulkan: "vulkan-parakeet-cpp"
+    nvidia-l4t: "nvidia-l4t-arm64-parakeet-cpp"
+    nvidia-cuda-13: "cuda13-parakeet-cpp"
+    nvidia-cuda-12: "cuda12-parakeet-cpp"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -1928,6 +1957,121 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-whisper
+## parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "parakeet-cpp-development"
+  capabilities:
+    default: "cpu-parakeet-cpp-development"
+    nvidia: "cuda12-parakeet-cpp-development"
+    intel: "intel-sycl-f16-parakeet-cpp-development"
+    metal: "metal-parakeet-cpp-development"
+    amd: "rocm-parakeet-cpp-development"
+    vulkan: "vulkan-parakeet-cpp-development"
+    nvidia-l4t: "nvidia-l4t-arm64-parakeet-cpp-development"
+    nvidia-cuda-13: "cuda13-parakeet-cpp-development"
+    nvidia-cuda-12: "cuda12-parakeet-cpp-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-parakeet-cpp-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-parakeet-cpp-development"
+- !!merge <<: *parakeetcpp
+  name: "nvidia-l4t-arm64-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "nvidia-l4t-arm64-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-arm64-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "cuda13-nvidia-l4t-arm64-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "cuda13-nvidia-l4t-arm64-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "cpu-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-cpu-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "cpu-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-cpu-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "metal-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "metal-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "cuda12-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "cuda12-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "rocm-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "rocm-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "intel-sycl-f32-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "intel-sycl-f32-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f32-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "intel-sycl-f16-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "intel-sycl-f16-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f16-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "vulkan-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "vulkan-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-vulkan-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "cuda13-parakeet-cpp"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-parakeet-cpp
+- !!merge <<: *parakeetcpp
+  name: "cuda13-parakeet-cpp-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-parakeet-cpp
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
--- a/backend/python/vllm/requirements-cublas13-after.txt
+++ b/backend/python/vllm/requirements-cublas13-after.txt
@@ -3,5 +3,5 @@
 # on a cu130 host. Pull the cu130-flavoured wheel from vLLM's per-tag index
 # instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
 # so uv consults this index alongside PyPI.
--extra-index-url https://wheels.vllm.ai/0.21.0/cu130
-vllm==0.21.0
+--extra-index-url https://wheels.vllm.ai/0.22.0/cu130
+vllm==0.22.0
--- a/cmd/launcher/internal/release_manager.go
+++ b/cmd/launcher/internal/release_manager.go
@@ -17,6 +17,7 @@ import (
 	"time"

 	"github.com/mudler/LocalAI/internal"
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // Release represents a LocalAI release
@@ -67,9 +68,7 @@ func NewReleaseManager() *ReleaseManager {
 		CurrentVersion: internal.PrintableVersion(),
 		ChecksumsPath:  checksumsPath,
 		MetadataPath:   metadataPath,
-		HTTPClient: &http.Client{
-			Timeout: 30 * time.Second,
-		},
+		HTTPClient:     httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects()),
 	}
 }

--- a/core/application/application.go
+++ b/core/application/application.go
@@ -90,6 +90,8 @@ type Application struct {
 	// LocalAI Assistant in-process MCP server. nil when DisableLocalAIAssistant
 	// is set; otherwise initialised in start() after galleryService.
 	localAIAssistant *mcpTools.LocalAIAssistantHolder
+
+	shutdownOnce sync.Once
 }

 func newApplication(appConfig *config.ApplicationConfig) *Application {
@@ -320,6 +322,24 @@ func (a *Application) IsDistributed() bool {
 	return a.distributed != nil
 }

+// Shutdown stops backend gRPC processes and distributed services
+// synchronously on the caller's stack. The context-cancel goroutine wired
+// in New does the same work asynchronously, which races test-binary exit
+// and CLI shutdown — orphaning spawned mock-backend / llama.cpp / etc.
+// children to init. Callers that need cleanup finished before they proceed
+// (AfterSuite/AfterEach, signal handlers) must call this. Safe to call
+// multiple times: only the first call does the work; later calls return nil.
+func (a *Application) Shutdown() error {
+	var err error
+	a.shutdownOnce.Do(func() {
+		a.distributed.Shutdown()
+		if a.modelLoader != nil {
+			err = a.modelLoader.StopAllGRPC()
+		}
+	})
+	return err
+}
+
 // waitForHealthyWorker blocks until at least one healthy backend worker is registered.
 // This prevents the agent pool from failing during startup when workers haven't connected yet.
 func (a *Application) waitForHealthyWorker() {
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -449,13 +449,15 @@ func New(opts ...config.AppOption) (*Application, error) {

 	application.ModelLoader().SetBackendLoggingEnabled(options.EnableBackendLogging)

-	// turn off any process that was started by GRPC if the context is canceled
+	// Safety-net cleanup if the application context is cancelled without
+	// the caller invoking Shutdown directly. This is fire-and-forget — it
+	// races binary exit and is unreliable in tests; the deterministic path
+	// is calling application.Shutdown() directly. Shutdown's sync.Once makes
+	// the call from this goroutine a no-op when Shutdown has already run.
 	go func() {
 		<-options.Context.Done()
 		xlog.Debug("Context canceled, shutting down")
-		application.distributed.Shutdown()
-		err := application.ModelLoader().StopAllGRPC()
-		if err != nil {
+		if err := application.Shutdown(); err != nil {
 			xlog.Error("error while stopping all grpc backends", "error", err)
 		}
 	}()
--- a/core/backend/options_internal_test.go
+++ b/core/backend/options_internal_test.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"

 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/reasoning"

 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -42,3 +43,35 @@ var _ = Describe("grpcModelOpts EngineArgs", func() {
 		Expect(opts.EngineArgs).To(BeEmpty())
 	})
 })
+
+// Guards the DisableReasoning -> enable_thinking metadata conversion that the
+// per-request reasoning_effort feature (issue #10072) relies on: the request
+// merge sets ReasoningConfig.DisableReasoning, and gRPCPredictOpts is where it
+// becomes the gRPC PredictOptions.Metadata the backend reads.
+var _ = Describe("gRPCPredictOpts enable_thinking metadata", func() {
+	// withReasoning builds a fully-defaulted config (gRPCPredictOpts dereferences
+	// many pointer fields) and overrides only the reasoning toggle.
+	withReasoning := func(disable *bool) config.ModelConfig {
+		cfg := config.ModelConfig{}
+		cfg.SetDefaults()
+		cfg.ReasoningConfig = reasoning.Config{DisableReasoning: disable}
+		return cfg
+	}
+	disabled := true
+	enabled := false
+
+	It("emits enable_thinking=false when reasoning is disabled", func() {
+		opts := gRPCPredictOpts(withReasoning(&disabled), "/tmp/models")
+		Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "false"))
+	})
+
+	It("emits enable_thinking=true when reasoning is enabled", func() {
+		opts := gRPCPredictOpts(withReasoning(&enabled), "/tmp/models")
+		Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "true"))
+	})
+
+	It("omits enable_thinking when reasoning is unset", func() {
+		opts := gRPCPredictOpts(withReasoning(nil), "/tmp/models")
+		Expect(opts.Metadata).ToNot(HaveKey("enable_thinking"))
+	})
+})
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -577,12 +577,8 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	}

 	signals.RegisterGracefulTerminationHandler(func() {
-		if err := app.ModelLoader().StopAllGRPC(); err != nil {
-			xlog.Error("error while stopping all grpc backends", "error", err)
-		}
-		// Clean up distributed services (idempotent — safe if already called)
-		if d := app.Distributed(); d != nil {
-			d.Shutdown()
+		if err := app.Shutdown(); err != nil {
+			xlog.Error("error while shutting down application", "error", err)
 		}
 	})

--- a/core/cli/workerregistry/client.go
+++ b/core/cli/workerregistry/client.go
@@ -15,6 +15,8 @@ import (
 	"time"

 	"github.com/mudler/xlog"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // RegistrationClient talks to the frontend's /api/node/* endpoints.
@@ -37,7 +39,7 @@ func (c *RegistrationClient) httpTimeout() time.Duration {
 // httpClient returns the shared HTTP client, initializing it on first use.
 func (c *RegistrationClient) httpClient() *http.Client {
 	c.clientOnce.Do(func() {
-		c.client = &http.Client{Timeout: c.httpTimeout()}
+		c.client = httpclient.NewWithTimeout(c.httpTimeout())
 	})
 	return c.client
 }
--- a/core/clients/store.go
+++ b/core/clients/store.go
@@ -6,6 +6,8 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // Define a struct to hold the store API client
@@ -47,7 +49,7 @@ type FindResponse struct {
 func NewStoreClient(baseUrl string) *StoreClient {
 	return &StoreClient{
 		BaseURL: baseUrl,
-		Client:  &http.Client{},
+		Client:  httpclient.New(),
 	}
 }

--- a/core/config/gen_inference_defaults/main.go
+++ b/core/config/gen_inference_defaults/main.go
@@ -9,10 +9,11 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
-	"net/http"
 	"os"
 	"sort"
 	"strings"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 const (
@@ -55,7 +56,7 @@ var allowedFields = map[string]bool{
 func main() {
 	fmt.Fprintf(os.Stderr, "Fetching %s ...\n", unslothURL)

-	resp, err := http.Get(unslothURL)
+	resp, err := httpclient.New(httpclient.WithFollowRedirects()).Get(unslothURL)
 	if err != nil {
 		fatal("fetch failed: %v", err)
 	}
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -732,6 +732,17 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Proxy.Mode = ProxyModePassthrough
 	}

+	// When templating is delegated to the backend (use_tokenizer_template),
+	// the backend also owns tool-call grammar generation and parsing. Sending
+	// a LocalAI-generated grammar alongside overrides the backend's native
+	// (name-first) tool pipeline and makes it stream the tool-call JSON back as
+	// plain content (issue #10052). The GGUF auto-import path already couples
+	// these two flags; enforce it here so gallery and hand-written configs that
+	// set use_tokenizer_template directly stay consistent.
+	if cfg.TemplateConfig.UseTokenizerTemplate {
+		cfg.FunctionsConfig.GrammarConfig.NoGrammar = true
+	}
+
 	// Apply model-family-specific inference defaults before generic fallbacks.
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -471,4 +471,33 @@ concurrency_groups:
 			Expect(configs[0].GetConcurrencyGroups()).To(Equal([]string{"vram-heavy", "120b"}))
 		})
 	})
+
+	// When templating is delegated to the backend (use_tokenizer_template),
+	// the backend also owns tool-call grammar generation and parsing. A
+	// LocalAI-generated grammar sent alongside would override the backend's
+	// native (name-first) tool pipeline and make it stream the tool-call JSON
+	// back as plain content (issue #10052). SetDefaults must therefore couple
+	// the two: tokenizer template implies grammar generation is disabled.
+	Context("use_tokenizer_template couples with grammar disable (issue #10052)", func() {
+		It("disables Go grammar generation when the tokenizer template is used", func() {
+			cfg := &ModelConfig{
+				TemplateConfig: TemplateConfig{UseTokenizerTemplate: true},
+			}
+			Expect(cfg.FunctionsConfig.GrammarConfig.NoGrammar).To(BeFalse())
+
+			cfg.SetDefaults()
+
+			Expect(cfg.FunctionsConfig.GrammarConfig.NoGrammar).To(BeTrue(),
+				"use_tokenizer_template must imply grammar.disable so tools go to the backend's native pipeline")
+		})
+
+		It("leaves grammar generation enabled when the tokenizer template is not used", func() {
+			cfg := &ModelConfig{}
+
+			cfg.SetDefaults()
+
+			Expect(cfg.FunctionsConfig.GrammarConfig.NoGrammar).To(BeFalse(),
+				"models that template in Go still rely on the Go-generated grammar")
+		})
+	})
 })
--- a/core/gallery/importers/importers.go
+++ b/core/gallery/importers/importers.go
@@ -115,6 +115,10 @@ var defaultImporters = []Importer{
 	&NemoImporter{},
 	&FasterWhisperImporter{},
 	&QwenASRImporter{},
+	// ParakeetCppImporter matches only parakeet GGUFs (<arch>-<size>-<quant>.gguf);
+	// kept ahead of LlamaCPPImporter so its .gguf bundles aren't claimed by the
+	// generic GGUF importer.
+	&ParakeetCppImporter{},
 	// TTS (Batch 2)
 	&PiperImporter{},
 	&BarkImporter{},
--- a/core/gallery/importers/parakeet-cpp.go
+++ b/core/gallery/importers/parakeet-cpp.go
@@ -0,0 +1,180 @@
+package importers
+
+import (
+	"encoding/json"
+	"path/filepath"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/gallery"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/downloader"
+	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
+	"go.yaml.in/yaml/v2"
+)
+
+var _ Importer = &ParakeetCppImporter{}
+
+// ParakeetCppImporter recognises parakeet.cpp GGUF weights, the C++/ggml port
+// of NVIDIA NeMo Parakeet. The signal is narrow on purpose: parakeet.cpp names
+// its weights "<arch>-<size>-<quant>.gguf" (e.g. tdt_ctc-110m-f16.gguf,
+// rnnt-0.6b-q4_k.gguf, realtime_eou_120m-v1-q8_0.gguf), so we only match a
+// .gguf whose name carries a parakeet architecture token. That keeps us from
+// claiming arbitrary llama-style GGUFs (the importer is registered before
+// llama-cpp), and it deliberately does NOT match the upstream nvidia/parakeet-*
+// NeMo repos (which ship .nemo checkpoints, not runnable GGUFs).
+// preferences.backend="parakeet-cpp" forces the importer regardless.
+type ParakeetCppImporter struct{}
+
+func (i *ParakeetCppImporter) Name() string      { return "parakeet-cpp" }
+func (i *ParakeetCppImporter) Modality() string  { return "asr" }
+func (i *ParakeetCppImporter) AutoDetects() bool { return true }
+
+func (i *ParakeetCppImporter) Match(details Details) bool {
+	preferences, err := details.Preferences.MarshalJSON()
+	if err != nil {
+		return false
+	}
+	preferencesMap := make(map[string]any)
+	if len(preferences) > 0 {
+		if err := json.Unmarshal(preferences, &preferencesMap); err != nil {
+			return false
+		}
+	}
+
+	if b, ok := preferencesMap["backend"].(string); ok && b == "parakeet-cpp" {
+		return true
+	}
+
+	// Direct URL or path to a parakeet GGUF.
+	if isParakeetGGUF(filepath.Base(details.URI)) {
+		return true
+	}
+
+	// HF repo shipping at least one parakeet GGUF.
+	if details.HuggingFace != nil {
+		for _, f := range details.HuggingFace.Files {
+			if isParakeetGGUF(filepath.Base(f.Path)) {
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+func (i *ParakeetCppImporter) Import(details Details) (gallery.ModelConfig, error) {
+	preferences, err := details.Preferences.MarshalJSON()
+	if err != nil {
+		return gallery.ModelConfig{}, err
+	}
+	preferencesMap := make(map[string]any)
+	if len(preferences) > 0 {
+		if err := json.Unmarshal(preferences, &preferencesMap); err != nil {
+			return gallery.ModelConfig{}, err
+		}
+	}
+
+	name, ok := preferencesMap["name"].(string)
+	if !ok {
+		name = filepath.Base(details.URI)
+	}
+
+	description, ok := preferencesMap["description"].(string)
+	if !ok {
+		description = "Imported from " + details.URI
+	}
+
+	// parakeet quants are near-lossless even at Q4_K (WER 0.0 vs NeMo on 110m),
+	// so default to the smallest, then fall back up the size ladder; the last
+	// file wins if none match (mirrors whisper / llama-cpp).
+	preferredQuants, _ := preferencesMap["quantizations"].(string)
+	quants := []string{"q4_k", "q5_k", "q6_k", "q8_0", "f16"}
+	if preferredQuants != "" {
+		quants = strings.Split(preferredQuants, ",")
+	}
+
+	cfg := gallery.ModelConfig{
+		Name:        name,
+		Description: description,
+	}
+
+	modelConfig := config.ModelConfig{
+		Name:                name,
+		Description:         description,
+		Backend:             "parakeet-cpp",
+		KnownUsecaseStrings: []string{"transcript"},
+	}
+
+	uri := downloader.URI(details.URI)
+	directGGUF := isParakeetGGUF(filepath.Base(details.URI))
+	switch {
+	case uri.LooksLikeURL() && directGGUF:
+		// Direct file URL (e.g. .../resolve/main/tdt_ctc-110m-f16.gguf). The
+		// exact file is known, no quant pick.
+		fileName, err := uri.FilenameFromUrl()
+		if err != nil {
+			return gallery.ModelConfig{}, err
+		}
+		target := filepath.Join("parakeet-cpp", "models", name, fileName)
+		cfg.Files = append(cfg.Files, gallery.File{
+			URI:      details.URI,
+			Filename: target,
+		})
+		modelConfig.PredictionOptions = schema.PredictionOptions{
+			BasicModelRequest: schema.BasicModelRequest{Model: target},
+		}
+	case details.HuggingFace != nil:
+		// HF repo: collect every parakeet GGUF, pick the preferred quant, and
+		// nest under parakeet-cpp/models/<name>/ so a multi-quant repo doesn't
+		// collide on disk.
+		var ggufFiles []hfapi.ModelFile
+		for _, f := range details.HuggingFace.Files {
+			if isParakeetGGUF(filepath.Base(f.Path)) {
+				ggufFiles = append(ggufFiles, f)
+			}
+		}
+		if chosen, ok := pickPreferredGGMLFile(ggufFiles, quants); ok {
+			target := filepath.Join("parakeet-cpp", "models", name, filepath.Base(chosen.Path))
+			cfg.Files = append(cfg.Files, gallery.File{
+				URI:      chosen.URL,
+				Filename: target,
+				SHA256:   chosen.SHA256,
+			})
+			modelConfig.PredictionOptions = schema.PredictionOptions{
+				BasicModelRequest: schema.BasicModelRequest{Model: target},
+			}
+		}
+	default:
+		// Bare URI with no HF metadata (pref-only path): point at the basename
+		// so users can tweak the YAML after import.
+		modelConfig.PredictionOptions = schema.PredictionOptions{
+			BasicModelRequest: schema.BasicModelRequest{Model: filepath.Base(details.URI)},
+		}
+	}
+
+	data, err := yaml.Marshal(modelConfig)
+	if err != nil {
+		return gallery.ModelConfig{}, err
+	}
+	cfg.ConfigFile = string(data)
+
+	return cfg, nil
+}
+
+// isParakeetGGUF reports whether name is a parakeet.cpp GGUF: a .gguf file
+// whose name contains a parakeet architecture token. Both checks run on the
+// lowercased name, so matching is case-insensitive; the tokens cover the
+// published naming (<arch>-<size>-<quant>.gguf) plus a generic "parakeet" fallback.
+func isParakeetGGUF(name string) bool {
+	lower := strings.ToLower(name)
+	if !strings.HasSuffix(lower, ".gguf") {
+		return false
+	}
+	for _, tok := range []string{"tdt_ctc", "tdt-", "tdt_", "rnnt", "ctc-", "ctc_", "realtime_eou", "parakeet"} {
+		if strings.Contains(lower, tok) {
+			return true
+		}
+	}
+	return false
+}
--- a/core/gallery/importers/parakeet-cpp_test.go
+++ b/core/gallery/importers/parakeet-cpp_test.go
@@ -0,0 +1,103 @@
+package importers_test
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/gallery/importers"
+	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// parakeetDetails builds Details carrying a synthetic HF file list so detection
+// can be exercised without hitting the network.
+func parakeetDetails(uri string, prefs string, files ...hfapi.ModelFile) importers.Details {
+	return importers.Details{
+		URI:         uri,
+		Preferences: json.RawMessage(prefs),
+		HuggingFace: &hfapi.ModelDetails{Files: files},
+	}
+}
+
+var _ = Describe("ParakeetCppImporter", func() {
+	imp := &importers.ParakeetCppImporter{}
+
+	Context("Importer interface metadata", func() {
+		It("exposes name/modality/autodetect", func() {
+			Expect(imp.Name()).To(Equal("parakeet-cpp"))
+			Expect(imp.Modality()).To(Equal("asr"))
+			Expect(imp.AutoDetects()).To(BeTrue())
+		})
+	})
+
+	Context("detection (Match)", func() {
+		It("matches an HF repo shipping a parakeet GGUF", func() {
+			d := parakeetDetails("huggingface://mudler/parakeet-cpp-gguf", `{}`,
+				hfapi.ModelFile{Path: "tdt_ctc-110m-f16.gguf"},
+				hfapi.ModelFile{Path: "README.md"},
+			)
+			Expect(imp.Match(d)).To(BeTrue())
+		})
+
+		It("matches a direct URL to a parakeet GGUF", func() {
+			d := parakeetDetails("https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/rnnt-0.6b-q4_k.gguf", `{}`)
+			Expect(imp.Match(d)).To(BeTrue())
+		})
+
+		It("honours preferences.backend=parakeet-cpp for arbitrary URIs", func() {
+			d := parakeetDetails("https://example.com/whatever", `{"backend": "parakeet-cpp"}`)
+			Expect(imp.Match(d)).To(BeTrue())
+		})
+
+		It("does NOT claim a generic llama-style GGUF", func() {
+			d := parakeetDetails("huggingface://someorg/some-llm-gguf", `{}`,
+				hfapi.ModelFile{Path: "llama-3-8b-instruct-q4_k_m.gguf"},
+			)
+			Expect(imp.Match(d)).To(BeFalse())
+		})
+
+		It("does NOT claim the upstream NeMo repo (.nemo, no GGUF)", func() {
+			d := parakeetDetails("huggingface://nvidia/parakeet-tdt_ctc-110m", `{}`,
+				hfapi.ModelFile{Path: "parakeet-tdt_ctc-110m.nemo"},
+			)
+			Expect(imp.Match(d)).To(BeFalse())
+		})
+	})
+
+	Context("import (Import)", func() {
+		It("picks the default quant (q4_k) from a multi-quant HF repo", func() {
+			d := parakeetDetails("huggingface://mudler/parakeet-cpp-gguf", `{"name":"parakeet-110m"}`,
+				hfapi.ModelFile{Path: "tdt_ctc-110m-f16.gguf", URL: "https://hf/f16", SHA256: "aaa"},
+				hfapi.ModelFile{Path: "tdt_ctc-110m-q4_k.gguf", URL: "https://hf/q4k", SHA256: "bbb"},
+				hfapi.ModelFile{Path: "tdt_ctc-110m-q8_0.gguf", URL: "https://hf/q8", SHA256: "ccc"},
+			)
+			cfg, err := imp.Import(d)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(cfg.ConfigFile).To(ContainSubstring("backend: parakeet-cpp"), fmt.Sprintf("%+v", cfg))
+			Expect(cfg.ConfigFile).To(ContainSubstring("transcript"))
+			Expect(cfg.Files).To(HaveLen(1))
+			Expect(cfg.Files[0].URI).To(Equal("https://hf/q4k"), "default quant should be q4_k")
+			Expect(cfg.Files[0].Filename).To(ContainSubstring("parakeet-cpp/models/parakeet-110m/tdt_ctc-110m-q4_k.gguf"))
+		})
+
+		It("honours a preferred quantization override", func() {
+			d := parakeetDetails("huggingface://mudler/parakeet-cpp-gguf", `{"name":"p","quantizations":"q8_0"}`,
+				hfapi.ModelFile{Path: "tdt_ctc-110m-f16.gguf", URL: "https://hf/f16"},
+				hfapi.ModelFile{Path: "tdt_ctc-110m-q8_0.gguf", URL: "https://hf/q8"},
+			)
+			cfg, err := imp.Import(d)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(cfg.Files).To(HaveLen(1))
+			Expect(cfg.Files[0].URI).To(Equal("https://hf/q8"))
+		})
+
+		It("uses the exact file for a direct GGUF URL", func() {
+			d := parakeetDetails("https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/ctc-0.6b-q5_k.gguf", `{"name":"ctc"}`)
+			cfg, err := imp.Import(d)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(cfg.Files).To(HaveLen(1))
+			Expect(cfg.Files[0].Filename).To(ContainSubstring("parakeet-cpp/models/ctc/ctc-0.6b-q5_k.gguf"))
+		})
+	})
+})
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -308,6 +308,11 @@ var _ = Describe("API test", func() {
 	var cancel context.CancelFunc
 	var tmpdir string
 	var modelDir string
+	// localAIApp captures the Application so AfterEach can synchronously
+	// stop the spawned gRPC backend processes. application.New cancels
+	// them asynchronously on context cancel, which races with test-binary
+	// exit and leaks mock-backend children to init.
+	var localAIApp *application.Application

 	commonOpts := []config.AppOption{
 		config.WithDebug(true),
@@ -736,14 +741,14 @@ parameters:
 			)
 			Expect(err).ToNot(HaveOccurred())

-			application, err := application.New(
+			localAIApp, err = application.New(
 				append(commonOpts,
 					config.WithContext(c),
 					config.WithSystemState(systemState),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
-			application.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
-			app, err = API(application)
+			localAIApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
+			app, err = API(localAIApp)
 			Expect(err).ToNot(HaveOccurred())
 			go func() {
 				if err := app.Start("127.0.0.1:9090"); err != nil && err != http.ErrServerClosed {
@@ -765,6 +770,11 @@ parameters:
 			}, "2m").ShouldNot(HaveOccurred())
 		})
 		AfterEach(func() {
+			// Synchronous shutdown — context-cancel cleanup is async and races
+			// test-binary exit, orphaning mock-backend children to init.
+			if localAIApp != nil {
+				_ = localAIApp.Shutdown()
+			}
 			cancel()
 			if app != nil {
 				ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
@@ -976,15 +986,15 @@ parameters:
 			)
 			Expect(err).ToNot(HaveOccurred())

-			application, err := application.New(
+			localAIApp, err = application.New(
 				append(commonOpts,
 					config.WithContext(c),
 					config.WithSystemState(systemState),
 					config.WithConfigFile(configFile))...,
 			)
 			Expect(err).ToNot(HaveOccurred())
-			application.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
-			app, err = API(application)
+			localAIApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
+			app, err = API(localAIApp)
 			Expect(err).ToNot(HaveOccurred())

 			go func() {
@@ -1005,6 +1015,11 @@ parameters:
 			}, "2m").ShouldNot(HaveOccurred())
 		})
 		AfterEach(func() {
+			// Synchronous shutdown — context-cancel cleanup is async and races
+			// test-binary exit, orphaning mock-backend children to init.
+			if localAIApp != nil {
+				_ = localAIApp.Shutdown()
+			}
 			cancel()
 			if app != nil {
 				ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
--- a/core/http/auth/oauth.go
+++ b/core/http/auth/oauth.go
@@ -19,6 +19,8 @@ import (
 	"golang.org/x/oauth2"
 	githubOAuth "golang.org/x/oauth2/github"
 	"gorm.io/gorm"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // providerEntry holds the OAuth2/OIDC config for a single provider.
@@ -389,7 +391,7 @@ func fetchGitHubUserInfoAsOAuth(ctx context.Context, accessToken string) (*oauth
 }

 func fetchGitHubUserInfo(ctx context.Context, accessToken string) (*githubUserInfo, error) {
-	client := &http.Client{Timeout: 10 * time.Second}
+	client := httpclient.NewWithTimeout(10 * time.Second)

 	req, _ := http.NewRequestWithContext(ctx, "GET", "https://api.github.com/user", nil)
 	req.Header.Set("Authorization", "Bearer "+accessToken)
@@ -420,7 +422,7 @@ func fetchGitHubUserInfo(ctx context.Context, accessToken string) (*githubUserIn
 }

 func fetchGitHubPrimaryEmail(ctx context.Context, accessToken string) (string, error) {
-	client := &http.Client{Timeout: 10 * time.Second}
+	client := httpclient.NewWithTimeout(10 * time.Second)

 	req, _ := http.NewRequestWithContext(ctx, "GET", "https://api.github.com/user/emails", nil)
 	req.Header.Set("Authorization", "Bearer "+accessToken)
@@ -458,7 +460,6 @@ func fetchGitHubPrimaryEmail(ctx context.Context, accessToken string) (string, e
 	return "", fmt.Errorf("no verified email found")
 }

-
 func generateState() (string, error) {
 	b := make([]byte, 16)
 	if _, err := rand.Read(b); err != nil {
--- a/core/http/endpoints/localai/audio.go
+++ b/core/http/endpoints/localai/audio.go
@@ -11,6 +11,8 @@ import (
 	"time"

 	"github.com/labstack/echo/v4"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 	"github.com/mudler/LocalAI/pkg/utils"
 )

@@ -22,7 +24,9 @@ import (
 // decoding on the leading `data:` bytes.
 var audioDataURIPattern = regexp.MustCompile(`^data:[^,]+?;base64,`)

-var audioDownloadClient = http.Client{Timeout: 30 * time.Second}
+// Downloading user-supplied media URLs legitimately follows redirects (CDNs);
+// WithFollowRedirects still strips any credential header on a cross-host hop.
+var audioDownloadClient = httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects())

 // decodeAudioInput materialises a URL / data-URI / raw-base64 audio
 // payload to a temporary file and returns its path plus a cleanup
--- a/core/http/endpoints/localai/cors_proxy.go
+++ b/core/http/endpoints/localai/cors_proxy.go
@@ -11,9 +11,11 @@ import (
 	"time"

 	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/xlog"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/httpclient"
+	"github.com/mudler/LocalAI/pkg/utils"
 )

 // CORSProxyEndpoint proxies HTTP requests to external MCP servers,
@@ -77,7 +79,7 @@ func CORSProxyEndpoint(appConfig *config.ApplicationConfig) echo.HandlerFunc {
 				)
 			},
 		}
-		client := &http.Client{Transport: transport, Timeout: 10 * time.Minute}
+		client := httpclient.New(httpclient.WithTransport(transport), httpclient.WithTimeout(10*time.Minute))

 		xlog.Debug("CORS proxy request", "method", c.Request().Method, "target", targetURL)

--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -16,14 +16,16 @@ import (
 	"github.com/google/uuid"
 	"github.com/gorilla/websocket"
 	"github.com/labstack/echo/v4"
+	"github.com/mudler/xlog"
+	"gorm.io/gorm"
+
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/http/auth"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/galleryop"
 	"github.com/mudler/LocalAI/core/services/nodes"
-	"github.com/mudler/xlog"
-	"gorm.io/gorm"
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // nodeError builds a schema.ErrorResponse for node endpoints.
@@ -65,15 +67,15 @@ func GetNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {

 // RegisterNodeRequest is the request body for registering a new worker node.
 type RegisterNodeRequest struct {
-	Name          string `json:"name"`
-	NodeType      string `json:"node_type,omitempty"` // "backend" (default) or "agent"
-	Address       string `json:"address"`
-	HTTPAddress   string `json:"http_address,omitempty"`
-	Token         string `json:"token,omitempty"`
-	TotalVRAM     uint64 `json:"total_vram,omitempty"`
-	AvailableVRAM uint64 `json:"available_vram,omitempty"`
-	TotalRAM      uint64 `json:"total_ram,omitempty"`
-	AvailableRAM  uint64 `json:"available_ram,omitempty"`
+	Name          string            `json:"name"`
+	NodeType      string            `json:"node_type,omitempty"` // "backend" (default) or "agent"
+	Address       string            `json:"address"`
+	HTTPAddress   string            `json:"http_address,omitempty"`
+	Token         string            `json:"token,omitempty"`
+	TotalVRAM     uint64            `json:"total_vram,omitempty"`
+	AvailableVRAM uint64            `json:"available_vram,omitempty"`
+	TotalRAM      uint64            `json:"total_ram,omitempty"`
+	AvailableRAM  uint64            `json:"available_ram,omitempty"`
 	GPUVendor     string            `json:"gpu_vendor,omitempty"`
 	Labels        map[string]string `json:"labels,omitempty"`
 	// MaxReplicasPerModel is the per-node cap on replicas of any single model.
@@ -983,6 +985,6 @@ func proxyHTTPToWorker(httpAddress, path, token string) (*http.Response, error)
 		req.Header.Set("Authorization", "Bearer "+token)
 	}

-	client := &http.Client{Timeout: 15 * time.Second}
+	client := httpclient.NewWithTimeout(15 * time.Second)
 	return client.Do(req)
 }
--- a/core/http/endpoints/localai/video.go
+++ b/core/http/endpoints/localai/video.go
@@ -6,7 +6,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
-	"net/http"
 	"net/url"
 	"os"
 	"path/filepath"
@@ -15,18 +14,23 @@ import (

 	"github.com/google/uuid"
 	"github.com/labstack/echo/v4"
+
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"

 	"github.com/mudler/LocalAI/core/backend"

+	"github.com/mudler/xlog"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
-	"github.com/mudler/xlog"
 )

-var videoDownloadClient = http.Client{Timeout: 30 * time.Second}
+// Downloading user-supplied media URLs legitimately follows redirects (CDNs);
+// WithFollowRedirects still strips any credential header on a cross-host hop.
+var videoDownloadClient = httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects())

 func downloadFile(url string) (string, error) {
 	if err := utils.ValidateExternalURL(url); err != nil {
--- a/core/http/endpoints/mcp/tools.go
+++ b/core/http/endpoints/mcp/tools.go
@@ -17,6 +17,7 @@ import (
 	"github.com/mudler/LocalAI/core/services/messaging"

 	"github.com/mudler/LocalAI/pkg/functions"
+	"github.com/mudler/LocalAI/pkg/httpclient"
 	"github.com/mudler/LocalAI/pkg/signals"

 	"github.com/modelcontextprotocol/go-sdk/mcp"
@@ -180,10 +181,10 @@ func SessionsFromMCPConfig(
 	for _, server := range remote.Servers {
 		xlog.Debug("[MCP remote server] Configuration", "server", server)
 		// Create HTTP client with custom roundtripper for bearer token injection
-		httpClient := &http.Client{
-			Timeout:   config.DefaultMCPToolTimeout,
-			Transport: newBearerTokenRoundTripper(server.Token, http.DefaultTransport),
-		}
+		httpClient := httpclient.New(
+			httpclient.WithTimeout(config.DefaultMCPToolTimeout),
+			httpclient.WithTransport(newBearerTokenRoundTripper(server.Token, httpclient.HardenedTransport())),
+		)

 		transport := &mcp.StreamableClientTransport{Endpoint: server.URL, HTTPClient: httpClient}
 		mcpSession, err := client.Connect(ctx, transport, nil)
@@ -262,10 +263,10 @@ func NamedSessionsFromMCPConfig(

 		for serverName, server := range remote.Servers {
 			xlog.Debug("[MCP remote server] Configuration", "name", serverName, "server", server)
-			httpClient := &http.Client{
-				Timeout:   config.DefaultMCPToolTimeout,
-				Transport: newBearerTokenRoundTripper(server.Token, http.DefaultTransport),
-			}
+			httpClient := httpclient.New(
+				httpclient.WithTimeout(config.DefaultMCPToolTimeout),
+				httpclient.WithTransport(newBearerTokenRoundTripper(server.Token, httpclient.HardenedTransport())),
+			)

 			transport := &mcp.StreamableClientTransport{Endpoint: server.URL, HTTPClient: httpClient}
 			mcpSession, err := client.Connect(ctx, transport, nil)
--- a/core/http/endpoints/openai/chat_stream_workers.go
+++ b/core/http/endpoints/openai/chat_stream_workers.go
@@ -341,6 +341,19 @@ func processStreamWithTools(
 			}
 		}

+		// Issue #9722: when the C++ autoparser is already producing tool
+		// calls (it delivers them via ChatDeltas, which are flushed at
+		// end-of-stream by ToolCallsFromChatDeltas -> buildDeferredToolCallChunks),
+		// skip the Go-side iterative parser below. Running both parsers makes
+		// the same logical tool call surface at multiple `index` values.
+		// The deferred flush is guarded by lastEmittedCount, so the race where
+		// the Go parser already emitted before this flag flipped also stays
+		// single-emission. Backends without an autoparser (e.g. vLLM) keep
+		// hasChatDeltaToolCalls=false and are unaffected.
+		if hasChatDeltaToolCalls {
+			return true
+		}
+
 		// Try incremental XML parsing for streaming support using iterative parser
 		// This allows emitting partial tool calls as they're being generated
 		cleanedResult := functions.CleanupLLMResult(result, cfg.FunctionsConfig)
--- a/core/http/endpoints/openai/image.go
+++ b/core/http/endpoints/openai/image.go
@@ -6,7 +6,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
-	"net/http"
 	"net/url"
 	"os"
 	"path/filepath"
@@ -16,15 +15,18 @@ import (

 	"github.com/google/uuid"
 	"github.com/labstack/echo/v4"
+
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"

 	"github.com/mudler/LocalAI/core/backend"

+	"github.com/mudler/xlog"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
-	"github.com/mudler/xlog"
 )

 func downloadFile(url string) (string, error) {
@@ -33,7 +35,7 @@ func downloadFile(url string) (string, error) {
 	}

 	// Get the data
-	resp, err := http.Get(url)
+	resp, err := httpclient.New(httpclient.WithFollowRedirects()).Get(url)
 	if err != nil {
 		return "", err
 	}
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -310,6 +310,26 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema.
 		config.Temperature = input.Temperature
 	}

+	// Map the per-request reasoning_effort onto the reasoning toggle the
+	// backend reads (enable_thinking metadata, set in gRPCPredictOpts).
+	// "none" disables thinking for this request - the use case from #10072,
+	// running a single Qwen3-style model and turning reasoning off per
+	// request. Any explicit effort level enables thinking, UNLESS the model
+	// config explicitly disabled it (DisableReasoning==true wins): an
+	// operator who deliberately turned reasoning off should not be overridden
+	// by a request. A value of "none" always disables, since that never
+	// conflicts with a config that also disables. Any other value is a no-op.
+	switch strings.ToLower(input.ReasoningEffort) {
+	case "none":
+		disabled := true
+		config.ReasoningConfig.DisableReasoning = &disabled
+	case "minimal", "low", "medium", "high":
+		if cur := config.ReasoningConfig.DisableReasoning; cur == nil || !*cur {
+			disabled := false
+			config.ReasoningConfig.DisableReasoning = &disabled
+		}
+	}
+
 	// Collapse the modern max_completion_tokens alias into the
 	// legacy Maxtokens field so downstream code reads exactly one.
 	// MaxCompletionTokens wins on conflict — it's the canonical
--- a/core/http/middleware/request_test.go
+++ b/core/http/middleware/request_test.go
@@ -597,3 +597,137 @@ var _ = Describe("SetModelAndConfig tool_choice parsing (chat completions)", fun
 		})
 	})
 })
+
+// These tests cover the per-request reasoning_effort -> enable_thinking mapping.
+// The merge lives in mergeOpenAIRequestAndModelConfig (called from
+// SetOpenAIRequest), so they drive the full middleware chain like the
+// production /v1/chat/completions route does. The block builds its own app per
+// test so the model config can be varied (some cases need reasoning.disable set
+// in the model YAML to assert that an explicit config disable wins).
+//
+// Mapping under test (issue #10072):
+//   - reasoning_effort=none                    -> DisableReasoning=true
+//   - reasoning_effort=minimal/low/medium/high -> DisableReasoning=false, UNLESS the
+//     model config explicitly set true
+//   - empty / unrecognized                     -> no change
+var _ = Describe("SetModelAndConfig reasoning_effort parsing (chat completions)", func() {
+	var modelDir string
+
+	BeforeEach(func() {
+		var err error
+		modelDir, err = os.MkdirTemp("", "localai-test-models-*")
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(modelDir)
+	})
+
+	// buildApp writes a model config with the given YAML body and returns an app
+	// plus a pointer to the captured per-request config.
+	buildApp := func(cfgYAML string) (*echo.Echo, **config.ModelConfig) {
+		Expect(os.WriteFile(filepath.Join(modelDir, "test-model.yaml"), []byte(cfgYAML), 0644)).To(Succeed())
+
+		ss := &system.SystemState{Model: system.Model{ModelsPath: modelDir}}
+		appConfig := config.NewApplicationConfig()
+		appConfig.SystemState = ss
+		mcl := config.NewModelConfigLoader(modelDir)
+		ml := model.NewModelLoader(ss)
+		re := NewRequestExtractor(mcl, ml, appConfig)
+
+		captured := new(*config.ModelConfig)
+		app := echo.New()
+		app.POST("/v1/chat/completions",
+			func(c echo.Context) error {
+				if cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig); ok {
+					*captured = cfg
+				}
+				return c.String(http.StatusOK, "ok")
+			},
+			re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+			func(next echo.HandlerFunc) echo.HandlerFunc {
+				return func(c echo.Context) error {
+					if err := re.SetOpenAIRequest(c); err != nil {
+						return err
+					}
+					return next(c)
+				}
+			},
+		)
+		return app, captured
+	}
+
+	// chatReq builds a chat-completions JSON body addressed to test-model.
+	// effort is spliced in verbatim as the reasoning_effort value, so callers
+	// must pass an already JSON-encoded token (e.g. `"none"`, quotes included).
+	chatReq := func(effort string) string {
+		return `{"model":"test-model",` +
+			`"messages":[{"role":"user","content":"hi"}],` +
+			`"reasoning_effort":` + effort + `}`
+	}
+
+	plainCfg := "name: test-model\nbackend: llama-cpp\n"
+
+	It("disables thinking for reasoning_effort=none", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"none"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("enables thinking for reasoning_effort=high when config is unset", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+
+	It("enables thinking for reasoning_effort=high when config explicitly set false", func() {
+		app, captured := buildApp(plainCfg + "reasoning:\n  disable: false\n")
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+
+	It("config wins: reasoning_effort=high cannot re-enable when config explicitly disabled", func() {
+		app, captured := buildApp(plainCfg + "reasoning:\n  disable: true\n")
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("is a no-op when reasoning_effort is empty", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions",
+			`{"model":"test-model","messages":[{"role":"user","content":"hi"}]}`)
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).To(BeNil())
+	})
+
+	It("is case-insensitive (None disables, HIGH enables)", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"None"`))
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+
+		app2, captured2 := buildApp(plainCfg)
+		rec2 := postJSON(app2, "/v1/chat/completions", chatReq(`"HIGH"`))
+		Expect(rec2.Code).To(Equal(http.StatusOK))
+		Expect(*captured2).ToNot(BeNil())
+		Expect((*captured2).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured2).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+})
--- a/core/http/react-ui/e2e/models-gallery.spec.js
+++ b/core/http/react-ui/e2e/models-gallery.spec.js
@@ -1,28 +1,52 @@
-import { test, expect } from './coverage-fixtures.js'
+import { test, expect } from "./coverage-fixtures.js";

 const MOCK_MODELS_RESPONSE = {
  models: [
-    { name: 'llama-model', description: 'A llama model', backend: 'llama-cpp', installed: false, tags: ['chat'] },
-    { name: 'whisper-model', description: 'A whisper model', backend: 'whisper', installed: true, tags: ['transcript'] },
-    { name: 'stablediffusion-model', description: 'An image model', backend: 'stablediffusion', installed: false, tags: ['sd'] },
-    { name: 'unknown-model', description: 'No backend', backend: '', installed: false, tags: [] },
+    {
+      name: "llama-model",
+      description: "A llama model",
+      backend: "llama-cpp",
+      installed: false,
+      tags: ["chat"],
+    },
+    {
+      name: "whisper-model",
+      description: "A whisper model",
+      backend: "whisper",
+      installed: true,
+      tags: ["transcript"],
+    },
+    {
+      name: "stablediffusion-model",
+      description: "An image model",
+      backend: "stablediffusion",
+      installed: false,
+      tags: ["sd"],
+    },
+    {
+      name: "unknown-model",
+      description: "No backend",
+      backend: "",
+      installed: false,
+      tags: [],
+    },
  ],
-  allBackends: ['llama-cpp', 'stablediffusion', 'whisper'],
-  allTags: ['chat', 'sd', 'transcript'],
+  allBackends: ["llama-cpp", "stablediffusion", "whisper"],
+  allTags: ["chat", "sd", "transcript"],
  availableModels: 4,
  installedModels: 1,
  totalPages: 1,
  currentPage: 1,
-}
+};

 const MOCK_GPU_RESOURCES_RESPONSE = {
-  type: 'gpu',
+  type: "gpu",
  available: true,
  gpus: [
    {
      index: 0,
-      name: 'Mock GPU',
-      vendor: 'nvidia',
+      name: "Mock GPU",
+      vendor: "nvidia",
      total_vram: 12 * 1024 * 1024 * 1024,
      used_vram: 2 * 1024 * 1024 * 1024,
      free_vram: 10 * 1024 * 1024 * 1024,
@@ -36,272 +60,374 @@ const MOCK_GPU_RESOURCES_RESPONSE = {
    usage_percent: 16.7,
    gpu_count: 1,
  },
-}
+};

 const MOCK_ESTIMATES = {
-  'llama-model': {
+  "llama-model": {
    sizeBytes: 4 * 1024 * 1024 * 1024,
-    sizeDisplay: '4.00 GB',
+    sizeDisplay: "4.00 GB",
    estimates: {
-      '8192': {
+      8192: {
        vramBytes: 8 * 1024 * 1024 * 1024,
-        vramDisplay: '8.00 GB',
+        vramDisplay: "8.00 GB",
      },
    },
  },
-  'whisper-model': {
+  "whisper-model": {
    sizeBytes: 1 * 1024 * 1024 * 1024,
-    sizeDisplay: '1.00 GB',
+    sizeDisplay: "1.00 GB",
    estimates: {
-      '8192': {
+      8192: {
        vramBytes: 2 * 1024 * 1024 * 1024,
-        vramDisplay: '2.00 GB',
+        vramDisplay: "2.00 GB",
      },
    },
  },
-  'stablediffusion-model': {
+  "stablediffusion-model": {
    sizeBytes: 8 * 1024 * 1024 * 1024,
-    sizeDisplay: '8.00 GB',
+    sizeDisplay: "8.00 GB",
    estimates: {
-      '8192': {
+      8192: {
        vramBytes: 16 * 1024 * 1024 * 1024,
-        vramDisplay: '16.00 GB',
+        vramDisplay: "16.00 GB",
      },
    },
  },
-}
+};

-test.describe('Models Gallery - Backend Features', () => {
+test.describe("Models Gallery - Backend Features", () => {
  test.beforeEach(async ({ page }) => {
-    await page.route('**/api/models*', (route) => {
+    await page.route("**/api/models*", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_MODELS_RESPONSE),
-      })
-    })
-    await page.goto('/app/models')
+      });
+    });
+    await page.goto("/app/models");
    // Wait for the table to render
-    await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible({ timeout: 10_000 })
-  })
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible({
+      timeout: 10_000,
+    });
+  });

-  test('backend column header is visible', async ({ page }) => {
-    await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible()
-  })
+  test("backend column header is visible", async ({ page }) => {
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible();
+  });

-  test('backend badges shown in table rows', async ({ page }) => {
-    const table = page.locator('table')
-    await expect(table.locator('.badge', { hasText: 'llama-cpp' })).toBeVisible()
-    await expect(table.locator('.badge', { hasText: /^whisper$/ })).toBeVisible()
-  })
+  test("backend badges shown in table rows", async ({ page }) => {
+    const table = page.locator("table");
+    await expect(
+      table.locator(".badge", { hasText: "llama-cpp" }),
+    ).toBeVisible();
+    await expect(
+      table.locator(".badge", { hasText: /^whisper$/ }),
+    ).toBeVisible();
+  });

-  test('backend dropdown is visible', async ({ page }) => {
-    await expect(page.locator('button', { hasText: 'All Backends' })).toBeVisible()
-  })
+  test("backend dropdown is visible", async ({ page }) => {
+    await expect(
+      page.locator("button", { hasText: "All Backends" }),
+    ).toBeVisible();
+  });

-  test('clicking backend dropdown opens searchable panel', async ({ page }) => {
-    await page.locator('button', { hasText: 'All Backends' }).click()
-    await expect(page.locator('input[placeholder="Search backends..."]')).toBeVisible()
-  })
+  test("clicking backend dropdown opens searchable panel", async ({ page }) => {
+    await page.locator("button", { hasText: "All Backends" }).click();
+    await expect(
+      page.locator('input[placeholder="Search backends..."]'),
+    ).toBeVisible();
+  });

-  test('typing in search filters dropdown options', async ({ page }) => {
-    await page.locator('button', { hasText: 'All Backends' }).click()
-    const searchInput = page.locator('input[placeholder="Search backends..."]')
-    await searchInput.fill('llama')
+  test("typing in search filters dropdown options", async ({ page }) => {
+    await page.locator("button", { hasText: "All Backends" }).click();
+    const searchInput = page.locator('input[placeholder="Search backends..."]');
+    await searchInput.fill("llama");

    // llama-cpp option should be visible, whisper should not
-    const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..')  .locator('..')
-    await expect(dropdown.locator('text=llama-cpp')).toBeVisible()
-    await expect(dropdown.locator('text=whisper')).not.toBeVisible()
-  })
+    const dropdown = page
+      .locator('input[placeholder="Search backends..."]')
+      .locator("..")
+      .locator("..");
+    await expect(dropdown.locator("text=llama-cpp")).toBeVisible();
+    await expect(dropdown.locator("text=whisper")).not.toBeVisible();
+  });

-  test('selecting a backend updates the dropdown label', async ({ page }) => {
-    await page.locator('button', { hasText: 'All Backends' }).click()
+  test("selecting a backend updates the dropdown label", async ({ page }) => {
+    await page.locator("button", { hasText: "All Backends" }).click();
    // Click the llama-cpp option within the dropdown (not the table badge)
-    const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
-    await dropdown.locator('text=llama-cpp').click()
+    const dropdown = page
+      .locator('input[placeholder="Search backends..."]')
+      .locator("..")
+      .locator("..");
+    await dropdown.locator("text=llama-cpp").click();

    // The dropdown button should now show the selected backend instead of "All Backends"
-    await expect(page.locator('button span', { hasText: 'llama-cpp' })).toBeVisible()
-  })
+    await expect(
+      page.locator("button span", { hasText: "llama-cpp" }),
+    ).toBeVisible();
+  });

-  test('expanded row shows backend in detail', async ({ page }) => {
+  test("expanded row shows backend in detail", async ({ page }) => {
    // Click the first model row to expand it
-    await page.locator('tr', { hasText: 'llama-model' }).click()
+    await page.locator("tr", { hasText: "llama-model" }).click();

    // The detail view should show Backend label and value
-    const detail = page.locator('td[colspan="8"]')
-    await expect(detail.locator('text=Backend')).toBeVisible()
-    await expect(detail.locator('text=llama-cpp')).toBeVisible()
-  })
-})
+    const detail = page.locator('td[colspan="8"]');
+    await expect(detail.locator("text=Backend")).toBeVisible();
+    await expect(detail.locator("text=llama-cpp")).toBeVisible();
+  });
+});

 const BACKEND_USECASES_MOCK = {
-  'llama-cpp': ['chat', 'embeddings', 'vision'],
-  'whisper': ['transcript'],
-  'stablediffusion': ['image'],
-}
+  "llama-cpp": ["chat", "embeddings", "vision"],
+  whisper: ["transcript"],
+  stablediffusion: ["image"],
+};

-test.describe('Models Gallery - Multi-select Filters', () => {
+// Variant of MOCK_MODELS_RESPONSE with an empty model list, used to drive the
+// gallery into its empty state. Fields not overridden here (allBackends,
+// allTags, installedModels) are inherited from the spread.
+const EMPTY_FILTERED_RESPONSE = {
+  ...MOCK_MODELS_RESPONSE,
+  models: [],
+  availableModels: 0,
+  totalPages: 1,
+  currentPage: 1,
+};
+
+test.describe("Models Gallery - Multi-select Filters", () => {
  test.beforeEach(async ({ page }) => {
-    await page.route('**/api/models*', (route) => {
+    await page.route("**/api/models*", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_MODELS_RESPONSE),
-      })
-    })
-    await page.route('**/api/backends/usecases', (route) => {
+      });
+    });
+    await page.route("**/api/backends/usecases", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(BACKEND_USECASES_MOCK),
-      })
-    })
-    await page.goto('/app/models')
-    await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible({ timeout: 10_000 })
-  })
+      });
+    });
+    await page.goto("/app/models");
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible({
+      timeout: 10_000,
+    });
+  });

-  test('multi-select toggle: click Chat, TTS, then Chat again', async ({ page }) => {
-    const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
-    const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
+  test("multi-select toggle: click Chat, TTS, then Chat again", async ({
+    page,
+  }) => {
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const ttsBtn = page.locator(".filter-btn", { hasText: "TTS" });

-    await chatBtn.click()
-    await expect(chatBtn).toHaveClass(/active/)
+    await chatBtn.click();
+    await expect(chatBtn).toHaveClass(/active/);

-    await ttsBtn.click()
-    await expect(chatBtn).toHaveClass(/active/)
-    await expect(ttsBtn).toHaveClass(/active/)
+    await ttsBtn.click();
+    await expect(chatBtn).toHaveClass(/active/);
+    await expect(ttsBtn).toHaveClass(/active/);

    // Click Chat again to deselect it
-    await chatBtn.click()
-    await expect(chatBtn).not.toHaveClass(/active/)
-    await expect(ttsBtn).toHaveClass(/active/)
-  })
+    await chatBtn.click();
+    await expect(chatBtn).not.toHaveClass(/active/);
+    await expect(ttsBtn).toHaveClass(/active/);
+  });

  test('"All" clears selection', async ({ page }) => {
-    const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
-    const allBtn = page.locator('.filter-btn', { hasText: 'All' })
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const allBtn = page.locator(".filter-btn", { hasText: "All" });

-    await chatBtn.click()
-    await expect(chatBtn).toHaveClass(/active/)
+    await chatBtn.click();
+    await expect(chatBtn).toHaveClass(/active/);

-    await allBtn.click()
-    await expect(allBtn).toHaveClass(/active/)
-    await expect(chatBtn).not.toHaveClass(/active/)
-  })
+    await allBtn.click();
+    await expect(allBtn).toHaveClass(/active/);
+    await expect(chatBtn).not.toHaveClass(/active/);
+  });

-  test('query param sent correctly with multiple filters', async ({ page }) => {
-    const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
-    const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
+  test("query param sent correctly with multiple filters", async ({ page }) => {
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const ttsBtn = page.locator(".filter-btn", { hasText: "TTS" });

    // Click Chat and wait for its request to settle
-    await chatBtn.click()
-    await page.waitForResponse(resp => resp.url().includes('/api/models'))
+    await chatBtn.click();
+    await page.waitForResponse((resp) => resp.url().includes("/api/models"));

    // Now click TTS and capture the resulting request
    const [request] = await Promise.all([
-      page.waitForRequest(req => {
-        if (!req.url().includes('/api/models')) return false
-        const u = new URL(req.url())
-        const tag = u.searchParams.get('tag')
-        return tag && tag.split(',').length >= 2
+      page.waitForRequest((req) => {
+        if (!req.url().includes("/api/models")) return false;
+        const u = new URL(req.url());
+        const tag = u.searchParams.get("tag");
+        return tag && tag.split(",").length >= 2;
      }),
      ttsBtn.click(),
-    ])
+    ]);

-    const url = new URL(request.url())
-    const tags = url.searchParams.get('tag').split(',').sort()
-    expect(tags).toEqual(['chat', 'tts'])
-  })
+    const url = new URL(request.url());
+    const tags = url.searchParams.get("tag").split(",").sort();
+    expect(tags).toEqual(["chat", "tts"]);
+  });

-  test('backend greys out unavailable filters', async ({ page }) => {
+  test("backend greys out unavailable filters", async ({ page }) => {
    // Select llama-cpp backend via dropdown
-    await page.locator('button', { hasText: 'All Backends' }).click()
-    const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
-    await dropdown.locator('text=llama-cpp').click()
+    await page.locator("button", { hasText: "All Backends" }).click();
+    const dropdown = page
+      .locator('input[placeholder="Search backends..."]')
+      .locator("..")
+      .locator("..");
+    await dropdown.locator("text=llama-cpp").click();

    // Wait for filter state to update
-    const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
-    const sttBtn = page.locator('.filter-btn', { hasText: 'STT' })
-    const imageBtn = page.locator('.filter-btn', { hasText: 'Image' })
+    const ttsBtn = page.locator(".filter-btn", { hasText: "TTS" });
+    const sttBtn = page.locator(".filter-btn", { hasText: "STT" });
+    const imageBtn = page.locator(".filter-btn", { hasText: "Image" });

    // TTS, STT, Image should be disabled for llama-cpp
-    await expect(ttsBtn).toBeDisabled()
-    await expect(sttBtn).toBeDisabled()
-    await expect(imageBtn).toBeDisabled()
+    await expect(ttsBtn).toBeDisabled();
+    await expect(sttBtn).toBeDisabled();
+    await expect(imageBtn).toBeDisabled();

    // Chat, Embeddings, Vision should remain enabled
-    const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
-    const embBtn = page.locator('.filter-btn', { hasText: 'Embeddings' })
-    const visBtn = page.locator('.filter-btn', { hasText: 'Vision' })
-    await expect(chatBtn).toBeEnabled()
-    await expect(embBtn).toBeEnabled()
-    await expect(visBtn).toBeEnabled()
-  })
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const embBtn = page.locator(".filter-btn", { hasText: "Embeddings" });
+    const visBtn = page.locator(".filter-btn", { hasText: "Vision" });
+    await expect(chatBtn).toBeEnabled();
+    await expect(embBtn).toBeEnabled();
+    await expect(visBtn).toBeEnabled();
+  });

-  test('backend clears incompatible filters', async ({ page }) => {
+  test("backend clears incompatible filters", async ({ page }) => {
    // Select TTS filter first
-    const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
-    await ttsBtn.click()
-    await expect(ttsBtn).toHaveClass(/active/)
+    const ttsBtn = page.locator(".filter-btn", { hasText: "TTS" });
+    await ttsBtn.click();
+    await expect(ttsBtn).toHaveClass(/active/);

    // Now select llama-cpp backend (which doesn't support TTS)
-    await page.locator('button', { hasText: 'All Backends' }).click()
-    const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
-    await dropdown.locator('text=llama-cpp').click()
+    await page.locator("button", { hasText: "All Backends" }).click();
+    const dropdown = page
+      .locator('input[placeholder="Search backends..."]')
+      .locator("..")
+      .locator("..");
+    await dropdown.locator("text=llama-cpp").click();

    // TTS should be auto-removed from selection
-    await expect(ttsBtn).not.toHaveClass(/active/)
-  })
-})
+    await expect(ttsBtn).not.toHaveClass(/active/);
+  });
+});

-test.describe('Models Gallery - Fits In GPU Filter', () => {
+test.describe("Models Gallery - Fits In GPU Filter", () => {
  test.beforeEach(async ({ page }) => {
-    await page.route('**/api/models*', (route) => {
+    await page.route("**/api/models*", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_MODELS_RESPONSE),
-      })
-    })
+      });
+    });

-    await page.route('**/api/resources', (route) => {
+    await page.route("**/api/resources", (route) => {
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_GPU_RESOURCES_RESPONSE),
-      })
-    })
+      });
+    });

-    await page.route('**/api/models/estimate/*', (route) => {
-      const url = new URL(route.request().url())
-      const id = decodeURIComponent(url.pathname.split('/').pop() || '')
+    await page.route("**/api/models/estimate/*", (route) => {
+      const url = new URL(route.request().url());
+      const id = decodeURIComponent(url.pathname.split("/").pop() || "");
      route.fulfill({
-        contentType: 'application/json',
+        contentType: "application/json",
        body: JSON.stringify(MOCK_ESTIMATES[id] || {}),
-      })
-    })
+      });
+    });

-    await page.goto('/app/models')
-    await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible({ timeout: 10_000 })
-  })
+    await page.goto("/app/models");
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible({
+      timeout: 10_000,
+    });
+  });

-  test('fits toggle is visible when GPU resources are available', async ({ page }) => {
-    await expect(page.getByText('Fits in GPU')).toBeVisible()
-  })
+  test("fits toggle is visible when GPU resources are available", async ({
+    page,
+  }) => {
+    await expect(page.getByText("Fits in GPU")).toBeVisible();
+  });

-  test('enabling fits filter hides models that exceed available VRAM', async ({ page }) => {
-    await expect(page.locator('tr', { hasText: 'stablediffusion-model' })).toBeVisible()
+  test("enabling fits filter hides models that exceed available VRAM", async ({
+    page,
+  }) => {
+    await expect(
+      page.locator("tr", { hasText: "stablediffusion-model" }),
+    ).toBeVisible();

    // The shared <Toggle> visually hides its native input (opacity:0;w:0;h:0),
    // so .check() can't interact with it directly — click the visible track.
-    await page.locator('label.filter-bar-group__toggle', { hasText: 'Fits in GPU' }).locator('.toggle__track').click()
+    await page
+      .locator("label.filter-bar-group__toggle", { hasText: "Fits in GPU" })
+      .locator(".toggle__track")
+      .click();

-    await expect(page.locator('tr', { hasText: 'stablediffusion-model' })).toHaveCount(0)
-    await expect(page.locator('tr', { hasText: 'llama-model' })).toBeVisible()
+    await expect(
+      page.locator("tr", { hasText: "stablediffusion-model" }),
+    ).toHaveCount(0);
+    await expect(page.locator("tr", { hasText: "llama-model" })).toBeVisible();
    // Unknown estimate stays visible until an explicit non-fit verdict exists.
-    await expect(page.locator('tr', { hasText: 'unknown-model' })).toBeVisible()
-  })
+    await expect(
+      page.locator("tr", { hasText: "unknown-model" }),
+    ).toBeVisible();
+  });

-  test('fits filter state persists after reload', async ({ page }) => {
-    await page.locator('label.filter-bar-group__toggle', { hasText: 'Fits in GPU' }).locator('.toggle__track').click()
-    await page.reload()
-    await expect(page.getByLabel('Fits in GPU')).toBeChecked()
-  })
-})
+  test("fits filter state persists after reload", async ({ page }) => {
+    await page
+      .locator("label.filter-bar-group__toggle", { hasText: "Fits in GPU" })
+      .locator(".toggle__track")
+      .click();
+    await page.reload();
+    await expect(page.getByLabel("Fits in GPU")).toBeChecked();
+  });
+});
+
+// Covers the gallery's empty-state UI: what is rendered when a filter yields
+// no models, and that the "Clear filters" button brings the list back.
+test.describe("Models Gallery - Empty State", () => {
+  test.beforeEach(async ({ page }) => {
+    // Only a request whose `tag` query param is exactly "chat" receives the
+    // empty payload; every other request (including the initial unfiltered
+    // load, where `tag` is absent) receives the full model list.
+    await page.route("**/api/models*", (route) => {
+      const url = new URL(route.request().url());
+      const tag = url.searchParams.get("tag");
+      const body =
+        tag === "chat" ? EMPTY_FILTERED_RESPONSE : MOCK_MODELS_RESPONSE;
+
+      route.fulfill({
+        contentType: "application/json",
+        body: JSON.stringify(body),
+      });
+    });
+
+    await page.goto("/app/models");
+    await expect(page.locator("th", { hasText: "Backend" })).toBeVisible({
+      timeout: 10_000,
+    });
+  });
+
+  test("shows empty state for filtered-out results and clear filters restores the gallery", async ({
+    page,
+  }) => {
+    const chatBtn = page.locator(".filter-btn", { hasText: "Chat" });
+    const allBtn = page.locator(".filter-btn", { hasText: "All" });
+
+    // Selecting Chat triggers the mocked empty response set up in beforeEach.
+    await chatBtn.click();
+
+    await expect(page.locator(".empty-state-title")).toHaveText(
+      "No models found",
+    );
+    await expect(page.locator(".empty-state-text")).toHaveText(
+      "No models match your current search or filters.",
+    );
+
+    const clearBtn = page.getByRole("button", { name: "Clear filters" });
+    await expect(clearBtn).toBeVisible();
+    await expect(page.locator("tr", { hasText: "llama-model" })).toHaveCount(0);
+
+    await clearBtn.click();
+
+    // After clearing, the filter selection falls back to "All", the empty
+    // state is gone, and the model rows are shown again.
+    await expect(allBtn).toHaveClass(/active/);
+    await expect(chatBtn).not.toHaveClass(/active/);
+    await expect(page.locator(".empty-state")).toHaveCount(0);
+    await expect(page.locator("tr", { hasText: "llama-model" })).toBeVisible();
+  });
+});
--- a/core/services/agentpool/agent_jobs.go
+++ b/core/services/agentpool/agent_jobs.go
@@ -21,17 +21,19 @@ import (

 	"github.com/Masterminds/sprig/v3"
 	"github.com/google/uuid"
+	"github.com/mudler/cogito"
+	"github.com/mudler/cogito/clients"
+	"github.com/mudler/xlog"
+	"github.com/robfig/cron/v3"
+
 	"github.com/mudler/LocalAI/core/config"
 	mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/jobs"
 	"github.com/mudler/LocalAI/core/templates"
+	"github.com/mudler/LocalAI/pkg/httpclient"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/xsync"
-	"github.com/mudler/cogito"
-	"github.com/mudler/cogito/clients"
-	"github.com/mudler/xlog"
-	"github.com/robfig/cron/v3"
 )

 // AgentJobService manages agent tasks and job execution
@@ -647,7 +649,7 @@ func (s *AgentJobService) fetchMultimediaFromURL(url string, headers map[string]
 	}

 	// Execute request
-	client := &http.Client{Timeout: 30 * time.Second}
+	client := httpclient.NewWithTimeout(30 * time.Second)
 	resp, err := client.Do(req)
 	if err != nil {
 		return "", fmt.Errorf("failed to fetch URL: %w", err)
@@ -1249,7 +1251,7 @@ func (s *AgentJobService) sendWebhook(job schema.Job, task schema.Task, webhookC
 	}

 	// Execute with retry
-	client := &http.Client{Timeout: 30 * time.Second}
+	client := httpclient.NewWithTimeout(30 * time.Second)
 	err = s.executeWithRetry(client, req)
 	if err != nil {
 		xlog.Error("Webhook delivery failed", "error", err, "job_id", job.ID, "webhook_url", webhookConfig.URL)
--- a/core/services/agents/knowledge.go
+++ b/core/services/agents/knowledge.go
@@ -13,6 +13,8 @@ import (

 	"github.com/mudler/cogito"
 	"github.com/mudler/xlog"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // KBSearchResult represents a search result from the knowledge base.
@@ -61,7 +63,7 @@ func KBAutoSearchPrompt(ctx context.Context, apiURL, apiKey, collection, query s
 		req.Header.Set("Authorization", "Bearer "+apiKey)
 	}

-	resp, err := http.DefaultClient.Do(req)
+	resp, err := httpclient.New().Do(req)
 	if err != nil {
 		xlog.Warn("KB auto-search: request failed", "error", err)
 		return ""
@@ -181,7 +183,7 @@ func KBStoreContent(ctx context.Context, apiURL, apiKey, collection, content, us
 		req.Header.Set("Authorization", "Bearer "+apiKey)
 	}

-	resp, err := http.DefaultClient.Do(req)
+	resp, err := httpclient.New().Do(req)
 	if err != nil {
 		return fmt.Errorf("upload request failed: %w", err)
 	}
--- a/core/services/cloudproxy/mitm/handler.go
+++ b/core/services/cloudproxy/mitm/handler.go
@@ -12,12 +12,14 @@ import (
 	"sync/atomic"
 	"time"

+	"github.com/mudler/xlog"
+	"golang.org/x/net/http2"
+
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/cloudproxy/ssewire"
 	"github.com/mudler/LocalAI/core/services/routing/pii"
 	"github.com/mudler/LocalAI/core/services/routing/piiadapter"
-	"github.com/mudler/xlog"
-	"golang.org/x/net/http2"
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // PIIHandlerOptions configures NewPIIHandler.
@@ -87,7 +89,14 @@ func NewPIIHandler(opts PIIHandlerOptions) InterceptHandler {
 	}

 	d := &piiDispatcher{
-		client:        &http.Client{Transport: transport},
+		// Refuse redirects: the MITM client forwards to the real
+		// upstream over TLS, and a 3xx means the upstream (or something
+		// impersonating it) is trying to bounce the request elsewhere.
+		// Following it would replay caller headers — including provider
+		// API keys such as Anthropic's x-api-key, which Go does NOT
+		// strip on cross-host redirects — to an unvetted host. Surface
+		// it as an error (handled as a 502) instead.
+		client:        httpclient.New(httpclient.WithTransport(transport)),
 		redactor:      opts.Redactor,
 		store:         opts.EventStore,
 		patternAction: patternAction,
--- a/core/services/cloudproxy/mitm/handler_test.go
+++ b/core/services/cloudproxy/mitm/handler_test.go
@@ -123,6 +123,25 @@ var _ = Describe("PIIHandler", func() {
 		Expect(store.recorded()).NotTo(BeZero(), "no PIIEvent recorded for the email match")
 	})

+	It("refuses to follow an upstream redirect", func() {
+		// A 3xx from the upstream would otherwise be followed, replaying
+		// the request (and its provider API key, e.g. Anthropic's
+		// x-api-key which Go does NOT strip on cross-host redirects) to
+		// the Location host. The refused redirect surfaces as a 502.
+		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			http.Redirect(w, r, "https://evil.example.com/steal", http.StatusFound)
+		})
+
+		client, base, _, cleanup := startPIITestRig(upstream)
+		defer cleanup()
+
+		body := `{"model":"claude-3-5-sonnet","max_tokens":100,"messages":[{"role":"user","content":"hello"}]}`
+		resp, err := client.Post(base+"/v1/messages", "application/json", strings.NewReader(body))
+		Expect(err).NotTo(HaveOccurred(), "client.Post")
+		defer func() { _ = resp.Body.Close() }()
+		Expect(resp.StatusCode).To(Equal(http.StatusBadGateway), "refused redirect must surface as 502, not be followed")
+	})
+
 	It("blocks api key in request", func() {
 		upstreamCalled := false
 		upstream := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
--- a/core/services/nodes/file_stager_http.go
+++ b/core/services/nodes/file_stager_http.go
@@ -16,9 +16,11 @@ import (
 	"syscall"
 	"time"

+	"github.com/mudler/xlog"
+
 	"github.com/mudler/LocalAI/core/services/storage"
 	"github.com/mudler/LocalAI/pkg/downloader"
-	"github.com/mudler/xlog"
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // HTTPFileStager implements FileStager using HTTP for environments without S3.
@@ -67,14 +69,12 @@ func NewHTTPFileStager(httpAddrFor func(nodeID string) (string, error), token st
 	return &HTTPFileStager{
 		httpAddrFor: httpAddrFor,
 		token:       token,
-		client: &http.Client{
-			// No Timeout set — for large uploads, http.Client.Timeout covers the
-			// entire request lifecycle including the body upload. If it fires
-			// mid-write, Go closes the connection causing "connection reset by peer"
-			// on the server. Instead we use ResponseHeaderTimeout on the transport
-			// to cover only the wait-for-server-response phase.
-			Transport: transport,
-		},
+		// No Timeout set — for large uploads, http.Client.Timeout covers the
+		// entire request lifecycle including the body upload. If it fires
+		// mid-write, Go closes the connection causing "connection reset by peer"
+		// on the server. Instead we use ResponseHeaderTimeout on the transport
+		// to cover only the wait-for-server-response phase.
+		client:          httpclient.New(httpclient.WithTransport(transport)),
 		responseTimeout: responseTimeout,
 		maxRetries:      maxRetries,
 	}
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -412,7 +412,10 @@ These load-time options control how the backend parses `<think>` reasoning block
 | `prefill_assistant` | bool | `true` | When `false`, the trailing assistant message is not pre-filled by the chat template. |

 {{% notice note %}}
-This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg (set via the YAML `reasoning.disable` field) toggles thinking on/off per call without restarting the model.
+This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg toggles thinking on/off per call without restarting the model. It can be driven either by the YAML `reasoning.disable` field (model default) or per request via the OpenAI `reasoning_effort` field on `/v1/chat/completions`:
+
+- `reasoning_effort: "none"` disables thinking for that request (`enable_thinking=false`) - useful to run a single reasoning model like Qwen3 for low-latency tasks while still enabling reasoning on other requests.
+- `reasoning_effort: "minimal" | "low" | "medium" | "high"` enables thinking, unless the model config explicitly sets `reasoning.disable: true` (an operator's explicit disable wins and is never re-enabled by a request).
 {{% /notice %}}

 ### Multimodal Backend Options
--- a/docs/content/features/audio-to-text.md
+++ b/docs/content/features/audio-to-text.md
@@ -11,6 +11,7 @@ The transcription endpoint allows to convert audio files to text. The endpoint s
 - **[whisper.cpp](https://github.com/ggerganov/whisper.cpp)**: A C++ library for audio transcription (default)
 - **moonshine**: Ultra-fast transcription engine optimized for low-end devices
 - **faster-whisper**: Fast Whisper implementation with CTranslate2
+- **[parakeet-cpp](https://github.com/mudler/parakeet.cpp)**: A C++/ggml port of NVIDIA NeMo Parakeet (FastConformer TDT/CTC/RNNT/hybrid). Runs quantized GGUFs on CPU or GPU, emits word-level timestamps, and supports cache-aware streaming (the `realtime_eou` model surfaces end-of-utterance events).
 - **llama-cpp**: Route transcription to any multimodal-audio GGUF model served by the `llama-cpp` backend (e.g. [Qwen3-ASR](https://huggingface.co/ggml-org/Qwen3-ASR-0.6B-GGUF), Voxtral, Qwen2-Audio). Under the hood the request is converted into a chat completion with the audio attached via the model's audio encoder — the same path the upstream llama.cpp server uses. Set `backend: llama-cpp` in the model YAML and point `mmproj` at the matching audio encoder.
 - **voxtral**: Voxtral-family models served by a dedicated backend

@@ -157,6 +158,35 @@ curl http://localhost:8080/v1/audio/transcriptions \
  -F model="qwen3-asr"
 ```

+## Using the parakeet-cpp backend
+
+[parakeet.cpp](https://github.com/mudler/parakeet.cpp) is a C++/ggml port of NVIDIA NeMo Parakeet that matches the upstream PyTorch models on CPU. GGUF weights for every model and quant are published in a single repo, [`mudler/parakeet-cpp-gguf`](https://huggingface.co/mudler/parakeet-cpp-gguf). F16 is the recommended default, and Q4_K stays near-lossless on the small models. The easiest path is to import directly (the GGUFs auto-detect to this backend):
+
+```bash
+local-ai models import https://huggingface.co/mudler/parakeet-cpp-gguf/resolve/main/tdt_ctc-110m-f16.gguf
+```
+
+Or write a model YAML:
+
+```yaml
+name: parakeet-110m
+backend: parakeet-cpp
+parameters:
+  model: tdt_ctc-110m-f16.gguf
+```
+
+Then call `/v1/audio/transcriptions` as usual. Pass `timestamp_granularities[]=word` for per-word timings:
+
+```bash
+curl http://localhost:8080/v1/audio/transcriptions \
+  -H "Content-Type: multipart/form-data" \
+  -F file="@jfk.wav" \
+  -F model="parakeet-110m" \
+  -F "timestamp_granularities[]=word"
+```
+
+For real-time use, load a cache-aware streaming model (e.g. `realtime_eou_120m-v1-*.gguf`) and pass `-F stream=true`. Deltas are emitted as the audio is decoded, with end-of-utterance events closing each segment.
+
 ## See also

 - [Audio Transform]({{< relref "audio-transform.md" >}}) — clean up the audio (echo cancellation, noise suppression, dereverberation) before passing it to a transcription model.
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v4.3.1"
+  "version": "v4.3.4"
 }
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,37 @@
 ---
+- name: "lfm2.5-8b-a1b"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/LiquidAI/LFM2.5-8B-A1B-GGUF
+  description: "Try LFM •\nDocs •\nLEAP •\nDiscord\n\n# LFM2.5-8B-A1B\n\nLFM2.5 is a new family of hybrid models designed for on-device deployment. It builds on the LFM2 architecture with extended pre-training and reinforcement learning.\n\n  - **On-device personal assistant**: Designed to power real-life applications, chaining tool calls, and following complex instructions on all devices.\n  - **Compressed performance**: Competitive with much larger dense and MoE models on instruction following and agentic tasks.\n  - **Unmatched throughput**: Fastest in its size class on both CPU and GPU inference, with day-one support for llama.cpp, MLX, vLLM, and SGLang.\n\nFind more information about LFM2.5-8B-A1B in our blog post.\n\n**AA-Omniscience Index (higher is better) rewards correct answers and penalizes hallucinations. Scores range from -100 to 100. See more results on Artificial Analysis.*\n\n## \U0001F5D2️ Model Details\n\nLFM2.5-8B-A1B is a general-purpose text-only model with the following features:\n\n...\n"
+  license: "other"
+  tags:
+    - llm
+    - gguf
+  icon: https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/qUZVGkns1bg3sZUShBbhv.png
+  overrides:
+    backend: llama-cpp
+    function:
+      automatic_tool_parsing_fallback: true
+      grammar:
+        disable: true
+    known_usecases:
+      - chat
+    options:
+      - use_jinja:true
+    parameters:
+      min_p: 0.15
+      model: llama-cpp/models/LFM2.5-8B-A1B-GGUF/LFM2.5-8B-A1B-Q4_K_M.gguf
+      repeat_penalty: 1.05
+      temperature: 0.1
+      top_k: 50
+      top_p: 0.1
+    template:
+      use_tokenizer_template: true
+  files:
+    - filename: llama-cpp/models/LFM2.5-8B-A1B-GGUF/LFM2.5-8B-A1B-Q4_K_M.gguf
+      uri: https://huggingface.co/LiquidAI/LFM2.5-8B-A1B-GGUF/resolve/main/LFM2.5-8B-A1B-Q4_K_M.gguf
+      sha256: 4923ec14f06b968b74d663e5949867d2d9c3bf13a20b8be1a9f9af39989b2bb0
 - name: "qwopus3.5-9b-coder-mtp"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
@@ -31458,3 +31491,283 @@
    - filename: ds4flash.gguf
      sha256: 31598c67c8b8744d3bcebcd19aa62253c6dc43cef3b8adf9f593656c9e86fd8c
      uri: huggingface://antirez/deepseek-v4-gguf/DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf
+- name: parakeet-cpp-tdt_ctc-110m
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    Hybrid TDT+CTC FastConformer, 110M. F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-tdt_ctc-110m
+    parameters:
+      model: parakeet-cpp/tdt_ctc-110m-f16.gguf
+  files:
+    - filename: parakeet-cpp/tdt_ctc-110m-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/tdt_ctc-110m-f16.gguf
+      sha256: 7f9a6376edde6a74592ace48b2ebdc27a1ac972d0be9dfcc29e668d99381faf1
+- name: parakeet-cpp-realtime_eou_120m-v1
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    Cache-aware streaming RNNT FastConformer with end-of-utterance (EOU) detection, 120M. Use with streaming transcription. F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-realtime_eou_120m-v1
+    parameters:
+      model: parakeet-cpp/realtime_eou_120m-v1-f16.gguf
+  files:
+    - filename: parakeet-cpp/realtime_eou_120m-v1-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/realtime_eou_120m-v1-f16.gguf
+      sha256: d1a2b12f12b8a096a57499c9111ed13b442a2b786e17a292c168be45088f0edc
+- name: parakeet-cpp-ctc-0.6b
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    CTC FastConformer, 0.6B. F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-ctc-0.6b
+    parameters:
+      model: parakeet-cpp/ctc-0.6b-f16.gguf
+  files:
+    - filename: parakeet-cpp/ctc-0.6b-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/ctc-0.6b-f16.gguf
+      sha256: 97fcefa21ae78a04d9dedd5d4776535f37e14e252e9c156758a9ace0fd56bafb
+- name: parakeet-cpp-rnnt-0.6b
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    RNNT FastConformer, 0.6B. F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-rnnt-0.6b
+    parameters:
+      model: parakeet-cpp/rnnt-0.6b-f16.gguf
+  files:
+    - filename: parakeet-cpp/rnnt-0.6b-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/rnnt-0.6b-f16.gguf
+      sha256: 20308eb952a856b217dc52ae89f530fcef09119f4580b0068c1181a70442a8cf
+- name: parakeet-cpp-tdt-0.6b-v2
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    TDT FastConformer, 0.6B (v2). F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-tdt-0.6b-v2
+    parameters:
+      model: parakeet-cpp/tdt-0.6b-v2-f16.gguf
+  files:
+    - filename: parakeet-cpp/tdt-0.6b-v2-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/tdt-0.6b-v2-f16.gguf
+      sha256: f8df7f5dc7b9ceb5cd0637a81194aab5d93022ace555ce81c8969c7a694b8f3d
+- name: parakeet-cpp-tdt-0.6b-v3
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    TDT FastConformer, 0.6B (v3, multilingual). F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-tdt-0.6b-v3
+    parameters:
+      model: parakeet-cpp/tdt-0.6b-v3-f16.gguf
+  files:
+    - filename: parakeet-cpp/tdt-0.6b-v3-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/tdt-0.6b-v3-f16.gguf
+      sha256: 8ba47343e1e919895aca90e099150a01ed203ee0942d8ed31e27295efc5abb22
+- name: parakeet-cpp-ctc-1.1b
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    CTC FastConformer, 1.1B. F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-ctc-1.1b
+    parameters:
+      model: parakeet-cpp/ctc-1.1b-f16.gguf
+  files:
+    - filename: parakeet-cpp/ctc-1.1b-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/ctc-1.1b-f16.gguf
+      sha256: 48eac4cf0975f0e31f5a8b857972524e2536363b88ec2bf7147e70bbb006e57b
+- name: parakeet-cpp-rnnt-1.1b
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    RNNT FastConformer, 1.1B. F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-rnnt-1.1b
+    parameters:
+      model: parakeet-cpp/rnnt-1.1b-f16.gguf
+  files:
+    - filename: parakeet-cpp/rnnt-1.1b-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/rnnt-1.1b-f16.gguf
+      sha256: 981b5941251b5bbbc15bd8672114040ddb697f9b8aae5b15217f445b7cd68e83
+- name: parakeet-cpp-tdt-1.1b
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    TDT FastConformer, 1.1B. F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-tdt-1.1b
+    parameters:
+      model: parakeet-cpp/tdt-1.1b-f16.gguf
+  files:
+    - filename: parakeet-cpp/tdt-1.1b-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/tdt-1.1b-f16.gguf
+      sha256: 83075a3e00c0fe43248f6b8fac24a29096e4fab28b944dbba7ff380a918b56b5
+- name: parakeet-cpp-tdt_ctc-1.1b
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://huggingface.co/mudler/parakeet-cpp-gguf
+    - https://github.com/mudler/parakeet.cpp
+  description: |
+    Hybrid TDT+CTC FastConformer, 1.1B. F16 GGUF for the parakeet-cpp backend (C++/ggml port of NVIDIA NeMo Parakeet),
+    byte-identical to NeMo at WER 0. Faster than NeMo on CPU and GPU.
+  license: cc-by-4.0
+  tags:
+    - parakeet
+    - parakeet-cpp
+    - asr
+    - speech-recognition
+    - stt
+    - gguf
+    - ggml
+  overrides:
+    backend: parakeet-cpp
+    known_usecases:
+      - transcript
+    name: parakeet-cpp-tdt_ctc-1.1b
+    parameters:
+      model: parakeet-cpp/tdt_ctc-1.1b-f16.gguf
+  files:
+    - filename: parakeet-cpp/tdt_ctc-1.1b-f16.gguf
+      uri: huggingface://mudler/parakeet-cpp-gguf/tdt_ctc-1.1b-f16.gguf
+      sha256: cd53f64eefac2623a12f2f118ef50b56622dc3012f42c815c6adf0d08292f387
--- a/gallery/qwen3.yaml
+++ b/gallery/qwen3.yaml
@@ -17,6 +17,13 @@ config_file: |
    # "pure content" PEG parser that leaks reasoning tags into content.
    options:
        - use_jinja:true
+    # With use_tokenizer_template the backend (llama.cpp) owns tool-call
+    # grammar generation and parsing too. Disabling LocalAI's own grammar lets
+    # llama.cpp's native name-first tool pipeline run; otherwise the generated
+    # grammar overrides it and the tool-call JSON leaks into content (#10052).
+    function:
+        grammar:
+            disable: true
    template:
        use_tokenizer_template: true
 name: qwen3
--- a/pkg/downloader/huggingface.go
+++ b/pkg/downloader/huggingface.go
@@ -5,8 +5,9 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"net/http"
 	"strings"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 type HuggingFaceScanResult struct {
@@ -29,7 +30,7 @@ func HuggingFaceScan(uri URI) (*HuggingFaceScanResult, error) {
 	if len(cleanParts) <= 4 || (cleanParts[2] != "huggingface.co" && cleanParts[2] != hfHost) {
 		return nil, ErrNonHuggingFaceFile
 	}
-	results, err := http.Get(fmt.Sprintf("%s/api/models/%s/%s/scan", HF_ENDPOINT, cleanParts[3], cleanParts[4]))
+	results, err := httpclient.New(httpclient.WithFollowRedirects()).Get(fmt.Sprintf("%s/api/models/%s/%s/scan", HF_ENDPOINT, cleanParts[3], cleanParts[4]))
 	if err != nil {
 		return nil, err
 	}
--- a/pkg/downloader/uri.go
+++ b/pkg/downloader/uri.go
@@ -17,10 +17,12 @@ import (
 	"github.com/google/go-containerregistry/pkg/v1/tarball"
 	ocispec "github.com/opencontainers/image-spec/specs-go/v1"

+	"github.com/mudler/xlog"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 	"github.com/mudler/LocalAI/pkg/oci"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/LocalAI/pkg/xio"
-	"github.com/mudler/xlog"
 )

 const (
@@ -171,7 +173,7 @@ func (uri URI) ReadWithAuthorizationAndCallback(ctx context.Context, basePath st
 		req.Header.Add("Authorization", authorization)
 	}

-	response, err := http.DefaultClient.Do(req)
+	response, err := downloadClient.Do(req)
 	if err != nil {
 		return err
 	}
@@ -347,9 +349,15 @@ func calculateHashForPartialFile(file *os.File) (hash.Hash, error) {
 	return hash, nil
 }

+// downloadClient is the shared client for HTTP(S) downloads and size
+// probes. It follows redirects (model hosts and CDNs rely on them) but
+// strips credential headers on any cross-host hop, and sets no body
+// deadline so large downloads are not truncated.
+var downloadClient = httpclient.New(httpclient.WithFollowRedirects())
+
 func (uri URI) checkSeverSupportsRangeHeader() (bool, error) {
 	url := uri.ResolveURL()
-	resp, err := http.Head(url)
+	resp, err := downloadClient.Head(url)
 	if err != nil {
 		return false, err
 	}
@@ -376,7 +384,7 @@ func (u URI) ContentLength(ctx context.Context) (int64, error) {
 	if err != nil {
 		return 0, err
 	}
-	resp, err := http.DefaultClient.Do(req)
+	resp, err := downloadClient.Do(req)
 	if err != nil {
 		return 0, err
 	}
@@ -395,7 +403,7 @@ func (u URI) ContentLength(ctx context.Context) (int64, error) {
 		return 0, err
 	}
 	req2.Header.Set("Range", "bytes=0-0")
-	resp2, err := http.DefaultClient.Do(req2)
+	resp2, err := downloadClient.Do(req2)
 	if err != nil {
 		return 0, err
 	}
@@ -584,7 +592,7 @@ func (uri URI) DownloadFileWithContext(ctx context.Context, filePath, sha string
 		contentLength = l.Size()
 	} else {
 		// Start the request
-		resp, err := http.DefaultClient.Do(req)
+		resp, err := downloadClient.Do(req)
 		if err != nil {
 			// Check if error is due to context cancellation
 			if errors.Is(err, context.Canceled) {
--- a/pkg/functions/grammars/json_schema.go
+++ b/pkg/functions/grammars/json_schema.go
@@ -155,12 +155,22 @@ func (sc *JSONSchemaConverter) visit(schema map[string]any, name string, rootSch
 			propName   string
 			propSchema map[string]any
 		}) int {
-			aOrder := propOrder[a.propName]
-			bOrder := propOrder[b.propName]
-			if aOrder != 0 && bOrder != 0 {
+			// Use presence in the order map (not a non-zero sentinel) so that
+			// the first listed key — index 0 — is honored. Keys present in
+			// properties_order sort by their index and ahead of any key that
+			// isn't listed; unlisted keys keep a stable alphabetical order.
+			aOrder, aOK := propOrder[a.propName]
+			bOrder, bOK := propOrder[b.propName]
+			switch {
+			case aOK && bOK:
 				return cmp.Compare(aOrder, bOrder)
+			case aOK:
+				return -1
+			case bOK:
+				return 1
+			default:
+				return cmp.Compare(a.propName, b.propName)
 			}
-			return cmp.Compare(a.propName, b.propName)
 		})

 		var rule strings.Builder
--- a/pkg/functions/grammars/json_schema_test.go
+++ b/pkg/functions/grammars/json_schema_test.go
@@ -547,3 +547,61 @@ realvalue
 		})
 	})
 })
+
+var _ = Describe("JSON schema property ordering (issue #10052)", func() {
+	// A function-call shaped schema. The grammar must honor the configured
+	// properties_order. Before the fix, the sort guard `aOrder != 0 && bOrder != 0`
+	// treated the first listed key (index 0) as "unset" and fell back to
+	// alphabetical order, so "arguments" was emitted before "name" even when
+	// properties_order put name first.
+	const schema = `{
+		"type": "object",
+		"properties": {
+			"name": {"type": "string"},
+			"arguments": {"type": "object", "properties": {"cmd": {"type": "string"}}}
+		}
+	}`
+
+	// keyIndex finds the position of an object-key literal (escaped as \"key\"
+	// in GBNF), which only appears where the key is emitted in the rule — not
+	// in derived rule names like root-name.
+	keyIndex := func(grammar, key string) int {
+		return strings.Index(grammar, `\"`+key+`\"`)
+	}
+
+	It("honors properties_order with name listed first (index 0)", func() {
+		grammar, err := NewJSONSchemaConverter("name,arguments").GrammarFromBytes([]byte(schema))
+		Expect(err).To(BeNil())
+		ni := keyIndex(grammar, "name")
+		ai := keyIndex(grammar, "arguments")
+		Expect(ni).To(BeNumerically(">=", 0))
+		Expect(ai).To(BeNumerically(">=", 0))
+		Expect(ni).To(BeNumerically("<", ai),
+			"properties_order lists name first, so the grammar must emit \"name\" before \"arguments\"")
+	})
+
+	It("keeps alphabetical order when properties_order is empty", func() {
+		grammar, err := NewJSONSchemaConverter("").GrammarFromBytes([]byte(schema))
+		Expect(err).To(BeNil())
+		// No explicit order: keys fall back to alphabetical (the documented default, which must not change).
+		Expect(keyIndex(grammar, "arguments")).To(BeNumerically(">=", 0), "arguments key missing; an index of -1 would pass the ordering check vacuously")
+		Expect(keyIndex(grammar, "arguments")).To(BeNumerically("<", keyIndex(grammar, "name")))
+	})
+
+	It("sorts keys present in properties_order ahead of unlisted keys", func() {
+		const schemaWithExtra = `{
+			"type": "object",
+			"properties": {
+				"name": {"type": "string"},
+				"arguments": {"type": "object", "properties": {"cmd": {"type": "string"}}},
+				"aaa_unlisted": {"type": "string"}
+			}
+		}`
+		// "aaa_unlisted" is alphabetically first but unlisted, so it must still follow the listed name/arguments keys.
+		grammar, err := NewJSONSchemaConverter("name,arguments").GrammarFromBytes([]byte(schemaWithExtra))
+		Expect(err).To(BeNil())
+		Expect(keyIndex(grammar, "name")).To(BeNumerically(">=", 0), "name key missing; an index of -1 would pass the ordering checks vacuously")
+		Expect(keyIndex(grammar, "name")).To(BeNumerically("<", keyIndex(grammar, "arguments")))
+		Expect(keyIndex(grammar, "arguments")).To(BeNumerically("<", keyIndex(grammar, "aaa_unlisted")))
+	})
+})
--- a/pkg/functions/parse.go
+++ b/pkg/functions/parse.go
@@ -628,6 +628,36 @@ func buildContent(before string, parser *ChatMsgParser) string {
 // This provides better streaming and partial parsing support.
 // When format is nil or when format is set, tries "find scope/tool start, split, parse suffix"
 // first (llama.cpp PEG order) so that content before the tool block does not cause parse failure.
+
+// validToolNameRe matches a plausible function name. OpenAI tool names are
+// limited to letters, digits, underscores and hyphens; dots appear in some
+// providers' namespaced names. Anything else (whitespace, braces, brackets, quotes,
+// colons) signals the XML auto-detector grabbed a JSON blob or prose, not a real name.
+var validToolNameRe = regexp.MustCompile(`^[A-Za-z0-9_.\-]+$`)
+
+// plausibleToolName reports whether name, ignoring surrounding whitespace, looks like a real function name.
+func plausibleToolName(name string) bool {
+	return validToolNameRe.MatchString(strings.TrimSpace(name))
+}
+
+// filterPlausibleToolCalls drops auto-detected tool calls whose name is not a
+// plausible function name. This guards against a format (notably glm-4.5, whose
+// tool block is <tool_call>name...</tool_call>) mis-claiming a Hermes-style
+// <tool_call>JSON</tool_call> block and returning the whole JSON object — or any
+// leading prose / array — as the function name. Dropping the misparse lets
+// auto-detection fall through to the next format and ultimately to JSON parsing,
+// which handles Hermes correctly. Replaces the narrower leading-"{" check (PR #9940); see issue #9722.
+func filterPlausibleToolCalls(calls []FuncCallResults) []FuncCallResults {
+	out := calls[:0:0] // zero length and capacity: append allocates a new array instead of overwriting calls
+	for _, c := range calls {
+		if plausibleToolName(c.Name) {
+			out = append(out, c)
+		}
+	}
+	return out
+}
+
+// ParseXMLIterative parses XML tool calls from s; the detached notes above validToolNameRe describe its parse order.
 func ParseXMLIterative(s string, format *XMLToolCallFormat, isPartial bool) ([]FuncCallResults, error) {
 	// Try split-on-scope first so reasoning/content before tool block is skipped
 	if format != nil {
@@ -639,7 +669,12 @@ func ParseXMLIterative(s string, format *XMLToolCallFormat, isPartial bool) ([]F
 		for _, fmtPreset := range formats {
 			if fmtPreset.format != nil {
 				if pr, ok := tryParseXMLFromScopeStart(s, fmtPreset.format, isPartial); ok {
-					return pr.ToolCalls, nil
+					// Auto-detect: discard misparsed (non-name) results so a
+					// format that grabbed a JSON blob doesn't win; fall through
+					// to the next format.
+					if valid := filterPlausibleToolCalls(pr.ToolCalls); len(valid) > 0 {
+						return valid, nil
+					}
 				}
 			}
 		}
@@ -659,14 +694,19 @@ func ParseXMLIterative(s string, format *XMLToolCallFormat, isPartial bool) ([]F
 				if err != nil {
 					// Check if it's a partial exception (recoverable)
 					if _, ok := err.(*ChatMsgPartialException); ok {
-						// Partial parse, return what we have
-						return parser.ToolCalls(), nil
+						// Partial parse, return what we have — unless every
+						// result is a misparse, in which case try the next format.
+						if valid := filterPlausibleToolCalls(parser.ToolCalls()); len(valid) > 0 {
+							return valid, nil
+						}
 					}
 					// Try next format
 					continue
 				}
 				if success && len(parser.ToolCalls()) > 0 {
-					return parser.ToolCalls(), nil
+					if valid := filterPlausibleToolCalls(parser.ToolCalls()); len(valid) > 0 {
+						return valid, nil
+					}
 				}
 			}
 		}
--- a/pkg/functions/parse_glm_9722_test.go
+++ b/pkg/functions/parse_glm_9722_test.go
@@ -0,0 +1,56 @@
+package functions
+
+import (
+	"regexp"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Robust fix for the glm-4.5 XML auto-detect false positive (relates to #9722
+// / supersedes the brittle leading-"{" filter in #9940). When the XML
+// auto-detector mis-identifies a Hermes-style <tool_call>JSON</tool_call> block
+// as glm-4.5, it extracts the block body as the function NAME. A real function
+// name is [A-Za-z0-9_.-]+; anything with braces, brackets, whitespace, quotes
+// or colons is a misparse and must not be returned (so JSON parsing can take
+// over). This is stronger than checking only for a leading "{": it also rejects
+// leading prose, JSON arrays, and brace-less garbage.
+var _ = Describe("glm-4.5 auto-detect name validation (#9722/#9940)", func() {
+	// plausibleName mirrors the contract: a returned auto-detected tool name
+	// must look like a real function name.
+	plausible := regexp.MustCompile(`^[A-Za-z0-9_.\-]+$`)
+
+	DescribeTable("auto-detect must not emit a misparsed tool name",
+		func(input string) {
+			results, err := ParseXMLIterative(input, nil, false)
+			Expect(err).ToNot(HaveOccurred())
+			for _, r := range results {
+				Expect(plausible.MatchString(r.Name)).To(BeTrue(),
+					"auto-detected XML tool name must look like a function name, got: %q", r.Name)
+			}
+		},
+		Entry("canonical Hermes JSON", "<tool_call>\n{\"name\": \"bash\", \"arguments\": {\"script\": \"ls\"}}\n</tool_call>"),
+		Entry("leading prose then JSON", "<tool_call>\nSure: {\"name\": \"bash\", \"arguments\": {\"script\": \"ls\"}}\n</tool_call>"),
+		Entry("JSON array (parallel calls)", "<tool_call>\n[{\"name\": \"bash\", \"arguments\": {}}]\n</tool_call>"),
+		Entry("brace-less garbage", "<tool_call>\nname: bash, arguments: {}\n</tool_call>"),
+	)
+
+	// No-regression: a genuine glm-4.5 tool call must still be auto-detected.
+	It("still parses a legitimate glm-4.5 tool call", func() {
+		legit := "<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>NYC</arg_value>\n</tool_call>"
+		results, err := ParseXMLIterative(legit, nil, false)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(results).To(HaveLen(1))
+		Expect(results[0].Name).To(Equal("get_weather"))
+	})
+
+	// A user who explicitly forces the glm-4.5 format keeps the raw behaviour
+	// (no name filtering) — only auto-detection is guarded.
+	It("does not filter when the glm-4.5 format is explicitly forced", func() {
+		input := "<tool_call>\n{\"name\": \"bash\", \"arguments\": {}}\n</tool_call>"
+		forced, err := ParseXMLIterative(input, GetXMLFormatPreset("glm-4.5"), false)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(forced).ToNot(BeEmpty(),
+			"explicit format must be trusted verbatim, even if it yields a JSON-blob name")
+	})
+})
--- a/pkg/httpclient/client.go
+++ b/pkg/httpclient/client.go
@@ -0,0 +1,201 @@
+// Package httpclient provides hardened *http.Client constructors for all
+// outbound HTTP traffic in LocalAI.
+//
+// Direct use of net/http's default client (http.DefaultClient, http.Get,
+// http.Post, ...) or a bare http.Client{} is forbidden by lint (forbidigo).
+// The reason is GHSA-3mj3-57v2-4636: the standard client follows up to 10
+// redirects by default, and on a *cross-host* redirect Go forwards custom
+// request headers — including credential headers such as Anthropic's
+// x-api-key — to the redirect target. (Go strips Authorization, Cookie and
+// WWW-Authenticate cross-host, but NOT arbitrary custom headers.) An attacker
+// who can elicit a redirect from an upstream then harvests the credential.
+//
+// Every client built here refuses redirects by default (see NoRedirect). The
+// rare caller that genuinely must follow redirects should opt in with
+// WithFollowRedirects, which still strips credential headers on host change.
+//
+// Streaming note: New() intentionally sets NO client-level Timeout, because a
+// global timeout also bounds the response body and would truncate long-lived
+// SSE streams (chat completions can stream for minutes). Per-request deadlines
+// belong on the request context. Use NewWithTimeout for simple, non-streaming
+// request/response calls.
+package httpclient
+
+import (
+	"crypto/tls"
+	"errors"
+	"fmt"
+	"net"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+)
+
+const (
+	// Transport-level bounds. These cap connection setup, NOT the response
+	// body, so they are safe for streaming responses.
+	dialTimeout           = 30 * time.Second
+	dialKeepAlive         = 30 * time.Second
+	tlsHandshakeTimeout   = 10 * time.Second
+	idleConnTimeout       = 90 * time.Second
+	expectContinueTimeout = 1 * time.Second
+	maxIdleConns          = 100
+
+	// maxRedirects bounds WithFollowRedirects chains (mirrors the net/http
+	// default) so an opt-in follower can't be spun forever by a redirect loop.
+	maxRedirects = 10
+)
+
+// sensitiveHeaders are credential-bearing request headers that must never be
+// replayed to a different host on a redirect. Go already drops the first three
+// cross-host; the rest are custom headers Go does not know about. Compared
+// case-insensitively via http.Header canonicalisation.
+var sensitiveHeaders = []string{
+	"Authorization",
+	"Www-Authenticate",
+	"Cookie",
+	"Proxy-Authorization",
+	"X-Api-Key",      // Anthropic, and many OpenAI-compatible providers
+	"Api-Key",        // Azure OpenAI
+	"X-Auth-Token",   // common custom scheme
+	"X-Goog-Api-Key", // Google
+}
+
+// ErrRedirectBlocked is the sentinel wrapped by every error NoRedirect
+// returns. Match it with errors.Is to tell "the upstream tried to redirect us"
+// apart from other transport errors.
+var ErrRedirectBlocked = errors.New("httpclient: redirect blocked")
+
+// NoRedirect is an http.Client.CheckRedirect policy that never follows a
+// redirect: it always returns an error wrapping ErrRedirectBlocked. New and
+// NewWithTimeout install it unless WithFollowRedirects is supplied. The target
+// is rendered with URL.Redacted() so a password embedded in the redirect URL
+// is not written to logs.
+func NoRedirect(req *http.Request, _ []*http.Request) error {
+	target := req.URL.Redacted()
+	return fmt.Errorf("%w: refusing to follow redirect to %s (set httpclient.WithFollowRedirects to opt in)", ErrRedirectBlocked, target)
+}
+
+// stripAuthOnRedirect is the CheckRedirect policy installed by
+// WithFollowRedirects. It follows at most maxRedirects hops and deletes every
+// header listed in sensitiveHeaders from the upcoming request once the chain
+// has left the origin (scheme + host:port, see sameOrigin) of the initial
+// request.
+//
+// The comparison is made against the INITIAL request and every hop taken so
+// far, not just the previous hop. net/http rebuilds each redirect request's
+// headers from the initial request, so a custom credential header deleted on
+// an earlier hop is present again on the next one; comparing only adjacent
+// hops would resend the credential on a chain such as A -> B -> B/next. Once
+// any hop is cross-origin the headers stay stripped for the remainder of the
+// chain, even if it later returns to the original origin — the same sticky
+// policy net/http adopted for its own sensitive headers (CVE-2024-45336).
+//
+// via holds the requests made already, oldest first; req is the upcoming one.
+func stripAuthOnRedirect(req *http.Request, via []*http.Request) error {
+	if len(via) >= maxRedirects {
+		return fmt.Errorf("httpclient: stopped after %d redirects", maxRedirects)
+	}
+	if len(via) == 0 {
+		// net/http always passes at least the initial request. A direct call
+		// without one has no origin to compare against, so there is nothing
+		// to strip (and indexing via would panic).
+		return nil
+	}
+
+	origin := via[0].URL
+	crossed := !sameOrigin(origin, req.URL)
+	for i := 1; i < len(via) && !crossed; i++ {
+		crossed = !sameOrigin(origin, via[i].URL)
+	}
+	if crossed {
+		for _, h := range sensitiveHeaders {
+			req.Header.Del(h)
+		}
+	}
+	return nil
+}
+
+// sameOrigin reports whether a and b have the same Scheme and the same Host
+// (hostname plus any explicit port), each compared case-insensitively. The
+// check is deliberately strict: a differing port or scheme counts as a
+// different origin, so credential headers get stripped. This avoids the curl
+// CVE-2022-27774 class of bug where ports were ignored and credentials leaked
+// to a different service on the same hostname.
+func sameOrigin(a, b *url.URL) bool {
+	if !strings.EqualFold(a.Scheme, b.Scheme) {
+		return false
+	}
+	return strings.EqualFold(a.Host, b.Host)
+}
+
+// HardenedTransport builds a new *http.Transport on every call. The transport
+// honours the proxy environment variables, attempts HTTP/2, sets a TLS 1.2
+// minimum version, and bounds dialing, the TLS handshake, Expect-Continue
+// waits and idle connections with this package's transport constants. Callers
+// that need to wrap or extend the transport (e.g. a credential-injecting
+// RoundTripper) should base it on this rather than http.DefaultTransport so
+// the TLS floor and timeouts are preserved.
+func HardenedTransport() *http.Transport {
+	dialer := &net.Dialer{
+		Timeout:   dialTimeout,
+		KeepAlive: dialKeepAlive,
+	}
+	tlsCfg := &tls.Config{MinVersion: tls.VersionTLS12}
+
+	return &http.Transport{
+		Proxy:                 http.ProxyFromEnvironment,
+		DialContext:           dialer.DialContext,
+		ForceAttemptHTTP2:     true,
+		MaxIdleConns:          maxIdleConns,
+		IdleConnTimeout:       idleConnTimeout,
+		TLSHandshakeTimeout:   tlsHandshakeTimeout,
+		ExpectContinueTimeout: expectContinueTimeout,
+		TLSClientConfig:       tlsCfg,
+	}
+}
+
+// options collects the settings New applies to the client it builds. The zero
+// value means: no overall timeout, default transport, redirects refused.
+type options struct {
+	timeout         time.Duration
+	transport       http.RoundTripper
+	followRedirects bool
+}
+
+// Option configures a client built by New.
+type Option func(*options)
+
+// WithTimeout sets the client's overall Timeout, which covers the entire
+// exchange including reading the response body. Do NOT use it for streaming
+// endpoints; put a deadline on the request context there instead. Passing it
+// to New is equivalent to calling NewWithTimeout.
+func WithTimeout(d time.Duration) Option {
+	return func(cfg *options) {
+		cfg.timeout = d
+	}
+}
+
+// WithTransport makes the client use rt as its RoundTripper (e.g. an IP-pinned
+// dialer or a credential-injecting wrapper). The caller owns the transport's
+// TLS configuration; base it on HardenedTransport to keep the TLS floor.
+func WithTransport(rt http.RoundTripper) Option {
+	return func(cfg *options) {
+		cfg.transport = rt
+	}
+}
+
+// WithFollowRedirects opts into following redirects, while still stripping
+// credential headers on any cross-host hop. Use only when an endpoint
+// legitimately redirects (e.g. some download CDNs) and the request carries a
+// secret.
+func WithFollowRedirects() Option {
+	return func(cfg *options) {
+		cfg.followRedirects = true
+	}
+}
+
+// New builds a hardened *http.Client from opts, applied in order. With no
+// options the client refuses redirects (NoRedirect), uses a fresh
+// HardenedTransport (TLS 1.2 floor, bounded connection setup) and has no
+// overall Timeout, so long-lived streaming bodies are not cut off.
+// WithTransport replaces the transport, WithTimeout sets the overall deadline,
+// and WithFollowRedirects swaps the redirect policy for stripAuthOnRedirect.
+func New(opts ...Option) *http.Client {
+	var cfg options
+	for _, apply := range opts {
+		apply(&cfg)
+	}
+
+	client := &http.Client{
+		Transport:     cfg.transport,
+		Timeout:       cfg.timeout, // zero == no overall deadline (streaming-safe)
+		CheckRedirect: NoRedirect,
+	}
+	if client.Transport == nil {
+		// No custom RoundTripper supplied: fall back to the hardened default.
+		client.Transport = HardenedTransport()
+	}
+	if cfg.followRedirects {
+		client.CheckRedirect = stripAuthOnRedirect
+	}
+	return client
+}
+
+// NewWithTimeout is New with an overall client Timeout of timeout. The timeout
+// option is applied first, so a WithTimeout among opts overrides it. Use for
+// simple request/response calls; for streaming, use New with a context
+// deadline.
+func NewWithTimeout(timeout time.Duration, opts ...Option) *http.Client {
+	all := make([]Option, 0, len(opts)+1)
+	all = append(all, WithTimeout(timeout))
+	all = append(all, opts...)
+	return New(all...)
+}
+
+// Harden applies the default hardening (refuse redirects, TLS 1.2 floor) to an
+// existing client in place, for the cases where a third-party library hands us
+// a *http.Client to configure rather than letting us construct one. It returns
+// the same client for convenience. A nil client is left nil.
+//
+// What is changed:
+//   - CheckRedirect is set to NoRedirect only when the caller has not supplied
+//     one; an existing policy is kept.
+//   - A nil Transport is replaced with HardenedTransport().
+//   - A *http.Transport has its TLS minimum version raised to TLS 1.2 whenever
+//     it is unset OR explicitly lower (TLS 1.0/1.1). The transport and its
+//     tls.Config are mutated in place, so the change is visible to anything
+//     else sharing them.
+//   - Any other RoundTripper implementation is left untouched; its TLS
+//     configuration remains the caller's responsibility.
+func Harden(c *http.Client) *http.Client {
+	if c == nil {
+		return nil
+	}
+	if c.CheckRedirect == nil {
+		c.CheckRedirect = NoRedirect
+	}
+	switch t := c.Transport.(type) {
+	case nil:
+		c.Transport = HardenedTransport()
+	case *http.Transport:
+		if t.TLSClientConfig == nil {
+			t.TLSClientConfig = &tls.Config{MinVersion: tls.VersionTLS12}
+		} else if t.TLSClientConfig.MinVersion < tls.VersionTLS12 {
+			// "<" rather than "== 0": an explicit TLS 1.0/1.1 minimum is the
+			// one configuration where a floor actually matters, and the zero
+			// value is covered by the same comparison.
+			t.TLSClientConfig.MinVersion = tls.VersionTLS12
+		}
+	}
+	return c
+}
--- a/pkg/httpclient/client_test.go
+++ b/pkg/httpclient/client_test.go
@@ -0,0 +1,132 @@
+package httpclient_test
+
+import (
+	"crypto/tls"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestHTTPClient(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "httpclient suite")
+}
+
+var _ = Describe("httpclient", func() {
+	Describe("New (default)", func() {
+		It("refuses to follow redirects and never reaches the redirect target", func() {
+			sinkHit := make(chan string, 1)
+			sink := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				sinkHit <- r.Header.Get("X-Api-Key")
+				w.WriteHeader(http.StatusOK)
+			}))
+			defer sink.Close()
+
+			redirector := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				http.Redirect(w, r, sink.URL, http.StatusFound)
+			}))
+			defer redirector.Close()
+
+			req, _ := http.NewRequest(http.MethodGet, redirector.URL, nil)
+			req.Header.Set("X-Api-Key", "secret")
+
+			_, err := httpclient.New().Do(req)
+			Expect(err).To(HaveOccurred(), "redirect must surface as an error")
+			Expect(errors.Is(err, httpclient.ErrRedirectBlocked)).To(BeTrue(), "error should wrap ErrRedirectBlocked")
+			Expect(sinkHit).NotTo(Receive(), "the redirect target must never be contacted")
+		})
+
+		It("sets no overall timeout (streaming-safe) by default", func() {
+			Expect(httpclient.New().Timeout).To(BeZero())
+		})
+
+		It("sets a TLS 1.2 floor on the default transport", func() {
+			c := httpclient.New()
+			t, ok := c.Transport.(*http.Transport)
+			Expect(ok).To(BeTrue())
+			Expect(t.TLSClientConfig).NotTo(BeNil())
+			Expect(t.TLSClientConfig.MinVersion).To(Equal(uint16(tls.VersionTLS12)))
+		})
+	})
+
+	Describe("NewWithTimeout", func() {
+		It("applies the overall timeout", func() {
+			Expect(httpclient.NewWithTimeout(5 * time.Second).Timeout).To(Equal(5 * time.Second))
+		})
+	})
+
+	Describe("WithFollowRedirects", func() {
+		It("follows same-host redirects keeping the credential header", func() {
+			got := make(chan string, 2)
+			var srv *httptest.Server
+			srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				if r.URL.Path == "/start" {
+					http.Redirect(w, r, srv.URL+"/end", http.StatusFound)
+					return
+				}
+				got <- r.Header.Get("X-Api-Key")
+				w.WriteHeader(http.StatusOK)
+			}))
+			defer srv.Close()
+
+			req, _ := http.NewRequest(http.MethodGet, srv.URL+"/start", nil)
+			req.Header.Set("X-Api-Key", "secret")
+
+			resp, err := httpclient.New(httpclient.WithFollowRedirects()).Do(req)
+			Expect(err).NotTo(HaveOccurred())
+			_ = resp.Body.Close()
+			Expect(<-got).To(Equal("secret"), "same-host redirect should preserve the header")
+		})
+
+		It("strips credential headers on a cross-host redirect", func() {
+			sinkKey := make(chan string, 1)
+			sink := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				sinkKey <- r.Header.Get("X-Api-Key")
+				w.WriteHeader(http.StatusOK)
+			}))
+			defer sink.Close()
+
+			redirector := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+				http.Redirect(w, r, sink.URL, http.StatusFound)
+			}))
+			defer redirector.Close()
+
+			req, _ := http.NewRequest(http.MethodGet, redirector.URL, nil)
+			req.Header.Set("X-Api-Key", "secret")
+
+			resp, err := httpclient.New(httpclient.WithFollowRedirects()).Do(req)
+			Expect(err).NotTo(HaveOccurred())
+			_ = resp.Body.Close()
+			Expect(<-sinkKey).To(BeEmpty(), "x-api-key must be stripped crossing to a different host")
+		})
+	})
+
+	Describe("Harden", func() {
+		It("adds NoRedirect and a TLS floor to a bare client without clobbering existing config", func() {
+			c := httpclient.Harden(&http.Client{})
+			Expect(c.CheckRedirect).NotTo(BeNil())
+			t, ok := c.Transport.(*http.Transport)
+			Expect(ok).To(BeTrue())
+			Expect(t.TLSClientConfig.MinVersion).To(Equal(uint16(tls.VersionTLS12)))
+		})
+
+		It("returns nil for a nil client", func() {
+			Expect(httpclient.Harden(nil)).To(BeNil())
+		})
+
+		It("preserves a caller-supplied CheckRedirect", func() {
+			sentinel := errors.New("mine")
+			c := httpclient.Harden(&http.Client{
+				CheckRedirect: func(*http.Request, []*http.Request) error { return sentinel },
+			})
+			Expect(c.CheckRedirect(nil, nil)).To(Equal(sentinel))
+		})
+	})
+})
--- a/pkg/huggingface-api/client.go
+++ b/pkg/huggingface-api/client.go
@@ -10,6 +10,8 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // Model represents a model from the Hugging Face API
@@ -94,7 +96,7 @@ type Client struct {
 func NewClient() *Client {
 	return &Client{
 		baseURL: "https://huggingface.co/api/models",
-		client:  &http.Client{},
+		client:  httpclient.New(httpclient.WithFollowRedirects()),
 	}
 }

--- a/pkg/mcp/localaitools/httpapi/client.go
+++ b/pkg/mcp/localaitools/httpapi/client.go
@@ -19,6 +19,7 @@ import (
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services/modeladmin"
+	"github.com/mudler/LocalAI/pkg/httpclient"
 	localaitools "github.com/mudler/LocalAI/pkg/mcp/localaitools"
 	"github.com/mudler/LocalAI/pkg/vram"
 )
@@ -36,11 +37,9 @@ type Client struct {
 // New returns a Client targeting baseURL with an optional bearer token.
 func New(baseURL, apiKey string) *Client {
 	return &Client{
-		BaseURL: strings.TrimRight(baseURL, "/"),
-		APIKey:  apiKey,
-		HTTPClient: &http.Client{
-			Timeout: 60 * time.Second,
-		},
+		BaseURL:    strings.TrimRight(baseURL, "/"),
+		APIKey:     apiKey,
+		HTTPClient: httpclient.NewWithTimeout(60 * time.Second),
 	}
 }

@@ -394,8 +393,8 @@ func (c *Client) UpgradeBackend(ctx context.Context, name string) (string, error

 func (c *Client) SystemInfo(ctx context.Context) (*localaitools.SystemInfo, error) {
 	var welcome struct {
-		Version           string   `json:"Version"`
-		LoadedModels      []any    `json:"LoadedModels"`
+		Version           string          `json:"Version"`
+		LoadedModels      []any           `json:"LoadedModels"`
 		InstalledBackends map[string]bool `json:"InstalledBackends"`
 	}
 	if err := c.do(ctx, http.MethodGet, routeWelcome, nil, &welcome); err != nil {
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@@ -53,6 +53,13 @@ type ModelLoader struct {
 	modelRouter              ModelRouter // distributed mode: route to remote node
 	backendLogs              *BackendLogStore
 	backendLoggingEnabled    atomic.Bool
+	// stoppingProcs marks backend processes that LocalAI is stopping on
+	// purpose (model unload / graceful shutdown), keyed by the
+	// *process.Process pointer. The exit-watcher goroutine in startProcess
+	// consults it to decide whether an exit is an expected stop or a crash —
+	// the exit code can't, since a child killed by our own SIGTERM/SIGKILL
+	// reports -1, indistinguishable from a signal-induced crash.
+	stoppingProcs sync.Map
 }

 // NewModelLoader creates a new ModelLoader instance.
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@@ -75,6 +75,9 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 		return nil
 	}

+	// Mark the stop as intentional so the exit-watcher logs it as an
+	// expected stop, not a crash (signal-terminated children report -1).
+	ml.stoppingProcs.Store(process, struct{}{})
 	err := process.Stop()
 	if err != nil {
 		xlog.Error("(deleteProcess) error while deleting process", "error", err, "model", s)
@@ -171,8 +174,16 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 	xlog.Debug("GRPC Service state dir", "dir", grpcControlProcess.StateDir())

 	signals.RegisterGracefulTerminationHandler(func() {
-		err := grpcControlProcess.Stop()
-		if err != nil {
+		// StopAllGRPC (the deleteProcess path) is registered earlier and runs
+		// first for store-tracked backends, stopping this process and removing
+		// its pidfile. Calling Stop again then fails with "failed to read PID".
+		// Skip when it's already gone; this handler still covers processes that
+		// StopAllGRPC doesn't track (e.g. worker-supervised backends).
+		if !grpcControlProcess.IsAlive() {
+			return
+		}
+		ml.stoppingProcs.Store(grpcControlProcess, struct{}{})
+		if err := grpcControlProcess.Stop(); err != nil {
 			xlog.Error("error while shutting down grpc process", "error", err)
 		}
 	})
@@ -211,20 +222,27 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
 	// whether the child is alive.
 	go func() {
 		<-grpcControlProcess.Done()
+		// LoadAndDelete both reads the intentional-stop marker and frees the
+		// map entry so it doesn't accumulate across the process's lifetime.
+		_, intentional := ml.stoppingProcs.LoadAndDelete(grpcControlProcess)
 		fields := []any{
 			"id", id,
 			"address", serverAddress,
 			"process", filepath.Base(grpcProcess),
 		}
-		code, codeErr := grpcControlProcess.ExitCode()
-		if codeErr == nil {
+		// Report the raw exit code without interpreting it: a child killed by
+		// our own SIGTERM/SIGKILL surfaces as -1 (Go reports -1 for signal
+		// termination, not the shell's 128+signal convention), so the code
+		// alone can't tell an intended stop from a crash. The stoppingProcs
+		// marker is the reliable signal for that, so it picks the log level.
+		if code, codeErr := grpcControlProcess.ExitCode(); codeErr == nil {
 			fields = append(fields, "exitCode", code)
 		}
-		// 143 = 128 + SIGTERM, the signal sent during graceful stop / model unload.
-		// Treat that and a clean 0 as expected; everything else is a likely crash.
-		if codeErr == nil && (code == "0" || code == "143") {
-			xlog.Info("Backend process exited", fields...)
+		if intentional {
+			xlog.Info("Backend process stopped", fields...)
 		} else {
+			// A stop we didn't initiate — a SIGSEGV from a missing shared
+			// library, a Python ImportError, an OOM kill, an unexpected self-exit.
 			xlog.Warn("Backend process exited unexpectedly", fields...)
 		}
 	}()
--- a/pkg/oci/ollama.go
+++ b/pkg/oci/ollama.go
@@ -8,6 +8,8 @@ import (
 	"net/http"

 	ocispec "github.com/opencontainers/image-spec/specs-go/v1"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

 // Define the main struct for the JSON data
@@ -45,7 +47,7 @@ func OllamaModelManifest(image string) (*Manifest, error) {
 		return nil, err
 	}
 	req.Header.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json")
-	client := &http.Client{}
+	client := httpclient.New(httpclient.WithFollowRedirects())
 	resp, err := client.Do(req)
 	if err != nil {
 		return nil, err
--- a/pkg/utils/base64.go
+++ b/pkg/utils/base64.go
@@ -4,17 +4,16 @@ import (
 	"encoding/base64"
 	"fmt"
 	"io"
-	"net/http"
 	"regexp"
 	"strings"
 	"time"

 	"github.com/mudler/xlog"
+
+	"github.com/mudler/LocalAI/pkg/httpclient"
 )

-var base64DownloadClient http.Client = http.Client{
-	Timeout: 30 * time.Second,
-}
+var base64DownloadClient = httpclient.NewWithTimeout(30*time.Second, httpclient.WithFollowRedirects())

 // Match `data:<mime>[;param=value...];base64,` — browser-produced data URIs
 // often carry codec/charset params between the mime type and `;base64,`
--- a/pkg/utils/path_test.go
+++ b/pkg/utils/path_test.go
@@ -0,0 +1,157 @@
+package utils_test
+
+import (
+	"os"
+	"path/filepath"
+
+	. "github.com/mudler/LocalAI/pkg/utils"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("utils/path tests", func() {
+	Describe("VerifyPath", func() {
+		It("accepts a simple file directly inside the base path", func() {
+			Expect(VerifyPath("model.bin", "/srv/models")).To(Succeed())
+		})
+
+		It("accepts a nested subdirectory inside the base path", func() {
+			Expect(VerifyPath("subdir/model.bin", "/srv/models")).To(Succeed())
+		})
+
+		It("accepts traversal sequences that stay inside the base", func() {
+			// "a/b/../c" collapses to "a/c", still strictly inside the base,
+			// so the verifier should permit it.
+			Expect(VerifyPath("a/b/../c", "/srv/models")).To(Succeed())
+		})
+
+		It("rejects a single parent-traversal that escapes the base", func() {
+			Expect(VerifyPath("../etc/passwd", "/srv/models")).ToNot(Succeed())
+		})
+
+		It("rejects compound traversal that climbs above the base", func() {
+			Expect(VerifyPath("a/../../etc/passwd", "/srv/models")).ToNot(Succeed())
+		})
+
+		It("rejects a deeply-escaping path that lands on the filesystem root", func() {
+			Expect(VerifyPath("../../etc/passwd", "/srv/models")).ToNot(Succeed())
+		})
+
+		It("rejects the base path itself", func() {
+			// Documents that VerifyPath requires a strict descendant: an
+			// empty user input resolves to the base directory and is
+			// rejected, which is the safer default for a download helper
+			// that expects a target file inside the base.
+			Expect(VerifyPath("", "/srv/models")).ToNot(Succeed())
+		})
+
+		It("treats an absolute-looking user input as relative to the base", func() {
+			// filepath.Join discards no segments here: the result is
+			// "/srv/models/etc/passwd", which is still inside the base.
+			// This protects callers that forward untrusted user paths
+			// directly to the verifier.
+			Expect(VerifyPath("/etc/passwd", "/srv/models")).To(Succeed())
+		})
+
+		It("is purely lexical and does not follow symlinks", func() {
+			// VerifyPath uses filepath.Clean, not filepath.EvalSymlinks,
+			// so a symlink that escapes the base is not detected here.
+			// Callers who must defend against symlink escapes need to
+			// EvalSymlinks before delegating to VerifyPath. This test
+			// pins the current contract so the trade-off stays explicit.
+			tmpDir := GinkgoT().TempDir()
+			base := filepath.Join(tmpDir, "base")
+			outside := filepath.Join(tmpDir, "outside")
+			Expect(os.Mkdir(base, 0o755)).To(Succeed())
+			Expect(os.Mkdir(outside, 0o755)).To(Succeed())
+			Expect(os.WriteFile(filepath.Join(outside, "secret.txt"), []byte("x"), 0o600)).To(Succeed())
+			Expect(os.Symlink(outside, filepath.Join(base, "escape"))).To(Succeed())
+
+			Expect(VerifyPath("escape/secret.txt", base)).To(Succeed())
+		})
+	})
+
+	Describe("InTrustedRoot", func() {
+		It("accepts a strict descendant of the trusted root", func() {
+			Expect(InTrustedRoot("/srv/models/file", "/srv/models")).To(Succeed())
+		})
+
+		It("accepts a deeply nested descendant", func() {
+			Expect(InTrustedRoot("/srv/models/a/b/c/file", "/srv/models")).To(Succeed())
+		})
+
+		It("rejects the trusted root itself", func() {
+			// The implementation walks up before comparing, so the input
+			// path must have at least one component beneath the root.
+			Expect(InTrustedRoot("/srv/models", "/srv/models")).ToNot(Succeed())
+		})
+
+		It("rejects a sibling directory that shares the parent", func() {
+			Expect(InTrustedRoot("/srv/other/file", "/srv/models")).ToNot(Succeed())
+		})
+
+		It("rejects an unrelated absolute path", func() {
+			Expect(InTrustedRoot("/etc/passwd", "/srv/models")).ToNot(Succeed())
+		})
+	})
+
+	Describe("SanitizeFileName", func() {
+		It("returns the original name when nothing is unsafe", func() {
+			Expect(SanitizeFileName("model.bin")).To(Equal("model.bin"))
+		})
+
+		It("strips leading directory components", func() {
+			Expect(SanitizeFileName("subdir/model.bin")).To(Equal("model.bin"))
+		})
+
+		It("strips absolute path prefixes", func() {
+			Expect(SanitizeFileName("/etc/passwd")).To(Equal("passwd"))
+		})
+
+		It("collapses parent-traversal sequences and keeps only the leaf", func() {
+			Expect(SanitizeFileName("../etc/passwd")).To(Equal("passwd"))
+		})
+
+		It("removes embedded .. sequences that Clean+Base alone do not catch", func() {
+			// After Clean+Base "foo..bar" survives unchanged; the explicit
+			// ReplaceAll on ".." in the implementation is the last line of
+			// defence against filenames that look benign but still contain
+			// traversal markers.
+			Expect(SanitizeFileName("foo..bar")).To(Equal("foobar"))
+		})
+
+		It("returns an empty string when the input is only a parent reference", func() {
+			Expect(SanitizeFileName("..")).To(Equal(""))
+		})
+	})
+
+	Describe("GenerateUniqueFileName", func() {
+		It("returns the bare filename when no collision exists", func() {
+			tmpDir := GinkgoT().TempDir()
+			Expect(GenerateUniqueFileName(tmpDir, "model", ".bin")).To(Equal("model.bin"))
+		})
+
+		It("suffixes with _2 when the bare filename already exists", func() {
+			tmpDir := GinkgoT().TempDir()
+			Expect(os.WriteFile(filepath.Join(tmpDir, "model.bin"), nil, 0o600)).To(Succeed())
+
+			Expect(GenerateUniqueFileName(tmpDir, "model", ".bin")).To(Equal("model_2.bin"))
+		})
+
+		It("advances the counter past every existing collision", func() {
+			tmpDir := GinkgoT().TempDir()
+			for _, name := range []string{"model.bin", "model_2.bin", "model_3.bin"} {
+				Expect(os.WriteFile(filepath.Join(tmpDir, name), nil, 0o600)).To(Succeed())
+			}
+
+			Expect(GenerateUniqueFileName(tmpDir, "model", ".bin")).To(Equal("model_4.bin"))
+		})
+
+		It("preserves an empty extension when generating the suffixed name", func() {
+			tmpDir := GinkgoT().TempDir()
+			Expect(os.WriteFile(filepath.Join(tmpDir, "README"), nil, 0o600)).To(Succeed())
+
+			Expect(GenerateUniqueFileName(tmpDir, "README", "")).To(Equal("README_2"))
+		})
+	})
+})
--- a/scripts/changed-backends.js
+++ b/scripts/changed-backends.js
@@ -18,6 +18,14 @@ function inferBackendPath(item) {
  if (item.dockerfile.endsWith("python")) {
    return `backend/python/${item.backend}/`;
  }
+  // parakeet-cpp is a Go backend (Dockerfile.golang) wrapping the parakeet.cpp
+  // ggml port via purego. It lives in backend/go/parakeet-cpp/; this explicit
+  // branch (placed before the generic golang one, which would also resolve it
+  // correctly) documents the mapping and guards against a future
+  // dockerfile-suffix change.
+  if (item.backend === "parakeet-cpp") {
+    return `backend/go/parakeet-cpp/`;
+  }
  if (item.dockerfile.endsWith("golang")) {
    return `backend/go/${item.backend}/`;
  }
--- a/tests/e2e/e2e_suite_test.go
+++ b/tests/e2e/e2e_suite_test.go
@@ -10,7 +10,7 @@ import (
 	"time"

 	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/application"
+	localaiapp "github.com/mudler/LocalAI/core/application"
 	"github.com/mudler/LocalAI/core/config"
 	httpapi "github.com/mudler/LocalAI/core/http"
 	"github.com/mudler/LocalAI/pkg/system"
@@ -41,6 +41,7 @@ var (
 	cloudProxyPath    string
 	mcpServerURL      string
 	mcpServerShutdown func()
+	localAIApp        *localaiapp.Application

 	// Cloud-proxy fake upstreams. Live for the whole suite so the four
 	// cloud-proxy model YAMLs can point at their URLs at startup time.
@@ -390,7 +391,7 @@ var _ = BeforeSuite(func() {
 	// Create application instance (GeneratedContentDir so sound-generation/TTS can write files the handler sends)
 	generatedDir := filepath.Join(tmpDir, "generated")
 	Expect(os.MkdirAll(generatedDir, 0750)).To(Succeed())
-	application, err := application.New(
+	localAIApp, err = localaiapp.New(
 		config.WithContext(appCtx),
 		config.WithSystemState(systemState),
 		config.WithDebug(true),
@@ -399,14 +400,14 @@ var _ = BeforeSuite(func() {
 	Expect(err).ToNot(HaveOccurred())

 	// Register mock backend (always available for non-realtime tests).
-	application.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
-	application.ModelLoader().SetExternalBackend("opus", mockBackendPath)
+	localAIApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)
+	localAIApp.ModelLoader().SetExternalBackend("opus", mockBackendPath)
 	if cloudProxyPath != "" {
-		application.ModelLoader().SetExternalBackend("cloud-proxy", cloudProxyPath)
+		localAIApp.ModelLoader().SetExternalBackend("cloud-proxy", cloudProxyPath)
 	}

 	// Create HTTP app
-	app, err = httpapi.API(application)
+	app, err = httpapi.API(localAIApp)
 	Expect(err).ToNot(HaveOccurred())

 	// Get free port
@@ -436,6 +437,14 @@ var _ = BeforeSuite(func() {
 })

 var _ = AfterSuite(func() {
+	// Synchronous shutdown — the context-cancel goroutine in application.New
+	// runs the same cleanup asynchronously, which races test-binary exit and
+	// orphans spawned mock-backend children to init.
+	if localAIApp != nil {
+		if err := localAIApp.Shutdown(); err != nil {
+			xlog.Error("error shutting down application", "error", err)
+		}
+	}
 	if appCancel != nil {
 		appCancel()
 	}