docs(llama-cpp): correct run.sh comment for arm64/darwin cpu-all

arm64 and darwin CPU images now also ship llama-cpp-cpu-all (not fallback-only); only GPU images ship fallback-only. Fix the stale comment to match. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
fix(llama-cpp,turboquant): only CPU_ALL_VARIANTS for pure-CPU builds, GPU uses fallback
2026-06-25 09:09:07 -04:00 · 2026-06-25 07:05:06 +00:00 · 2026-06-25 07:04:06 +00:00 · 2026-06-24 21:59:29 +00:00 · 2026-06-24 21:50:29 +00:00 · 2026-06-24 21:33:32 +00:00
115 changed files with 423 additions and 2154 deletions
--- a/.docker/llama-cpp-compile.sh
+++ b/.docker/llama-cpp-compile.sh
@@ -17,19 +17,29 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
  rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
 fi

-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
+cd /LocalAI/backend/cpp/llama-cpp
+if [ -z "${BUILD_TYPE:-}" ]; then
+  # Pure CPU image (BUILD_TYPE empty): one build with ggml CPU_ALL_VARIANTS replaces the
+  # per-microarch binaries (x86: avx/avx2/avx512/fallback; arm64: armv8.x/armv9.x). ggml
+  # dlopens the best libggml-cpu-*.so at runtime by probing host CPU features.
+  #
+  # arm64: the CPU_ALL_VARIANTS table includes armv9.2 SME variants whose -march=...+sme is
+  # rejected by the Ubuntu 24.04 default gcc-13. gcc-14 accepts it, so build the arm64
+  # variants with it (the host never *selects* SME unless it has it, but every variant must
+  # still compile).
+  if [ "${TARGETARCH}" = "arm64" ]; then
+    apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
+    export CC=gcc-14 CXX=g++-14
+  fi
+  make llama-cpp-cpu-all
 else
-  cd /LocalAI/backend/cpp/llama-cpp
-  make llama-cpp-avx
-  make llama-cpp-avx2
-  make llama-cpp-avx512
+  # GPU build (cublas/hipblas/sycl/vulkan/...): the accelerator does the compute, so a
+  # single fallback CPU build is enough - no per-microarch CPU variants needed. (This also
+  # keeps the heavy GPU backend compile from building the whole CPU variant matrix,
+  # and avoids the gcc-14 apt step on GPU base images such as nvidia l4t.)
  make llama-cpp-fallback
-  make llama-cpp-grpc
-  make llama-cpp-rpc-server
 fi
+make llama-cpp-grpc
+make llama-cpp-rpc-server

 ccache -s || true
--- a/.docker/turboquant-compile.sh
+++ b/.docker/turboquant-compile.sh
@@ -19,17 +19,21 @@ fi

 cd /LocalAI/backend/cpp/turboquant

-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
-  make turboquant-fallback
-  make turboquant-grpc
-  make turboquant-rpc-server
+if [ -z "${BUILD_TYPE:-}" ]; then
+  # Pure CPU image: one ggml CPU_ALL_VARIANTS build replaces the per-microarch binaries.
+  # arm64: the armv9.2 SME variants need gcc-14 (gcc-13 rejects +sme).
+  if [ "${TARGETARCH}" = "arm64" ]; then
+    apt-get update -qq && apt-get install -y -qq gcc-14 g++-14
+    export CC=gcc-14 CXX=g++-14
+  fi
+  make turboquant-cpu-all
 else
-  make turboquant-avx
-  make turboquant-avx2
-  make turboquant-avx512
+  # GPU build (cublas/hipblas/sycl/vulkan/...): single fallback CPU build, the accelerator
+  # does the compute. Keeps the GPU compile from also building the CPU variant matrix and
+  # avoids the gcc-14 apt step on GPU base images such as nvidia l4t.
  make turboquant-fallback
-  make turboquant-grpc
-  make turboquant-rpc-server
 fi
+make turboquant-grpc
+make turboquant-rpc-server

 ccache -s || true
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -4974,12 +4974,6 @@ includeDarwin:
  - backend: "kitten-tts"
    tag-suffix: "-metal-darwin-arm64-kitten-tts"
    build-type: "mps"
-  - backend: "trl"
-    tag-suffix: "-metal-darwin-arm64-trl"
-    build-type: "mps"
-  - backend: "liquid-audio"
-    tag-suffix: "-metal-darwin-arm64-liquid-audio"
-    build-type: "mps"
  - backend: "piper"
    tag-suffix: "-metal-darwin-arm64-piper"
    build-type: "metal"
@@ -4996,10 +4990,6 @@ includeDarwin:
    tag-suffix: "-metal-darwin-arm64-sherpa-onnx"
    build-type: "metal"
    lang: "go"
-  - backend: "supertonic"
-    tag-suffix: "-metal-darwin-arm64-supertonic"
-    build-type: "metal"
-    lang: "go"
  - backend: "local-store"
    tag-suffix: "-metal-darwin-arm64-local-store"
    build-type: "metal"
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=d5507e33ae7ee2b7b41475f08044d3bde3b839ee
+IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -50,8 +50,13 @@ add_custom_command(
        "${hw_proto}"
      DEPENDS "${hw_proto}")

-# hw_grpc_proto
-add_library(hw_grpc_proto
+# hw_grpc_proto: force STATIC. Under the CPU_ALL_VARIANTS build BUILD_SHARED_LIBS=ON
+# (ggml/llama become shared), which would otherwise make this glue library a DSO. As a
+# DSO it references the hidden-visibility symbols in the static libprotobuf.a, which the
+# linker cannot satisfy ("hidden symbol ... in libprotobuf.a is referenced by DSO").
+# Keeping it STATIC links protobuf/gRPC directly into the grpc-server executable while
+# only ggml/llama stay shared. No effect on the static variants (already BUILD_SHARED_LIBS=OFF).
+add_library(hw_grpc_proto STATIC
  ${hw_grpc_srcs}
  ${hw_grpc_hdrs}
  ${hw_proto_srcs}
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=8be759e6f70d629638a7eb70db3824cbdcea370b
+LLAMA_VERSION?=be4a6a63eb2b848e19c277bdcf2bd399e8af76d9
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
@@ -10,8 +10,16 @@ TARGET?=--target grpc-server
 JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
 ARCH?=$(shell uname -m)

-# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+# Shared libs default to OFF: we link static gRPC and the avx/avx2/avx512/fallback
+# variants are fully static. The CPU_ALL_VARIANTS build flips SHARED_LIBS=ON (ggml/llama
+# become shared so the dynamic CPU backends work; gRPC stays static via its imported
+# targets). SHARED_LIBS is a make variable, not an appended -D, so it survives the
+# recursive sub-make into the VARIANT build dir (which re-parses this Makefile) instead
+# of being re-clobbered by a second -DBUILD_SHARED_LIBS=OFF. EXTRA_CMAKE_ARGS is the hook
+# the CPU_ALL_VARIANTS target uses to inject -DGGML_BACKEND_DL/-DGGML_CPU_ALL_VARIANTS.
+SHARED_LIBS?=OFF
+EXTRA_CMAKE_ARGS?=
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=$(SHARED_LIBS) -DLLAMA_CURL=OFF $(EXTRA_CMAKE_ARGS)

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
@@ -120,6 +128,30 @@ llama-cpp-fallback: llama.cpp
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback

+# Single-build CPU backend using ggml's CPU_ALL_VARIANTS. Produces ONE grpc-server
+# plus a set of dlopen-able libggml-cpu-*.so (sandybridge/haswell/skylakex/...) that
+# ggml's backend registry selects from at runtime by probing host CPU features.
+# Replaces the avx/avx2/avx512/fallback multi-binary build on x86 (and fallback-only on arm64).
+#
+# CPU_ALL_VARIANTS requires GGML_BACKEND_DL, which requires BUILD_SHARED_LIBS=ON, so we
+# pass SHARED_LIBS=ON and the DL flags as make variables (NOT pre-expanded into the
+# CMAKE_ARGS env string): command-line make variables propagate through every recursive
+# sub-make, so the deepest VARIANT-dir build computes BUILD_SHARED_LIBS=ON consistently.
+# Only ggml/llama go shared - gRPC is found via its static imported targets, so the
+# grpc-server binary keeps static gRPC and only dynamically links ggml.
+#
+# TARGET adds "ggml": the per-microarch backends are runtime-dlopened, not link deps of
+# grpc-server, so they only build because each is an add_dependencies() of the ggml target.
+llama-cpp-cpu-all: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build purge
+	$(info ${GREEN}I llama-cpp build info:cpu-all-variants${RESET})
+	$(MAKE) SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" VARIANT="llama-cpp-cpu-all-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/grpc-server llama-cpp-cpu-all
+	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
+	find $(CURRENT_MAKEFILE_DIR)/../llama-cpp-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
+	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
+
 llama-cpp-grpc: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -37,7 +37,6 @@
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "common.h"
-#include "arg.h"
 #include "chat-auto-parser.h"
 #include <getopt.h>
 #include <grpcpp/ext/proto_server_reflection_plugin.h>
@@ -593,10 +592,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
    params.checkpoint_min_step = 256;
 #endif

-    // Raw upstream llama-server flags collected from any option entry that
-    // starts with '-'. Applied once after the loop via common_params_parse.
-    std::vector<std::string> extra_argv;
-
     // decode options. Options are in form optname:optvale, or if booleans only optname.
    for (int i = 0; i < request->options_size(); i++) {
        std::string opt = request->options(i);
@@ -1085,31 +1080,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                } catch (...) {}
            }

-        // --- main model MoE on CPU (upstream --cpu-moe / --n-cpu-moe) ---
-        } else if (!strcmp(optname, "cpu_moe")) {
-            // Bool-style flag: keep all MoE expert weights on CPU.
-            const bool enable = (optval == NULL) ||
-                optval_str == "true" || optval_str == "1" || optval_str == "yes" ||
-                optval_str == "on" || optval_str == "enabled";
-            if (enable) {
-                params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
-            }
-        } else if (!strcmp(optname, "n_cpu_moe")) {
-            if (optval != NULL) {
-                try {
-                    int n = std::stoi(optval_str);
-                    if (n < 0) n = 0;
-                    // Keep override-name storage alive for the lifetime of the
-                    // params struct (mirrors upstream arg.cpp's function-local static).
-                    static std::list<std::string> buft_overrides_main;
-                    for (int i = 0; i < n; ++i) {
-                        buft_overrides_main.push_back(llm_ffn_exps_block_regex(i));
-                        params.tensor_buft_overrides.push_back(
-                            {buft_overrides_main.back().c_str(), ggml_backend_cpu_buffer_type()});
-                    }
-                } catch (...) {}
-            }
-
        // --- draft model tensor buffer overrides (upstream --spec-draft-override-tensor) ---
        } else if (!strcmp(optname, "draft_override_tensor") || !strcmp(optname, "spec_draft_override_tensor")) {
            // Format: <tensor regex>=<buffer type>,<tensor regex>=<buffer type>,...
@@ -1141,30 +1111,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                else { cur.push_back(c); }
            }
            if (!cur.empty()) flush(cur);
-
-        // --- generic passthrough: any entry starting with '-' is a raw
-        //     upstream llama-server flag, forwarded verbatim to the parser. ---
-        } else if (optname[0] == '-') {
-            std::string flag = optname;
-            // These flags make upstream's parser exit() (printing usage /
-            // completion), which would kill the backend process. Skip them.
-            if (flag == "-h" || flag == "--help" || flag == "--usage" ||
-                flag == "--version" || flag == "--license" ||
-                flag == "--list-devices" || flag == "-cl" ||
-                flag == "--cache-list" ||
-                flag.rfind("--completion", 0) == 0) {
-                fprintf(stderr,
-                    "[llama-cpp] ignoring passthrough flag that would exit: %s\n",
-                    flag.c_str());
-            } else {
-                extra_argv.push_back(flag);
-                // Preserve the whole value after the first ':' so embedded
-                // colons (e.g. host:port) survive strtok's truncation of optval.
-                auto colon = opt.find(':');
-                if (colon != std::string::npos) {
-                    extra_argv.push_back(opt.substr(colon + 1));
-                }
-            }
        }
    }

@@ -1200,6 +1146,27 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
        }
    }

+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back();
+        params.kv_overrides.back().key[0] = 0;
+    }
+
+    // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp).
+    // Real entries are pushed during option parsing; here we pad/terminate so the
+    // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543)
+    // and so llama_params_fit has the placeholder slots it requires.
+    {
+        const size_t ntbo = llama_max_tensor_buft_overrides();
+        while (params.tensor_buft_overrides.size() < ntbo) {
+            params.tensor_buft_overrides.push_back({nullptr, nullptr});
+        }
+    }
+    // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
+    // the main-model handling above.
+    if (!params.speculative.draft.tensor_buft_overrides.empty()) {
+        params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
    // TODO: Add yarn

    if (!request->tensorsplit().empty()) {
@@ -1292,69 +1259,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
            params.sampling.grammar_triggers.push_back(std::move(trigger));
        }
    }
-
-    // Apply any raw upstream flags last so an explicit passthrough flag wins
-    // over the LocalAI-resolved field it maps to (e.g. --ctx-size beats
-    // context_size). This is the same parser llama-server itself uses.
-    if (!extra_argv.empty()) {
-        // common_params_parser_init resets a few fields for the SERVER example
-        // (n_parallel -> -1, use_color). Snapshot n_parallel so an unrelated
-        // passthrough flag can't silently clobber LocalAI's resolved value.
-        const int saved_n_parallel = params.n_parallel;
-
-        std::vector<char *> argv;
-        std::string prog = "llama-server";
-        argv.push_back(prog.data());
-        for (auto & a : extra_argv) {
-            argv.push_back(a.data());
-        }
-
-        // ctx_arg.params is a reference, so this overlays the given flags onto
-        // `params` in place. Returns false on a recoverable parse error (and
-        // self-restores params); may exit() on a hard error, exactly as
-        // passing the same bad flag to llama-server would.
-        if (!common_params_parse((int)argv.size(), argv.data(), params,
-                                 LLAMA_EXAMPLE_SERVER)) {
-            fprintf(stderr,
-                "[llama-cpp] failed to parse passthrough options; ignoring them\n");
-        }
-
-        // Restore n_parallel unless a passthrough flag explicitly set it
-        // (parser_init's reset sentinel for SERVER is -1).
-        if (params.n_parallel == -1) {
-            params.n_parallel = saved_n_parallel;
-        }
-    }
-
-    // Terminate/pad the override vectors only after BOTH the named-option loop
-    // and the generic passthrough (common_params_parse above) have pushed their
-    // real entries, so back() is the null sentinel the model loader asserts on.
-    // Running these before the passthrough let a passthrough flag (--cpu-moe,
-    // --override-tensor, --override-kv, ...) append a real entry after the
-    // sentinel: a GGML_ASSERT crash for tensor_buft_overrides, a silent drop for
-    // kv_overrides. Double-termination is harmless (the while is a no-op if the
-    // passthrough parse already padded; an extra trailing null is ignored).
-
-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back();
-        params.kv_overrides.back().key[0] = 0;
-    }
-
-    // tensor_buft_overrides sentinel termination (mirrors upstream common/arg.cpp).
-    // Real entries are pushed during option parsing; here we pad/terminate so the
-    // model loader sees back().pattern == nullptr (GGML_ASSERT at common.cpp:1543)
-    // and so llama_params_fit has the placeholder slots it requires.
-    {
-        const size_t ntbo = llama_max_tensor_buft_overrides();
-        while (params.tensor_buft_overrides.size() < ntbo) {
-            params.tensor_buft_overrides.push_back({nullptr, nullptr});
-        }
-    }
-    // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
-    // the main-model handling above.
-    if (!params.speculative.draft.tensor_buft_overrides.empty()) {
-        params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
-    }
 }


--- a/backend/cpp/llama-cpp/package.sh
+++ b/backend/cpp/llama-cpp/package.sh
@@ -14,6 +14,22 @@ mkdir -p $CURDIR/package/lib
 cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
 cp -rfv $CURDIR/run.sh $CURDIR/package/

+# Bundle the ggml shared backends produced by the CPU_ALL_VARIANTS build (libggml-base.so,
+# libggml.so, libllama.so and the per-microarch libggml-cpu-*.so), all into package/lib.
+#
+# Two distinct resolution mechanisms both land here:
+#   - NEEDED deps (libggml-base/libggml/libllama): resolved by the dynamic linker via the
+#     LD_LIBRARY_PATH=$CURDIR/lib that run.sh exports.
+#   - The per-microarch libggml-cpu-*.so are NOT linked; ggml *discovers* them at runtime by
+#     scanning the executable's own directory (readlink /proc/self/exe). run.sh launches via
+#     the bundled $CURDIR/lib/ld.so, so /proc/self/exe -> .../lib/ld.so and ggml scans lib/.
+#     That is why the variants must sit in lib/ (next to ld.so), not just on the link path.
+# No-op on GPU builds (cublas/hipblas/sycl/vulkan/...), which ship only llama-cpp-fallback.
+if [ -d "$CURDIR/ggml-shared-libs" ]; then
+    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
+    cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
+fi
+
 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    # x86_64 architecture
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -12,26 +12,12 @@ grep -e "flags" /proc/cpuinfo | head -1

 BINARY=llama-cpp-fallback

-if grep -q -e "\savx\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX    found OK"
-	if [ -e $CURDIR/llama-cpp-avx ]; then
-		BINARY=llama-cpp-avx
-	fi
-fi
-
-if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX2   found OK"
-	if [ -e $CURDIR/llama-cpp-avx2 ]; then
-		BINARY=llama-cpp-avx2
-	fi
-fi
-
-# Check avx 512
-if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX512F found OK"
-	if [ -e $CURDIR/llama-cpp-avx512 ]; then
-		BINARY=llama-cpp-avx512
-	fi
+# CPU images (x86, arm64, darwin) ship a single llama-cpp-cpu-all built with ggml
+# CPU_ALL_VARIANTS: ggml's backend registry dlopens the best libggml-cpu-*.so for this
+# host, so no shell-side AVX probing. GPU images (cublas/sycl/vulkan/hipblas) ship only
+# llama-cpp-fallback (the accelerator does the compute), so fall back to it when absent.
+if [ -e $CURDIR/llama-cpp-cpu-all ]; then
+	BINARY=llama-cpp-cpu-all
 fi

 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
--- a/backend/cpp/turboquant/Makefile
+++ b/backend/cpp/turboquant/Makefile
@@ -65,6 +65,29 @@ turboquant-avx:
 turboquant-fallback:
 	$(call turboquant-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)

+# Single-build CPU backend via ggml CPU_ALL_VARIANTS (mirrors llama-cpp-cpu-all).
+# turboquant reuses backend/cpp/llama-cpp's CMakeLists.txt (hw_grpc_proto STATIC) and
+# Makefile (SHARED_LIBS make-var + EXTRA_CMAKE_ARGS), so this passes the same overrides
+# through to the copied build: SHARED_LIBS=ON, the DL flags, and --target ggml (which
+# pulls in the per-microarch libggml-cpu-*.so via ggml's add_dependencies). The .so set
+# is collected for package.sh to bundle into package/lib.
+turboquant-cpu-all:
+	rm -rf $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
+	cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build purge
+	bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server.cpp
+	$(info $(GREEN)I turboquant build info:cpu-all-variants$(RESET))
+	LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build llama.cpp
+	bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp $(PATCHES_DIR)
+	SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" \
+	LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(TURBOQUANT_VERSION) \
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/grpc-server turboquant-cpu-all
+	rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs
+	find $(CURRENT_MAKEFILE_DIR)/../turboquant-cpu-all-build/llama.cpp/build \( -name '*.so*' -o -name '*.dylib' \) -exec cp -av {} ggml-shared-libs/ \;
+	@echo "Collected ggml shared backends:" && ls -la ggml-shared-libs/
+
 turboquant-grpc:
 	$(call turboquant-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)

--- a/backend/cpp/turboquant/package.sh
+++ b/backend/cpp/turboquant/package.sh
@@ -14,6 +14,15 @@ mkdir -p $CURDIR/package/lib
 cp -avrf $CURDIR/turboquant-* $CURDIR/package/
 cp -rfv $CURDIR/run.sh $CURDIR/package/

+# Bundle the ggml shared backends from the CPU_ALL_VARIANTS build into package/lib. ggml
+# discovers the per-microarch libggml-cpu-*.so by scanning the executable directory, which
+# (via the bundled lib/ld.so that run.sh launches through) resolves to lib/. See the
+# matching comment in backend/cpp/llama-cpp/package.sh. No-op on fallback-only (GPU) builds.
+if [ -d "$CURDIR/ggml-shared-libs" ]; then
+    echo "Bundling ggml shared backends (CPU_ALL_VARIANTS)..."
+    cp -avf $CURDIR/ggml-shared-libs/*.so* $CURDIR/package/lib/
+fi
+
 # Detect architecture and copy appropriate libraries
 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    # x86_64 architecture
--- a/backend/cpp/turboquant/run.sh
+++ b/backend/cpp/turboquant/run.sh
@@ -12,26 +12,11 @@ grep -e "flags" /proc/cpuinfo | head -1

 BINARY=turboquant-fallback

-if grep -q -e "\savx\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX    found OK"
-	if [ -e $CURDIR/turboquant-avx ]; then
-		BINARY=turboquant-avx
-	fi
-fi
-
-if grep -q -e "\savx2\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX2   found OK"
-	if [ -e $CURDIR/turboquant-avx2 ]; then
-		BINARY=turboquant-avx2
-	fi
-fi
-
-# Check avx 512
-if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
-	echo "CPU:    AVX512F found OK"
-	if [ -e $CURDIR/turboquant-avx512 ]; then
-		BINARY=turboquant-avx512
-	fi
+# CPU images (x86/arm64) ship a single turboquant-cpu-all built with ggml CPU_ALL_VARIANTS:
+# ggml's backend registry dlopens the best libggml-cpu-*.so for this host, so no shell-side
+# probing. GPU images ship only turboquant-fallback, so fall back to it when cpu-all is absent.
+if [ -e $CURDIR/turboquant-cpu-all ]; then
+	BINARY=turboquant-cpu-all
 fi

 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
--- a/backend/go/acestep-cpp/Makefile
+++ b/backend/go/acestep-cpp/Makefile
@@ -117,8 +117,7 @@ libgoacestepcpp-custom: CMakeLists.txt cpp/goacestepcpp.cpp cpp/goacestepcpp.h
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) --target goacestepcpp && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/libgoacestepcpp.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/libgoacestepcpp.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/libgoacestepcpp.so ./$(SO_TARGET)

 test: acestep-cpp
 	@echo "Running acestep-cpp tests..."
--- a/backend/go/acestep-cpp/main.go
+++ b/backend/go/acestep-cpp/main.go
@@ -4,7 +4,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -23,11 +22,7 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("ACESTEP_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libgoacestepcpp-fallback.dylib"
-		} else {
-			libName = "./libgoacestepcpp-fallback.so"
-		}
+		libName = "./libgoacestepcpp-fallback.so"
 	}

 	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/acestep-cpp/package.sh
+++ b/backend/go/acestep-cpp/package.sh
@@ -13,7 +13,6 @@ mkdir -p $CURDIR/package/lib

 cp -avf $CURDIR/acestep-cpp $CURDIR/package/
 cp -fv $CURDIR/libgoacestepcpp-*.so $CURDIR/package/
-cp -fv $CURDIR/libgoacestepcpp-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/

 # Detect architecture and copy appropriate libraries
--- a/backend/go/acestep-cpp/run.sh
+++ b/backend/go/acestep-cpp/run.sh
@@ -12,19 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single library variant (Metal or Accelerate). The goacestepcpp
-	# target is built as a CMake MODULE, which emits a .dylib for a SHARED
-	# build but a .so for a MODULE build on Apple, so prefer .dylib and fall
-	# back to .so.
-	LIBRARY="$CURDIR/libgoacestepcpp-fallback.dylib"
-	if [ ! -e "$LIBRARY" ]; then
-		LIBRARY="$CURDIR/libgoacestepcpp-fallback.so"
-	fi
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libgoacestepcpp-fallback.so"
+LIBRARY="$CURDIR/libgoacestepcpp-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgoacestepcpp-avx.so ]; then
@@ -46,10 +36,9 @@ else
 			LIBRARY="$CURDIR/libgoacestepcpp-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export ACESTEP_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/go/ced/Makefile
+++ b/backend/go/ced/Makefile
@@ -57,7 +57,6 @@ libced.so: sources/ced.cpp
 	cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
 	cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
 	cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
-	cp -fv sources/ced.cpp/build-shared/libced.dylib ./ 2>/dev/null || true
 	cp -fv sources/ced.cpp/include/ced_capi.h ./

 ced-grpc: libced.so main.go goced.go
--- a/backend/go/ced/main.go
+++ b/backend/go/ced/main.go
@@ -12,7 +12,6 @@ import (
 	"flag"
 	"fmt"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -28,11 +27,7 @@ type libFunc struct {
 func main() {
 	libName := os.Getenv("CED_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "libced.dylib"
-		} else {
-			libName = "libced.so"
-		}
+		libName = "libced.so"
 	}
 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
--- a/backend/go/ced/package.sh
+++ b/backend/go/ced/package.sh
@@ -15,12 +15,10 @@ mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

-cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || true
-cp -avf "$CURDIR"/libced.dylib "$CURDIR/package/lib/" 2>/dev/null || true
-if ! ls "$CURDIR"/package/lib/libced.* >/dev/null 2>&1; then
-	echo "ERROR: libced shared library not found in $CURDIR, run 'make' first" >&2
+cp -avf "$CURDIR"/libced.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
-fi
+}

 if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
--- a/backend/go/ced/run.sh
+++ b/backend/go/ced/run.sh
@@ -3,12 +3,7 @@ set -e

 CURDIR=$(dirname "$(realpath "$0")")

-if [ "$(uname)" = "Darwin" ]; then
-	export DYLD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${DYLD_LIBRARY_PATH:-}"
-	export CED_LIBRARY="$CURDIR/lib/libced.dylib"
-else
-	export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
-fi
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"

 # If a self-contained ld.so was packaged, route through it so the packaged
 # libc / libstdc++ are used instead of the host's (matches the sibling backends).
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -75,8 +75,7 @@ UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgocrispasr-avx.so libgocrispasr-avx2.so libgocrispasr-avx512.so libgocrispasr-fallback.so
 else
-	# On non-Linux (e.g., Darwin), build only fallback variant (as a dylib)
-	VARIANT_TARGETS = libgocrispasr-fallback.dylib
+	VARIANT_TARGETS = libgocrispasr-fallback.so
 endif

 crispasr: main.go gocrispasr.go $(VARIANT_TARGETS)
@@ -88,7 +87,7 @@ package: crispasr
 build: package

 clean: purge
-	rm -rf libgocrispasr*.so libgocrispasr*.dylib package sources/CrispASR crispasr
+	rm -rf libgocrispasr*.so package sources/CrispASR crispasr

 purge:
 	rm -rf build*
@@ -119,21 +118,13 @@ libgocrispasr-fallback.so: sources/CrispASR
 	SO_TARGET=libgocrispasr-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgocrispasr-custom
 	rm -rfv build*

-# Build fallback variant as a dylib (Darwin)
-libgocrispasr-fallback.dylib: sources/CrispASR
-	$(MAKE) purge
-	$(info ${GREEN}I crispasr build info:fallback (dylib)${RESET})
-	SO_TARGET=libgocrispasr-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgocrispasr-custom
-	rm -rfv build*
-
 libgocrispasr-custom: CMakeLists.txt cpp/crispasr_shim.cpp cpp/crispasr_shim.h
 	mkdir -p build-$(SO_TARGET) && \
 	cd build-$(SO_TARGET) && \
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/libgocrispasr.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/libgocrispasr.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/libgocrispasr.so ./$(SO_TARGET)

 test: crispasr
 	CGO_ENABLED=0 $(GOCMD) test -v ./...
--- a/backend/go/crispasr/main.go
+++ b/backend/go/crispasr/main.go
@@ -4,7 +4,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,11 +21,7 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("CRISPASR_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libgocrispasr-fallback.dylib"
-		} else {
-			libName = "./libgocrispasr-fallback.so"
-		}
+		libName = "./libgocrispasr-fallback.so"
 	}

 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/crispasr/package.sh
+++ b/backend/go/crispasr/package.sh
@@ -12,8 +12,7 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib

 cp -avf $CURDIR/crispasr $CURDIR/package/
-cp -fv $CURDIR/libgocrispasr-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/libgocrispasr-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgocrispasr-*.so $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

 # Detect architecture and copy appropriate libraries
--- a/backend/go/crispasr/run.sh
+++ b/backend/go/crispasr/run.sh
@@ -12,13 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/libgocrispasr-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libgocrispasr-fallback.so"
+LIBRARY="$CURDIR/libgocrispasr-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgocrispasr-avx.so ]; then
@@ -40,10 +36,9 @@ else
 			LIBRARY="$CURDIR/libgocrispasr-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export CRISPASR_LIBRARY=$LIBRARY

 # Point piper's espeak-ng phonemizer at the bundled voice data. The variable
--- a/backend/go/depth-anything-cpp/Makefile
+++ b/backend/go/depth-anything-cpp/Makefile
@@ -77,7 +77,7 @@ ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libdepthanythingcpp-avx.so libdepthanythingcpp-avx2.so libdepthanythingcpp-avx512.so libdepthanythingcpp-fallback.so
 else
 	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = libdepthanythingcpp-fallback.dylib
+	VARIANT_TARGETS = libdepthanythingcpp-fallback.so
 endif

 depth-anything-cpp: main.go godepthanythingcpp.go $(VARIANT_TARGETS)
@@ -89,7 +89,7 @@ package: depth-anything-cpp
 build: package

 clean: purge
-	rm -rf libdepthanythingcpp*.so libdepthanythingcpp*.dylib depth-anything-cpp package sources
+	rm -rf libdepthanythingcpp*.so depth-anything-cpp package sources

 purge:
 	rm -rf build*
@@ -116,19 +116,11 @@ libdepthanythingcpp-avx512.so: sources/depth-anything.cpp
 endif

 # Build fallback variant (all platforms)
-ifeq ($(UNAME_S),Darwin)
-libdepthanythingcpp-fallback.dylib: sources/depth-anything.cpp
-	rm -rfv build-$@
-	$(info ${GREEN}I depth-anything-cpp build info:fallback${RESET})
-	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libdepthanythingcpp-custom
-	rm -rfv build-$@
-else
 libdepthanythingcpp-fallback.so: sources/depth-anything.cpp
 	rm -rfv build-$@
 	$(info ${GREEN}I depth-anything-cpp build info:fallback${RESET})
 	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libdepthanythingcpp-custom
 	rm -rfv build-$@
-endif

 libdepthanythingcpp-custom: CMakeLists.txt
 	mkdir -p build-$(SO_TARGET) && \
@@ -136,8 +128,7 @@ libdepthanythingcpp-custom: CMakeLists.txt
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/libdepthanything.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/libdepthanything.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/libdepthanything.so ./$(SO_TARGET)

 all: depth-anything-cpp package

--- a/backend/go/depth-anything-cpp/main.go
+++ b/backend/go/depth-anything-cpp/main.go
@@ -9,7 +9,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -28,11 +27,7 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("DEPTHANYTHING_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libdepthanythingcpp-fallback.dylib"
-		} else {
-			libName = "./libdepthanythingcpp-fallback.so"
-		}
+		libName = "./libdepthanythingcpp-fallback.so"
 	}

 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/depth-anything-cpp/package.sh
+++ b/backend/go/depth-anything-cpp/package.sh
@@ -10,8 +10,7 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib

-cp -fv $CURDIR/libdepthanythingcpp-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/libdepthanythingcpp-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -avf $CURDIR/libdepthanythingcpp-*.so $CURDIR/package/
 cp -avf $CURDIR/depth-anything-cpp $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

--- a/backend/go/depth-anything-cpp/run.sh
+++ b/backend/go/depth-anything-cpp/run.sh
@@ -12,13 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/libdepthanythingcpp-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libdepthanythingcpp-fallback.so"
+LIBRARY="$CURDIR/libdepthanythingcpp-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libdepthanythingcpp-avx.so ]; then
@@ -40,10 +36,9 @@ else
 			LIBRARY="$CURDIR/libdepthanythingcpp-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export DEPTHANYTHING_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/go/localvqe/Makefile
+++ b/backend/go/localvqe/Makefile
@@ -67,9 +67,8 @@ $(LIB_SENTINEL): sources/LocalVQE
 	# that the loader picks at runtime. We must build every target — the
 	# default `--target localvqe_shared` drops these. CMAKE_LIBRARY_OUTPUT_DIRECTORY
 	# routes all of them into build/bin; copy them out next to the binary.
-	cp -P build/bin/liblocalvqe.so* . 2>/dev/null || cp -P build/bin/liblocalvqe.dylib . 2>/dev/null || cp -P build/liblocalvqe.so* . 2>/dev/null || cp -P build/liblocalvqe.dylib .
+	cp -P build/bin/liblocalvqe.so* . 2>/dev/null || cp -P build/liblocalvqe.so* .
 	cp -P build/bin/libggml*.so* . 2>/dev/null || true
-	cp -P build/bin/libggml*.dylib . 2>/dev/null || true
 	touch $(LIB_SENTINEL)

 liblocalvqe.so: $(LIB_SENTINEL)
--- a/backend/go/localvqe/main.go
+++ b/backend/go/localvqe/main.go
@@ -4,7 +4,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,11 +21,7 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("LOCALVQE_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./liblocalvqe.dylib"
-		} else {
-			libName = "./liblocalvqe.so"
-		}
+		libName = "./liblocalvqe.so"
 	}

 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/localvqe/package.sh
+++ b/backend/go/localvqe/package.sh
@@ -15,9 +15,7 @@ cp -avf $CURDIR/localvqe $CURDIR/package/
 # liblocalvqe.so* (with SOVERSION symlinks) and the libggml-*.so runtime
 # variants — LocalVQE picks the matching CPU variant at load time.
 cp -P $CURDIR/liblocalvqe.so* $CURDIR/package/ 2>/dev/null || true
-cp -P $CURDIR/liblocalvqe.dylib $CURDIR/package/ 2>/dev/null || true
 cp -P $CURDIR/libggml*.so* $CURDIR/package/ 2>/dev/null || true
-cp -P $CURDIR/libggml*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -fv $CURDIR/run.sh $CURDIR/package/

 # Detect architecture and copy appropriate libraries
--- a/backend/go/localvqe/run.sh
+++ b/backend/go/localvqe/run.sh
@@ -10,19 +10,8 @@ CURDIR=$(dirname "$(realpath $0)")
 # exec'ing the binary.
 cd "$CURDIR"

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: LocalVQE is built as a SHARED library, so dyld needs the .dylib +
-	# DYLD_LIBRARY_PATH. Prefer .dylib and fall back to .so just in case.
-	export DYLD_LIBRARY_PATH=$CURDIR:$CURDIR/lib:$DYLD_LIBRARY_PATH
-	LOCALVQE_LIBRARY=$CURDIR/liblocalvqe.dylib
-	if [ ! -e "$LOCALVQE_LIBRARY" ]; then
-		LOCALVQE_LIBRARY=$CURDIR/liblocalvqe.so
-	fi
-	export LOCALVQE_LIBRARY
-else
-	export LD_LIBRARY_PATH=$CURDIR:$CURDIR/lib:$LD_LIBRARY_PATH
-	export LOCALVQE_LIBRARY=$CURDIR/liblocalvqe.so
-fi
+export LD_LIBRARY_PATH=$CURDIR:$CURDIR/lib:$LD_LIBRARY_PATH
+export LOCALVQE_LIBRARY=$CURDIR/liblocalvqe.so

 if [ -f $CURDIR/lib/ld.so ]; then
 	echo "Using lib/ld.so"
--- a/backend/go/locate-anything-cpp/Makefile
+++ b/backend/go/locate-anything-cpp/Makefile
@@ -70,7 +70,7 @@ ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = liblocateanythingcpp-avx.so liblocateanythingcpp-avx2.so liblocateanythingcpp-avx512.so liblocateanythingcpp-fallback.so
 else
 	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = liblocateanythingcpp-fallback.dylib
+	VARIANT_TARGETS = liblocateanythingcpp-fallback.so
 endif

 locate-anything-cpp: main.go golocateanythingcpp.go $(VARIANT_TARGETS)
@@ -82,7 +82,7 @@ package: locate-anything-cpp
 build: package

 clean: purge
-	rm -rf liblocateanythingcpp*.so liblocateanythingcpp*.dylib locate-anything-cpp package sources
+	rm -rf liblocateanythingcpp*.so locate-anything-cpp package sources

 purge:
 	rm -rf build*
@@ -109,19 +109,11 @@ liblocateanythingcpp-avx512.so: sources/locate-anything.cpp
 endif

 # Build fallback variant (all platforms)
-ifeq ($(UNAME_S),Darwin)
-liblocateanythingcpp-fallback.dylib: sources/locate-anything.cpp
-	rm -rfv build-$@
-	$(info ${GREEN}I locate-anything-cpp build info:fallback${RESET})
-	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) liblocateanythingcpp-custom
-	rm -rfv build-$@
-else
 liblocateanythingcpp-fallback.so: sources/locate-anything.cpp
 	rm -rfv build-$@
 	$(info ${GREEN}I locate-anything-cpp build info:fallback${RESET})
 	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) liblocateanythingcpp-custom
 	rm -rfv build-$@
-endif

 liblocateanythingcpp-custom: CMakeLists.txt
 	mkdir -p build-$(SO_TARGET) && \
@@ -129,8 +121,7 @@ liblocateanythingcpp-custom: CMakeLists.txt
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/liblocateanythingcpp.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/liblocateanythingcpp.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/liblocateanythingcpp.so ./$(SO_TARGET)

 all: locate-anything-cpp package

--- a/backend/go/locate-anything-cpp/main.go
+++ b/backend/go/locate-anything-cpp/main.go
@@ -9,7 +9,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -28,11 +27,7 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("LOCATEANYTHING_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./liblocateanythingcpp-fallback.dylib"
-		} else {
-			libName = "./liblocateanythingcpp-fallback.so"
-		}
+		libName = "./liblocateanythingcpp-fallback.so"
 	}

 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/locate-anything-cpp/package.sh
+++ b/backend/go/locate-anything-cpp/package.sh
@@ -10,8 +10,7 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib

-cp -fv $CURDIR/liblocateanythingcpp-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/liblocateanythingcpp-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -avf $CURDIR/liblocateanythingcpp-*.so $CURDIR/package/
 cp -avf $CURDIR/locate-anything-cpp $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

--- a/backend/go/locate-anything-cpp/run.sh
+++ b/backend/go/locate-anything-cpp/run.sh
@@ -12,13 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/liblocateanythingcpp-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/liblocateanythingcpp-fallback.so"
+LIBRARY="$CURDIR/liblocateanythingcpp-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/liblocateanythingcpp-avx.so ]; then
@@ -40,10 +36,9 @@ else
 			LIBRARY="$CURDIR/liblocateanythingcpp-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export LOCATEANYTHING_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=0f37401bebe9b20c0160a888e592108fc1d17607
+OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
 SO_TARGET?=libgomnivoicecpp.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -65,8 +65,7 @@ UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgomnivoicecpp-avx.so libgomnivoicecpp-avx2.so libgomnivoicecpp-avx512.so libgomnivoicecpp-fallback.so
 else
-	# On non-Linux (e.g., Darwin), build only fallback variant (as a dylib)
-	VARIANT_TARGETS = libgomnivoicecpp-fallback.dylib
+	VARIANT_TARGETS = libgomnivoicecpp-fallback.so
 endif

 omnivoice-cpp: main.go gomnivoicecpp.go $(VARIANT_TARGETS)
@@ -78,7 +77,7 @@ package: omnivoice-cpp
 build: package

 clean: purge
-	rm -rf libgomnivoicecpp*.so libgomnivoicecpp*.dylib package sources/omnivoice.cpp omnivoice-cpp
+	rm -rf libgomnivoicecpp*.so package sources/omnivoice.cpp omnivoice-cpp

 purge:
 	rm -rf build*
@@ -107,20 +106,13 @@ libgomnivoicecpp-fallback.so: sources/omnivoice.cpp
 	SO_TARGET=libgomnivoicecpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
 	rm -rf build-libgomnivoicecpp-fallback.so

-# Build fallback variant as a dylib (Darwin)
-libgomnivoicecpp-fallback.dylib: sources/omnivoice.cpp
-	$(info ${GREEN}I omnivoice-cpp build info:fallback (dylib)${RESET})
-	SO_TARGET=libgomnivoicecpp-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgomnivoicecpp-custom
-	rm -rf build-libgomnivoicecpp-fallback.dylib
-
 libgomnivoicecpp-custom: CMakeLists.txt cpp/gomnivoicecpp.cpp cpp/gomnivoicecpp.h
 	mkdir -p build-$(SO_TARGET) && \
 	cd build-$(SO_TARGET) && \
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) --target gomnivoicecpp && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/libgomnivoicecpp.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/libgomnivoicecpp.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/libgomnivoicecpp.so ./$(SO_TARGET)

 test: omnivoice-cpp
 	@echo "Running omnivoice-cpp tests..."
--- a/backend/go/omnivoice-cpp/main.go
+++ b/backend/go/omnivoice-cpp/main.go
@@ -4,7 +4,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,11 +21,7 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("OMNIVOICE_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libgomnivoicecpp-fallback.dylib"
-		} else {
-			libName = "./libgomnivoicecpp-fallback.so"
-		}
+		libName = "./libgomnivoicecpp-fallback.so"
 	}

 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/omnivoice-cpp/package.sh
+++ b/backend/go/omnivoice-cpp/package.sh
@@ -12,8 +12,7 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib

 cp -avf $CURDIR/omnivoice-cpp $CURDIR/package/
-cp -fv $CURDIR/libgomnivoicecpp-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/libgomnivoicecpp-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgomnivoicecpp-*.so $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

 # Detect architecture and copy appropriate libraries
--- a/backend/go/omnivoice-cpp/run.sh
+++ b/backend/go/omnivoice-cpp/run.sh
@@ -12,13 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/libgomnivoicecpp-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libgomnivoicecpp-fallback.so"
+LIBRARY="$CURDIR/libgomnivoicecpp-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgomnivoicecpp-avx.so ]; then
@@ -40,10 +36,9 @@ else
 			LIBRARY="$CURDIR/libgomnivoicecpp-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export OMNIVOICE_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/go/parakeet-cpp/Makefile
+++ b/backend/go/parakeet-cpp/Makefile
@@ -74,7 +74,6 @@ libparakeet.so: sources/parakeet.cpp
 	cmake -B sources/parakeet.cpp/build-shared -S sources/parakeet.cpp $(CMAKE_ARGS)
 	cmake --build sources/parakeet.cpp/build-shared --config Release -j$(JOBS)
 	cp -fv sources/parakeet.cpp/build-shared/libparakeet.so* ./ 2>/dev/null || true
-	cp -fv sources/parakeet.cpp/build-shared/libparakeet.dylib ./ 2>/dev/null || true
 	cp -fv sources/parakeet.cpp/include/parakeet_capi.h ./

 parakeet-cpp-grpc: libparakeet.so main.go goparakeetcpp.go
--- a/backend/go/parakeet-cpp/main.go
+++ b/backend/go/parakeet-cpp/main.go
@@ -2,17 +2,15 @@ package main

 // Started internally by LocalAI - one gRPC server per loaded model.
 //
-// Loads the parakeet shared library via purego and registers the flat
-// C-API entry points declared in parakeet_capi.h. The library name can be
-// overridden with PARAKEET_LIBRARY (mirrors the WHISPER_LIBRARY /
-// VIBEVOICECPP_LIBRARY convention in the sibling backends); the default
-// looks next to this binary for libparakeet.so on Linux and
-// libparakeet.dylib on macOS.
+// Loads libparakeet.so via purego and registers the flat C-API entry
+// points declared in parakeet_capi.h. The library name can be overridden
+// with PARAKEET_LIBRARY (mirrors the WHISPER_LIBRARY / VIBEVOICECPP_LIBRARY
+// convention in the sibling backends); the default looks for the .so next
+// to this binary.
 import (
 	"flag"
 	"fmt"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -30,11 +28,7 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("PARAKEET_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "libparakeet.dylib"
-		} else {
-			libName = "libparakeet.so"
-		}
+		libName = "libparakeet.so"
 	}

 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/parakeet-cpp/package.sh
+++ b/backend/go/parakeet-cpp/package.sh
@@ -16,15 +16,12 @@ mkdir -p "$CURDIR/package/lib"
 cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
 cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

-# libparakeet shared lib + any soname symlinks. On Linux this is
-# libparakeet.so[.X.Y]; on macOS it is libparakeet.dylib. purego.Dlopen
-# resolves it via the *_LIBRARY_PATH that run.sh points at lib/.
-cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || true
-cp -avf "$CURDIR"/libparakeet.dylib "$CURDIR/package/lib/" 2>/dev/null || true
-if ! ls "$CURDIR"/package/lib/libparakeet.* >/dev/null 2>&1; then
-	echo "ERROR: libparakeet shared library not found in $CURDIR, run 'make' first" >&2
+# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
+# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
+cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
 	exit 1
-fi
+}

 # Detect architecture and copy the core runtime libs libparakeet.so links
 # against, plus the matching dynamic loader as lib/ld.so.
@@ -51,7 +48,7 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
 elif [ "$(uname -s)" = "Darwin" ]; then
-    echo "Detected Darwin — system frameworks linked dynamically, no bundled libs needed"
+    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
--- a/backend/go/parakeet-cpp/run.sh
+++ b/backend/go/parakeet-cpp/run.sh
@@ -3,17 +3,11 @@ set -e

 CURDIR=$(dirname "$(realpath "$0")")

-if [ "$(uname)" = "Darwin" ]; then
-	export DYLD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${DYLD_LIBRARY_PATH:-}"
-	export PARAKEET_LIBRARY="$CURDIR/lib/libparakeet.dylib"
-else
-	export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
-	export PARAKEET_LIBRARY="$CURDIR/lib/libparakeet.so"
-fi
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"

 # If a self-contained ld.so was packaged, route through it so the
 # packaged libc / libstdc++ are used instead of the host's (matches the
-# whisper backend's runtime layout). Linux only.
+# whisper backend's runtime layout).
 if [ -f "$CURDIR/lib/ld.so" ]; then
 	echo "Using lib/ld.so"
 	exec "$CURDIR/lib/ld.so" "$CURDIR/parakeet-cpp-grpc" "$@"
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # qwentts.cpp version
 QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp
-QWEN3TTS_CPP_VERSION?=9dbe7ea26a01b30fccb117ae5e86807c1dc23d42
+QWEN3TTS_CPP_VERSION?=4536dcdce27c3764a93a06d6bf64026b124962f5
 SO_TARGET?=libgoqwen3ttscpp.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -65,8 +65,8 @@ UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgoqwen3ttscpp-avx.so libgoqwen3ttscpp-avx2.so libgoqwen3ttscpp-avx512.so libgoqwen3ttscpp-fallback.so
 else
-	# On non-Linux (e.g., Darwin), build only fallback variant (as a dylib)
-	VARIANT_TARGETS = libgoqwen3ttscpp-fallback.dylib
+	# On non-Linux (e.g., Darwin), build only fallback variant
+	VARIANT_TARGETS = libgoqwen3ttscpp-fallback.so
 endif

 qwen3-tts-cpp: main.go goqwen3ttscpp.go $(VARIANT_TARGETS)
@@ -78,7 +78,7 @@ package: qwen3-tts-cpp
 build: package

 clean: purge
-	rm -rf libgoqwen3ttscpp*.so libgoqwen3ttscpp*.dylib package sources/qwentts.cpp qwen3-tts-cpp
+	rm -rf libgoqwen3ttscpp*.so package sources/qwentts.cpp qwen3-tts-cpp

 purge:
 	rm -rf build*
@@ -110,20 +110,13 @@ libgoqwen3ttscpp-fallback.so: sources/qwentts.cpp
 	SO_TARGET=libgoqwen3ttscpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom
 	rm -rf build-libgoqwen3ttscpp-fallback.so

-# Build fallback variant as a dylib (Darwin)
-libgoqwen3ttscpp-fallback.dylib: sources/qwentts.cpp
-	$(info ${GREEN}I qwen3-tts-cpp build info:fallback (dylib)${RESET})
-	SO_TARGET=libgoqwen3ttscpp-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgoqwen3ttscpp-custom
-	rm -rf build-libgoqwen3ttscpp-fallback.dylib
-
 libgoqwen3ttscpp-custom: CMakeLists.txt cpp/goqwen3ttscpp.cpp cpp/goqwen3ttscpp.h
 	mkdir -p build-$(SO_TARGET) && \
 	cd build-$(SO_TARGET) && \
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) --target goqwen3ttscpp && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/libgoqwen3ttscpp.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/libgoqwen3ttscpp.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/libgoqwen3ttscpp.so ./$(SO_TARGET)

 test: qwen3-tts-cpp
 	@echo "Running qwen3-tts-cpp tests..."
--- a/backend/go/qwen3-tts-cpp/main.go
+++ b/backend/go/qwen3-tts-cpp/main.go
@@ -4,7 +4,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,11 +21,7 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("QWEN3TTS_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libgoqwen3ttscpp-fallback.dylib"
-		} else {
-			libName = "./libgoqwen3ttscpp-fallback.so"
-		}
+		libName = "./libgoqwen3ttscpp-fallback.so"
 	}

 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/qwen3-tts-cpp/package.sh
+++ b/backend/go/qwen3-tts-cpp/package.sh
@@ -12,8 +12,7 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib

 cp -avf $CURDIR/qwen3-tts-cpp $CURDIR/package/
-cp -fv $CURDIR/libgoqwen3ttscpp-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/libgoqwen3ttscpp-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgoqwen3ttscpp-*.so $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

 # Detect architecture and copy appropriate libraries
--- a/backend/go/qwen3-tts-cpp/run.sh
+++ b/backend/go/qwen3-tts-cpp/run.sh
@@ -12,13 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/libgoqwen3ttscpp-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libgoqwen3ttscpp-fallback.so"
+LIBRARY="$CURDIR/libgoqwen3ttscpp-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgoqwen3ttscpp-avx.so ]; then
@@ -40,10 +36,9 @@ else
 			LIBRARY="$CURDIR/libgoqwen3ttscpp-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export QWEN3TTS_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/go/rfdetr-cpp/Makefile
+++ b/backend/go/rfdetr-cpp/Makefile
@@ -71,7 +71,7 @@ ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = librfdetrcpp-avx.so librfdetrcpp-avx2.so librfdetrcpp-avx512.so librfdetrcpp-fallback.so
 else
 	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = librfdetrcpp-fallback.dylib
+	VARIANT_TARGETS = librfdetrcpp-fallback.so
 endif

 rfdetr-cpp: main.go gorfdetrcpp.go $(VARIANT_TARGETS)
@@ -83,7 +83,7 @@ package: rfdetr-cpp
 build: package

 clean: purge
-	rm -rf librfdetrcpp*.so librfdetrcpp*.dylib rfdetr-cpp package sources
+	rm -rf librfdetrcpp*.so rfdetr-cpp package sources

 purge:
 	rm -rf build*
@@ -110,19 +110,11 @@ librfdetrcpp-avx512.so: sources/rt-detr.cpp
 endif

 # Build fallback variant (all platforms)
-ifeq ($(UNAME_S),Darwin)
-librfdetrcpp-fallback.dylib: sources/rt-detr.cpp
-	rm -rfv build-$@
-	$(info ${GREEN}I rfdetr-cpp build info:fallback${RESET})
-	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) librfdetrcpp-custom
-	rm -rfv build-$@
-else
 librfdetrcpp-fallback.so: sources/rt-detr.cpp
 	rm -rfv build-$@
 	$(info ${GREEN}I rfdetr-cpp build info:fallback${RESET})
 	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) librfdetrcpp-custom
 	rm -rfv build-$@
-endif

 librfdetrcpp-custom: CMakeLists.txt
 	mkdir -p build-$(SO_TARGET) && \
@@ -130,8 +122,7 @@ librfdetrcpp-custom: CMakeLists.txt
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/librfdetrcpp.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/librfdetrcpp.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/librfdetrcpp.so ./$(SO_TARGET)

 all: rfdetr-cpp package

--- a/backend/go/rfdetr-cpp/main.go
+++ b/backend/go/rfdetr-cpp/main.go
@@ -9,7 +9,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -28,11 +27,7 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("RFDETR_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./librfdetrcpp-fallback.dylib"
-		} else {
-			libName = "./librfdetrcpp-fallback.so"
-		}
+		libName = "./librfdetrcpp-fallback.so"
 	}

 	rfdetrLib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/rfdetr-cpp/package.sh
+++ b/backend/go/rfdetr-cpp/package.sh
@@ -10,8 +10,7 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib

-cp -fv $CURDIR/librfdetrcpp-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/librfdetrcpp-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -avf $CURDIR/librfdetrcpp-*.so $CURDIR/package/
 cp -avf $CURDIR/rfdetr-cpp $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

--- a/backend/go/rfdetr-cpp/run.sh
+++ b/backend/go/rfdetr-cpp/run.sh
@@ -12,13 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/librfdetrcpp-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/librfdetrcpp-fallback.so"
+LIBRARY="$CURDIR/librfdetrcpp-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/librfdetrcpp-avx.so ]; then
@@ -40,10 +36,9 @@ else
 			LIBRARY="$CURDIR/librfdetrcpp-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export RFDETR_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/go/sam3-cpp/Makefile
+++ b/backend/go/sam3-cpp/Makefile
@@ -66,7 +66,7 @@ ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgosam3-avx.so libgosam3-avx2.so libgosam3-avx512.so libgosam3-fallback.so
 else
 	# On non-Linux (e.g., Darwin), build only fallback variant
-	VARIANT_TARGETS = libgosam3-fallback.dylib
+	VARIANT_TARGETS = libgosam3-fallback.so
 endif

 sam3-cpp: main.go gosam3.go $(VARIANT_TARGETS)
@@ -78,7 +78,7 @@ package: sam3-cpp
 build: package

 clean: purge
-	rm -rf libgosam3*.so libgosam3*.dylib sam3-cpp package sources
+	rm -rf libgosam3*.so sam3-cpp package sources

 purge:
 	rm -rf build*
@@ -105,19 +105,11 @@ libgosam3-avx512.so: sources/sam3.cpp
 endif

 # Build fallback variant (all platforms)
-ifeq ($(UNAME_S),Darwin)
-libgosam3-fallback.dylib: sources/sam3.cpp
-	$(MAKE) purge
-	$(info ${GREEN}I sam3-cpp build info:fallback${RESET})
-	SO_TARGET=libgosam3-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgosam3-custom
-	rm -rfv build*
-else
 libgosam3-fallback.so: sources/sam3.cpp
 	$(MAKE) purge
 	$(info ${GREEN}I sam3-cpp build info:fallback${RESET})
 	SO_TARGET=libgosam3-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgosam3-custom
 	rm -rfv build*
-endif

 libgosam3-custom: CMakeLists.txt cpp/gosam3.cpp cpp/gosam3.h
 	mkdir -p build-$(SO_TARGET) && \
@@ -125,7 +117,6 @@ libgosam3-custom: CMakeLists.txt cpp/gosam3.cpp cpp/gosam3.h
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/libgosam3.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/libgosam3.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/libgosam3.so ./$(SO_TARGET)

 all: sam3-cpp package
--- a/backend/go/sam3-cpp/main.go
+++ b/backend/go/sam3-cpp/main.go
@@ -3,7 +3,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,11 +21,7 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("SAM3_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libgosam3-fallback.dylib"
-		} else {
-			libName = "./libgosam3-fallback.so"
-		}
+		libName = "./libgosam3-fallback.so"
 	}

 	gosamLib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/sam3-cpp/package.sh
+++ b/backend/go/sam3-cpp/package.sh
@@ -10,8 +10,7 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib

-cp -fv $CURDIR/libgosam3-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/libgosam3-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -avf $CURDIR/libgosam3-*.so $CURDIR/package/
 cp -avf $CURDIR/sam3-cpp $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

--- a/backend/go/sam3-cpp/run.sh
+++ b/backend/go/sam3-cpp/run.sh
@@ -12,13 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/libgosam3-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libgosam3-fallback.so"
+LIBRARY="$CURDIR/libgosam3-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgosam3-avx.so ]; then
@@ -40,10 +36,9 @@ else
 			LIBRARY="$CURDIR/libgosam3-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export SAM3_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/go/sherpa-onnx/backend.go
+++ b/backend/go/sherpa-onnx/backend.go
@@ -7,7 +7,6 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
-	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -239,19 +238,11 @@ func loadSherpaLibs() error {
 func loadSherpaLibsOnce() error {
 	shimLib := os.Getenv("SHERPA_SHIM_LIBRARY")
 	if shimLib == "" {
-		if runtime.GOOS == "darwin" {
-			shimLib = "libsherpa-shim.dylib"
-		} else {
-			shimLib = "libsherpa-shim.so"
-		}
+		shimLib = "libsherpa-shim.so"
 	}
 	capiLib := os.Getenv("SHERPA_ONNX_LIBRARY")
 	if capiLib == "" {
-		if runtime.GOOS == "darwin" {
-			capiLib = "libsherpa-onnx-c-api.dylib"
-		} else {
-			capiLib = "libsherpa-onnx-c-api.so"
-		}
+		capiLib = "libsherpa-onnx-c-api.so"
 	}

 	shim, err := purego.Dlopen(shimLib, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/sherpa-onnx/run.sh
+++ b/backend/go/sherpa-onnx/run.sh
@@ -3,13 +3,7 @@ set -ex

 CURDIR=$(dirname "$(realpath $0)")

-if [ "$(uname)" = "Darwin" ]; then
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-	export SHERPA_SHIM_LIBRARY=$CURDIR/lib/libsherpa-shim.dylib
-	export SHERPA_ONNX_LIBRARY=$CURDIR/lib/libsherpa-onnx-c-api.dylib
-else
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
-fi
+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH

 if [ -f $CURDIR/lib/ld.so ]; then
 	echo "Using lib/ld.so"
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=8caa3f908ae6d4a4bef531e73b9a969f266a3d1f
+STABLEDIFFUSION_GGML_VERSION?=f440ad9c29dd8bc34e5d1f4b863832b96d6ea05f

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

@@ -131,7 +131,6 @@ libgosd-custom: CMakeLists.txt cpp/gosd.cpp cpp/gosd.h
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/libgosd.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/libgosd.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/libgosd.so ./$(SO_TARGET)

 all: stablediffusion-ggml package
--- a/backend/go/stablediffusion-ggml/main.go
+++ b/backend/go/stablediffusion-ggml/main.go
@@ -3,7 +3,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,11 +21,7 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("SD_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libgosd-fallback.dylib"
-		} else {
-			libName = "./libgosd-fallback.so"
-		}
+		libName = "./libgosd-fallback.so"
 	}

 	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/stablediffusion-ggml/package.sh
+++ b/backend/go/stablediffusion-ggml/package.sh
@@ -12,7 +12,6 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib

 cp -avf $CURDIR/libgosd-*.so $CURDIR/package/
-cp -fv $CURDIR/libgosd-*.dylib $CURDIR/package/ 2>/dev/null || true
 cp -avf $CURDIR/stablediffusion-ggml $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

--- a/backend/go/stablediffusion-ggml/run.sh
+++ b/backend/go/stablediffusion-ggml/run.sh
@@ -12,18 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single library variant (Metal or Accelerate). The gosd target is
-	# built as a CMake MODULE, which emits a .dylib for a SHARED build but a
-	# .so for a MODULE build on Apple, so prefer .dylib and fall back to .so.
-	LIBRARY="$CURDIR/libgosd-fallback.dylib"
-	if [ ! -e "$LIBRARY" ]; then
-		LIBRARY="$CURDIR/libgosd-fallback.so"
-	fi
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libgosd-fallback.so"
+LIBRARY="$CURDIR/libgosd-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgosd-avx.so ]; then
@@ -45,10 +36,9 @@ else
 			LIBRARY="$CURDIR/libgosd-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export SD_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/go/supertonic/helper.go
+++ b/backend/go/supertonic/helper.go
@@ -16,7 +16,6 @@ import (
 	"os"
 	"path/filepath"
 	"regexp"
-	"runtime"
 	"strings"
 	"time"
 	"unicode"
@@ -944,13 +943,7 @@ func InitializeONNXRuntime() error {
 			}
 		}
 		if libPath == "" {
-			// LocalAI: default to the platform-native shared library
-			// extension when nothing else is found (dyld vs ld.so).
-			if runtime.GOOS == "darwin" {
-				libPath = "/usr/local/lib/libonnxruntime.dylib"
-			} else {
-				libPath = "/usr/local/lib/libonnxruntime.so"
-			}
+			libPath = "/usr/local/lib/libonnxruntime.so"
 		}
 	}
 	ort.SetSharedLibraryPath(libPath)
--- a/backend/go/supertonic/package.sh
+++ b/backend/go/supertonic/package.sh
@@ -32,10 +32,6 @@ elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
-elif [ $(uname -s) = "Darwin" ]; then
-    # macOS: dyld resolves the bundled .dylib via DYLD_LIBRARY_PATH (set in
-    # run.sh); there is no ld.so loader nor glibc to bundle.
-    echo "Detected Darwin"
 else
    echo "Error: Could not detect architecture"
    exit 1
--- a/backend/go/supertonic/run.sh
+++ b/backend/go/supertonic/run.sh
@@ -3,19 +3,12 @@ set -ex

 CURDIR=$(dirname "$(realpath $0)")

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS uses dyld: there is no ld.so loader, and the search path env
-	# var is DYLD_LIBRARY_PATH. ONNX Runtime ships as a .dylib here.
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-	export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.dylib
-else
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
-	export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so
+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+export ONNXRUNTIME_LIB_PATH=$CURDIR/lib/libonnxruntime.so

-	if [ -f $CURDIR/lib/ld.so ]; then
-		echo "Using lib/ld.so"
-		exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
-	fi
+if [ -f $CURDIR/lib/ld.so ]; then
+	echo "Using lib/ld.so"
+	exec $CURDIR/lib/ld.so $CURDIR/supertonic "$@"
 fi

 exec $CURDIR/supertonic "$@"
--- a/backend/go/vibevoice-cpp/Makefile
+++ b/backend/go/vibevoice-cpp/Makefile
@@ -70,8 +70,8 @@ UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	VARIANT_TARGETS = libgovibevoicecpp-avx.so libgovibevoicecpp-avx2.so libgovibevoicecpp-avx512.so libgovibevoicecpp-fallback.so
 else
-	# On non-Linux (e.g., Darwin), build only fallback variant (as a dylib)
-	VARIANT_TARGETS = libgovibevoicecpp-fallback.dylib
+	# On non-Linux (e.g., Darwin), build only fallback variant
+	VARIANT_TARGETS = libgovibevoicecpp-fallback.so
 endif

 vibevoice-cpp: main.go govibevoicecpp.go $(VARIANT_TARGETS)
@@ -83,7 +83,7 @@ package: vibevoice-cpp
 build: package

 clean: purge
-	rm -rf libgovibevoicecpp*.so libgovibevoicecpp*.dylib package sources/vibevoice.cpp vibevoice-cpp
+	rm -rf libgovibevoicecpp*.so package sources/vibevoice.cpp vibevoice-cpp

 purge:
 	rm -rf build*
@@ -119,21 +119,13 @@ libgovibevoicecpp-fallback.so: sources/vibevoice.cpp
 	SO_TARGET=libgovibevoicecpp-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgovibevoicecpp-custom
 	rm -rfv build*

-# Build fallback variant as a dylib (Darwin)
-libgovibevoicecpp-fallback.dylib: sources/vibevoice.cpp
-	$(MAKE) purge
-	$(info ${GREEN}I vibevoice-cpp build info:fallback (dylib)${RESET})
-	SO_TARGET=libgovibevoicecpp-fallback.dylib CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgovibevoicecpp-custom
-	rm -rfv build*
-
 libgovibevoicecpp-custom: CMakeLists.txt cpp/govibevoicecpp.cpp cpp/govibevoicecpp.h
 	mkdir -p build-$(SO_TARGET) && \
 	cd build-$(SO_TARGET) && \
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) --target govibevoicecpp && \
 	cd .. && \
-	(mv build-$(SO_TARGET)/libgovibevoicecpp.so ./$(SO_TARGET) 2>/dev/null || \
-	 mv build-$(SO_TARGET)/libgovibevoicecpp.dylib ./$(SO_TARGET) 2>/dev/null)
+	mv build-$(SO_TARGET)/libgovibevoicecpp.so ./$(SO_TARGET)

 test: vibevoice-cpp
 	@echo "Running vibevoice-cpp tests..."
--- a/backend/go/vibevoice-cpp/main.go
+++ b/backend/go/vibevoice-cpp/main.go
@@ -4,7 +4,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -22,11 +21,7 @@ type LibFuncs struct {
 func main() {
 	libName := os.Getenv("VIBEVOICECPP_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libgovibevoicecpp-fallback.dylib"
-		} else {
-			libName = "./libgovibevoicecpp-fallback.so"
-		}
+		libName = "./libgovibevoicecpp-fallback.so"
 	}

 	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/vibevoice-cpp/package.sh
+++ b/backend/go/vibevoice-cpp/package.sh
@@ -12,8 +12,7 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib

 cp -avf $CURDIR/vibevoice-cpp $CURDIR/package/
-cp -fv $CURDIR/libgovibevoicecpp-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/libgovibevoicecpp-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgovibevoicecpp-*.so $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

 # Detect architecture and copy appropriate libraries
--- a/backend/go/vibevoice-cpp/run.sh
+++ b/backend/go/vibevoice-cpp/run.sh
@@ -11,13 +11,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/libgovibevoicecpp-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libgovibevoicecpp-fallback.so"
+LIBRARY="$CURDIR/libgovibevoicecpp-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgovibevoicecpp-avx.so ]; then
@@ -38,10 +34,9 @@ else
 			LIBRARY="$CURDIR/libgovibevoicecpp-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export VIBEVOICECPP_LIBRARY=$LIBRARY

 if [ -f $CURDIR/lib/ld.so ]; then
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -117,7 +117,6 @@ libgowhisper-custom: CMakeLists.txt cpp/gowhisper.cpp cpp/gowhisper.h
 	cmake .. $(CMAKE_ARGS) && \
 	cmake --build . --config Release -j$(JOBS) && \
 	cd .. && \
-	mv build-$(SO_TARGET)/libgowhisper.so ./$(SO_TARGET) 2>/dev/null || \
-		mv build-$(SO_TARGET)/libgowhisper.dylib ./$(SO_TARGET:.so=.dylib)
+	mv build-$(SO_TARGET)/libgowhisper.so ./$(SO_TARGET)

 all: whisper package
--- a/backend/go/whisper/main.go
+++ b/backend/go/whisper/main.go
@@ -4,7 +4,6 @@ package main
 import (
 	"flag"
 	"os"
-	"runtime"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -23,11 +22,7 @@ func main() {
 	// Get library name from environment variable, default to fallback
 	libName := os.Getenv("WHISPER_LIBRARY")
 	if libName == "" {
-		if runtime.GOOS == "darwin" {
-			libName = "./libgowhisper-fallback.dylib"
-		} else {
-			libName = "./libgowhisper-fallback.so"
-		}
+		libName = "./libgowhisper-fallback.so"
 	}

 	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
--- a/backend/go/whisper/package.sh
+++ b/backend/go/whisper/package.sh
@@ -12,8 +12,7 @@ REPO_ROOT="${CURDIR}/../../.."
 mkdir -p $CURDIR/package/lib

 cp -avf $CURDIR/whisper $CURDIR/package/
-cp -fv $CURDIR/libgowhisper-*.so $CURDIR/package/ 2>/dev/null || true
-cp -fv $CURDIR/libgowhisper-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgowhisper-*.so $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/

 # Detect architecture and copy appropriate libraries
--- a/backend/go/whisper/run.sh
+++ b/backend/go/whisper/run.sh
@@ -12,13 +12,9 @@ if [ "$(uname)" != "Darwin" ]; then
 	grep -e "flags" /proc/cpuinfo | head -1
 fi

-if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-else
-	LIBRARY="$CURDIR/libgowhisper-fallback.so"
+LIBRARY="$CURDIR/libgowhisper-fallback.so"

+if [ "$(uname)" != "Darwin" ]; then
 	if grep -q -e "\savx\s" /proc/cpuinfo ; then
 		echo "CPU:    AVX    found OK"
 		if [ -e $CURDIR/libgowhisper-avx.so ]; then
@@ -40,10 +36,9 @@ else
 			LIBRARY="$CURDIR/libgowhisper-avx512.so"
 		fi
 	fi
-
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 fi

+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
 export WHISPER_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -1284,7 +1284,6 @@
    nvidia-cuda-13: "cuda13-liquid-audio"
    nvidia-cuda-12: "cuda12-liquid-audio"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio"
-    metal: "metal-liquid-audio"
  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/7_6D7rWrLxp2hb6OHSV1p.png
 - &qwen-tts
  urls:
@@ -1570,7 +1569,6 @@
    - TTS
  capabilities:
    default: "cpu-supertonic"
-    metal: "metal-supertonic"
 - !!merge <<: *neutts
  name: "neutts-development"
  capabilities:
@@ -4614,7 +4612,6 @@
    nvidia-cuda-13: "cuda13-liquid-audio-development"
    nvidia-cuda-12: "cuda12-liquid-audio-development"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio-development"
-    metal: "metal-liquid-audio-development"
 - !!merge <<: *liquid-audio
  name: "cpu-liquid-audio"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-liquid-audio"
@@ -4625,16 +4622,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-liquid-audio"
  mirrors:
    - localai/localai-backends:master-cpu-liquid-audio
- !!merge <<: *liquid-audio
-  name: "metal-liquid-audio"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-liquid-audio"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-liquid-audio
- !!merge <<: *liquid-audio
-  name: "metal-liquid-audio-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-liquid-audio"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-liquid-audio
 - !!merge <<: *liquid-audio
  name: "cuda12-liquid-audio"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-liquid-audio"
@@ -5295,7 +5282,6 @@
    nvidia: "cuda12-trl"
    nvidia-cuda-12: "cuda12-trl"
    nvidia-cuda-13: "cuda13-trl"
-    metal: "metal-trl"
 ## TRL backend images
 - !!merge <<: *trl
  name: "cpu-trl"
@@ -5327,16 +5313,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-trl"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-trl
- !!merge <<: *trl
-  name: "metal-trl"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-trl"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-trl
- !!merge <<: *trl
-  name: "metal-trl-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-trl"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-trl
 ## llama.cpp quantization backend
 - &llama-cpp-quantization
  name: "llama-cpp-quantization"
@@ -5508,7 +5484,6 @@
  name: "supertonic-development"
  capabilities:
    default: "cpu-supertonic-development"
-    metal: "metal-supertonic-development"
 - !!merge <<: *supertonic
  name: "cpu-supertonic"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-supertonic"
@@ -5519,13 +5494,3 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-supertonic"
  mirrors:
    - localai/localai-backends:master-cpu-supertonic
- !!merge <<: *supertonic
-  name: "metal-supertonic"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-supertonic"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-supertonic
- !!merge <<: *supertonic
-  name: "metal-supertonic-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-supertonic"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-supertonic
--- a/backend/python/liquid-audio/install.sh
+++ b/backend/python/liquid-audio/install.sh
@@ -14,11 +14,5 @@ else
 fi

 # liquid-audio's torch wheels are large; allow upgrades to satisfy transitive pins
-EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
-# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
-# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
-# it on the uv path; Linux/CUDA resolution is unchanged.
-if [ "x${USE_PIP:-}" != "xtrue" ]; then
-    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
-fi
+EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 installRequirements
--- a/backend/python/liquid-audio/requirements-mps.txt
+++ b/backend/python/liquid-audio/requirements-mps.txt
@@ -1,4 +1,3 @@
-# MPS (Apple Silicon / Metal) build profile - installed by the darwin CI job.
 torch>=2.8.0
 torchaudio>=2.8.0
 torchcodec>=0.9.1
--- a/backend/python/trl/install.sh
+++ b/backend/python/trl/install.sh
@@ -8,13 +8,7 @@ else
    source $backend_dir/../common/libbackend.sh
 fi

-EXTRA_PIP_INSTALL_FLAGS+=" --upgrade"
-# --index-strategy is a uv-only flag. The darwin/MPS build installs with pip
-# (USE_PIP=true in scripts/build/python-darwin.sh), which rejects it. Only add
-# it when uv is the installer, keeping the Linux/CUDA resolution unchanged.
-if [ "x${USE_PIP:-}" != "xtrue" ]; then
-    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
-fi
+EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 installRequirements

 # Fetch convert_hf_to_gguf.py and gguf package from the same llama.cpp version
--- a/backend/python/trl/requirements-mps.txt
+++ b/backend/python/trl/requirements-mps.txt
@@ -1,12 +0,0 @@
-torch==2.10.0
-trl
-peft
-datasets>=3.0.0
-transformers>=4.56.2
-accelerate>=1.4.0
-huggingface-hub>=1.3.0
-sentencepiece
-# Note: bitsandbytes is intentionally omitted on MPS. It is only used by the
-# CUDA (cublas) variants for 8-bit/4-bit quantization and has poor support on
-# Apple Silicon. torch here uses the plain PyPI wheels, which ship MPS support
-# on macOS arm64.
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -140,7 +140,7 @@ type RunCMD struct {
 	OIDCIssuer           string `env:"LOCALAI_OIDC_ISSUER" help:"OIDC issuer URL for auto-discovery" group:"auth"`
 	OIDCClientID         string `env:"LOCALAI_OIDC_CLIENT_ID" help:"OIDC Client ID (auto-enables auth)" group:"auth"`
 	OIDCClientSecret     string `env:"LOCALAI_OIDC_CLIENT_SECRET" help:"OIDC Client Secret" group:"auth"`
-	ExternalBaseURL      string `env:"LOCALAI_BASE_URL" help:"External base URL of this instance (e.g. https://localhost:8080). Used for OAuth callbacks and self-referential links (generated images/videos, job status). When unset, derived from X-Forwarded-Proto/Host or Forwarded headers." group:"api"`
+	AuthBaseURL          string `env:"LOCALAI_BASE_URL" help:"Base URL for OAuth callbacks (e.g. http://localhost:8080)" group:"auth"`
 	AuthAdminEmail       string `env:"LOCALAI_ADMIN_EMAIL" help:"Email address to auto-promote to admin role" group:"auth"`
 	AuthRegistrationMode string `env:"LOCALAI_REGISTRATION_MODE" default:"open" help:"Registration mode: 'open' (default), 'approval', or 'invite' (invite code required)" group:"auth"`
 	DisableLocalAuth     bool   `env:"LOCALAI_DISABLE_LOCAL_AUTH" default:"false" help:"Disable local email/password registration and login (use with OAuth/OIDC-only setups)" group:"auth"`
@@ -503,6 +503,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 			opts = append(opts, config.WithAuthOIDCClientID(r.OIDCClientID))
 			opts = append(opts, config.WithAuthOIDCClientSecret(r.OIDCClientSecret))
 		}
+		if r.AuthBaseURL != "" {
+			opts = append(opts, config.WithAuthBaseURL(r.AuthBaseURL))
+		}
 		if r.AuthAdminEmail != "" {
 			opts = append(opts, config.WithAuthAdminEmail(r.AuthAdminEmail))
 		}
@@ -520,12 +523,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		}
 	}

-	// Applied unconditionally: the external base URL governs all self-referential
-	// links (not just OAuth callbacks), so it must take effect even when auth is off.
-	if r.ExternalBaseURL != "" {
-		opts = append(opts, config.WithExternalBaseURL(r.ExternalBaseURL))
-	}
-
 	if idleWatchDog || busyWatchDog {
 		opts = append(opts, config.EnableWatchDog)
 		if idleWatchDog {
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -49,13 +49,6 @@ type ApplicationConfig struct {
 	P2PNetworkID                  string
 	Federated                     bool

-	// ExternalBaseURL is the externally visible base URL of this instance
-	// (scheme+host[:port]), set via LOCALAI_BASE_URL. When non-empty it is
-	// authoritative for every self-referential URL LocalAI emits (OAuth
-	// callbacks, generated image/video links, async job StatusURLs),
-	// overriding proxy-header detection. Empty = derive from request headers.
-	ExternalBaseURL string
-
 	// DisableStats turns off per-request token tracking. By default the
 	// routing module's billing recorder runs in every mode (including
 	// no-auth single-user) so dashboards and `/api/usage` are immediately
@@ -203,6 +196,7 @@ type AuthConfig struct {
 	OIDCIssuer          string // OIDC issuer URL for auto-discovery (e.g. https://accounts.google.com)
 	OIDCClientID        string
 	OIDCClientSecret    string
+	BaseURL             string // for OAuth callback URLs (e.g. "http://localhost:8080")
 	AdminEmail          string // auto-promote to admin on login
 	RegistrationMode    string // "open", "approval" (default when empty), "invite"
 	DisableLocalAuth    bool   // disable local email/password registration and login
@@ -956,9 +950,9 @@ func WithAuthGitHubClientSecret(clientSecret string) AppOption {
 	}
 }

-func WithExternalBaseURL(url string) AppOption {
+func WithAuthBaseURL(baseURL string) AppOption {
 	return func(o *ApplicationConfig) {
-		o.ExternalBaseURL = url
+		o.Auth.BaseURL = baseURL
 	}
 }

--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -2,7 +2,6 @@ package config

 import (
 	"fmt"
-	"os"
 	"strconv"
 	"strings"

@@ -10,19 +9,6 @@ import (
 	"github.com/mudler/xlog"
 )

-// HardwareDefaultsDisabled reports whether hardware auto-tuning is turned off via
-// LOCALAI_DISABLE_HARDWARE_DEFAULTS=true (mirrors LOCALAI_DISABLE_GUESSING). When
-// set, ApplyHardwareDefaults and the distributed router's node tuning are
-// skipped entirely, so the backend runs llama.cpp's stock batch/parallel
-// behavior — an escape hatch for users who want predictable, un-tuned defaults.
-func HardwareDefaultsDisabled() bool {
-	// Read directly like the sibling LOCALAI_DISABLE_GUESSING toggle in
-	// hooks_llamacpp.go: these config-layer heuristic switches run deep in the
-	// defaults pipeline with no ApplicationConfig in scope to plumb through.
-	//nolint:forbidigo // config-layer heuristic toggle, mirrors LOCALAI_DISABLE_GUESSING
-	return os.Getenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS") == "true"
-}
-
 // Hardware-driven model-config defaults.
 //
 // This sits alongside the other config overriders (ApplyInferenceDefaults for
@@ -68,35 +54,8 @@ func (g GPU) IsNVIDIABlackwell() bool {
 	return maj >= 12
 }

-// Compute-buffer headroom guard for the raised physical batch.
-//
-// Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
-// graph), which is allocated PER DEVICE — it does not benefit from a second GPU
-// the way weights or KV (which are split across devices) do. The buffer scales
-// ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
-// ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
-// 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
-// even though the GB10 it was measured on (128 GiB unified memory) had room.
-//
-// These constants size a conservative guard: only raise the batch when the
-// extra scratch fits the per-device VRAM ceiling.
-const (
-	// computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one
-	// (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 *
-	// ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since
-	// the real cost also grows with model width (heads / embedding dim) which we
-	// don't know at config time.
-	computeBufferBytesPerCell = 16
-	// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
-	// physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights +
-	// KV, which already dominate VRAM use.
-	blackwellBatchHeadroomDivisor = 4
-)
-
 // PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
-// given hardware class, ignoring context/VRAM headroom. Use
-// PhysicalBatchForContext when a model context and per-device VRAM are known
-// (the load paths) so the raised batch can't overflow a single device.
+// given hardware, used when the model config leaves batch unset.
 func PhysicalBatch(g GPU) int {
 	if g.IsNVIDIABlackwell() {
 		return BlackwellPhysicalBatch
@@ -104,51 +63,6 @@ func PhysicalBatch(g GPU) int {
 	return DefaultPhysicalBatch
 }

-// PhysicalBatchForContext is PhysicalBatch gated on per-device VRAM headroom for
-// the given context: it only raises the batch above the conservative default
-// when the extra compute buffer (which is allocated on a single device and grows
-// with n_ubatch * n_ctx) fits within blackwellBatchHeadroomDivisor of the GPU's
-// VRAM. g.VRAM must be the PER-DEVICE ceiling (the smallest device on a
-// multi-GPU host), not the summed total — the compute buffer can't be split.
-//
-// VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
-// GB10 / unified-memory path reports system RAM, so it still clears the guard.
-func PhysicalBatchForContext(g GPU, ctx int) int {
-	if !g.IsNVIDIABlackwell() {
-		return DefaultPhysicalBatch
-	}
-	if g.VRAM == 0 {
-		return DefaultPhysicalBatch
-	}
-	if largeContextForDevice(g, ctx) {
-		return DefaultPhysicalBatch
-	}
-	return BlackwellPhysicalBatch
-}
-
-// largeContextForDevice reports whether the given context is large relative to
-// the per-device VRAM ceiling — the shared "tight single-model fit" signal that
-// suppresses BOTH throughput-oriented defaults (the Blackwell batch boost and
-// the concurrency slot count). It sizes the extra compute-buffer scratch a
-// raised batch would need at this context (which grows ~n_ubatch * n_ctx and
-// is allocated per device) and asks whether it overflows a fraction of the
-// device VRAM; when it does, the device has no headroom to spend on throughput
-// and the conservative defaults must hold (issue #10485).
-//
-// g.VRAM must be the PER-DEVICE ceiling (the smallest device on a multi-GPU
-// host). VRAM 0 (unknown) is treated as not-large so detection gaps don't
-// silently disable the defaults.
-func largeContextForDevice(g GPU, ctx int) bool {
-	if g.VRAM == 0 {
-		return false
-	}
-	if ctx <= 0 {
-		ctx = DefaultContextSize
-	}
-	extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
-	return extra > g.VRAM/blackwellBatchHeadroomDivisor
-}
-
 // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
 // Callers that re-tune a value chosen by an upstream host (the distributed
 // router correcting the frontend's guess) use this to avoid clobbering an
@@ -185,50 +99,17 @@ func DefaultParallelSlots(g GPU) int {
 	}
 }

-// ParallelSlotsForContext is DefaultParallelSlots gated on per-device VRAM
-// headroom for the given context. A large context already claims most of a
-// single device's VRAM (the KV cache plus the per-slot compute/checkpoint
-// scratch that scales with n_seq_max), so defaulting multiple slots there
-// pushes a tight single-model fit into per-device CUDA OOM (issue #10485): the
-// model loads but the final allocation (e.g. an MTP draft context's KV cache)
-// overflows the tighter card by a few hundred MiB. Returns 1 (no concurrency)
-// in that tight regime, otherwise the VRAM-scaled DefaultParallelSlots.
-//
-// g.VRAM must be the PER-DEVICE ceiling (smallest device on a multi-GPU host).
-// It shares largeContextForDevice with the batch boost so both throughput
-// defaults are suppressed together; the GB10 / unified-memory path reports
-// system RAM and so keeps full concurrency even at large contexts.
-func ParallelSlotsForContext(g GPU, ctx int) int {
-	slots := DefaultParallelSlots(g)
-	if slots <= 1 || g.VRAM == 0 {
-		return slots
-	}
-	if largeContextForDevice(g, ctx) {
-		return 1
-	}
-	return slots
-}
-
-// EnsureParallelOptionForContext appends a VRAM-scaled "parallel:N" backend
-// option when the model doesn't already set one and the GPU warrants (and has
-// headroom for) concurrency at this context. Returns the possibly-extended
-// options. Shared by the single-host config path (ApplyHardwareDefaults) and
-// the distributed router (per selected node).
-func EnsureParallelOptionForContext(opts []string, gpu GPU, ctx int) []string {
-	if slots := ParallelSlotsForContext(gpu, ctx); slots > 1 && !hasParallelOption(opts) {
+// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
+// model doesn't already set one (and the GPU warrants concurrency). Returns the
+// possibly-extended options. Shared by the single-host config path
+// (ApplyHardwareDefaults) and the distributed router (per selected node).
+func EnsureParallelOption(opts []string, gpu GPU) []string {
+	if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
 		return append(opts, fmt.Sprintf("parallel:%d", slots))
 	}
 	return opts
 }

-// EnsureParallelOption is EnsureParallelOptionForContext with no known context
-// (defaults to DefaultContextSize, which clears the headroom gate on any device
-// large enough to warrant concurrency). Kept for callers without a model
-// context.
-func EnsureParallelOption(opts []string, gpu GPU) []string {
-	return EnsureParallelOptionForContext(opts, gpu, 0)
-}
-
 // hasParallelOption reports whether the model already sets parallel/n_parallel
 // so we never override an explicit value (helper shared with serving_defaults.go).
 func hasParallelOption(opts []string) bool {
@@ -241,12 +122,7 @@ func hasParallelOption(opts []string) bool {
 // deterministic device — detection does a live nvidia-smi call.
 var localGPU = func() GPU {
 	vendor, _ := xsysinfo.DetectGPUVendor()
-	// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
-	// tier and the batch headroom guard both reason about what fits on a single
-	// card, and per-device compute buffers can't be split across GPUs. Summing
-	// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
-	// into OOM (issue #10485).
-	vram, _ := xsysinfo.MinPerGPUVRAM()
+	vram, _ := xsysinfo.TotalAvailableVRAM()
 	return GPU{
 		Vendor:            vendor,
 		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
@@ -258,36 +134,25 @@ var localGPU = func() GPU {
 // and were left unset by the user. Currently: a larger physical batch on
 // Blackwell. Explicit config always wins (we only touch zero values).
 func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
-	if cfg == nil || HardwareDefaultsDisabled() {
+	if cfg == nil {
 		return
 	}
-	// Raise the physical batch on Blackwell only when the resulting compute
-	// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
-	// (rather than writing the default 512) preserves the downstream single-pass
-	// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
-	ctx := DefaultContextSize
-	if cfg.ContextSize != nil {
-		ctx = *cfg.ContextSize
-	}
-	if cfg.Batch == 0 {
-		if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
-			cfg.Batch = BlackwellPhysicalBatch
-			xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
-				"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
-		}
+	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
+		cfg.Batch = BlackwellPhysicalBatch
+		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
+			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
 	}

 	// Enable concurrent serving by default on a capable GPU: without this the
 	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
 	// (continuous batching stays off). Unified KV means the slots share the
-	// context budget, but a context large enough to fill a single device leaves
-	// no room for the per-slot scratch, so the slot count is gated on per-device
-	// headroom too (issue #10485). Explicit parallel/n_parallel always wins.
+	// context budget, so this is concurrency without extra KV memory. Explicit
+	// parallel/n_parallel in the model options always wins.
 	if before := len(cfg.Options); true {
-		cfg.Options = EnsureParallelOptionForContext(cfg.Options, gpu, ctx)
+		cfg.Options = EnsureParallelOption(cfg.Options, gpu)
 		if len(cfg.Options) > before {
 			xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
-				"option", cfg.Options[len(cfg.Options)-1], "context", ctx, "vram_gib", gpu.VRAM>>30)
+				"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
 		}
 	}
 }
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -9,37 +9,26 @@ import (
 // GPU. The detection seam (localGPU) is injected so the path is deterministic
 // without a real GPU.
 var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
-	const gib = uint64(1) << 30
-
 	var orig func() GPU
 	BeforeEach(func() { orig = localGPU })
 	AfterEach(func() { localGPU = orig })

-	It("sets the physical batch on a local Blackwell GPU with headroom", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
+	It("sets the physical batch on a local Blackwell GPU", func() {
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 	})

-	It("leaves batch unset when a large context would overflow the device", func() {
-		// Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx.
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} }
-		ctx := 204800
-		cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
-		cfg.SetDefaults()
-		Expect(cfg.Batch).To(Equal(0))
-	})
-
 	It("leaves batch unset on a non-Blackwell local GPU", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(0))
 	})

 	It("never overrides an explicit batch", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
 		cfg := &ModelConfig{}
 		cfg.Batch = 1024
 		cfg.SetDefaults()
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -7,8 +7,6 @@ import (
 )

 var _ = Describe("Hardware-driven config defaults", func() {
-	const gib = uint64(1) << 30
-
 	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
 		func(cc string, want bool) {
 			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
@@ -37,70 +35,30 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})

-	Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
-		It("raises the batch when the compute buffer fits the device", func() {
-			// 16 GiB Blackwell with a small context: the extra scratch is tiny.
-			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
-				To(Equal(BlackwellPhysicalBatch))
-		})
-		It("keeps the default batch when a large context would overflow one device", func() {
-			// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
-			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
-				To(Equal(DefaultPhysicalBatch))
-		})
-		It("still raises the batch on a large unified-memory device (GB10)", func() {
-			// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
-			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
-				To(Equal(BlackwellPhysicalBatch))
-		})
-		It("stays conservative when VRAM is unknown", func() {
-			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
-				To(Equal(DefaultPhysicalBatch))
-		})
-		It("never raises the batch on non-Blackwell", func() {
-			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
-				To(Equal(DefaultPhysicalBatch))
-		})
-	})
-
 	Describe("ApplyHardwareDefaults", func() {
-		It("raises an unset batch to 2048 on Blackwell with headroom", func() {
+		It("raises an unset batch to 2048 on Blackwell", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
 			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 		})
-		It("leaves batch unset when a large context would overflow one device", func() {
-			// Regression guard for issue #10485: 16 GiB card + ~200k context.
-			ctx := 204800
-			cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
-			Expect(cfg.Batch).To(Equal(0))
-		})
 		It("leaves batch unset on non-Blackwell", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("never overrides an explicit batch", func() {
 			cfg := &ModelConfig{}
 			cfg.Batch = 1024
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
 			Expect(cfg.Batch).To(Equal(1024))
 		})
 		It("no-ops on nil", func() {
 			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
 		})
-
-		It("applies nothing when hardware defaults are disabled via env", func() {
-			GinkgoT().Setenv("LOCALAI_DISABLE_HARDWARE_DEFAULTS", "true")
-			Expect(HardwareDefaultsDisabled()).To(BeTrue())
-			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
-			Expect(cfg.Batch).To(Equal(0))
-			Expect(cfg.Options).To(BeEmpty())
-		})
 	})

+	const gib = uint64(1) << 30
+
 	DescribeTable("DefaultParallelSlots (by VRAM)",
 		func(vramGiB uint64, want int) {
 			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
@@ -114,46 +72,12 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		Entry("unknown 0", uint64(0), 1),
 	)

-	Describe("ParallelSlotsForContext (per-device VRAM headroom)", func() {
-		It("keeps the VRAM-scaled slot count when the context fits the device", func() {
-			// 16 GiB card, small context: plenty of room for concurrency.
-			Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 8192)).To(Equal(4))
-		})
-		It("drops to a single slot when a large context already fills the device", func() {
-			// Regression guard for issue #10485: 16 GiB consumer Blackwell, ~200k
-			// context. Even with unified KV, the per-slot compute/checkpoint
-			// scratch from 4 slots is the straw that overflows the tighter device.
-			Expect(ParallelSlotsForContext(GPU{VRAM: 16 * gib}, 204800)).To(Equal(1))
-		})
-		It("keeps concurrency on a large unified-memory device (GB10)", func() {
-			// GB10 reports system RAM (~119 GiB): a 200k context leaves headroom.
-			Expect(ParallelSlotsForContext(GPU{VRAM: 119 * gib}, 204800)).To(Equal(8))
-		})
-		It("keeps concurrency on a big datacenter card with a large context", func() {
-			// 80 GiB A100: 200k context is a small fraction, concurrency stays.
-			Expect(ParallelSlotsForContext(GPU{VRAM: 80 * gib}, 204800)).To(Equal(8))
-		})
-		It("stays a single slot on small/unknown VRAM regardless of context", func() {
-			Expect(ParallelSlotsForContext(GPU{VRAM: 2 * gib}, 8192)).To(Equal(1))
-			Expect(ParallelSlotsForContext(GPU{}, 8192)).To(Equal(1))
-		})
-	})
-
 	Describe("ApplyHardwareDefaults parallel slots", func() {
 		It("adds a VRAM-scaled parallel option on a capable GPU", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Options).To(ContainElement("parallel:8"))
 		})
-		It("adds no parallel option when a large context already fills one device", func() {
-			// Regression guard for issue #10485: 16 GiB card + ~200k context. The
-			// model barely fits; defaulting concurrency tips the tighter GPU into
-			// CUDA OOM during the final (MTP draft) KV allocation.
-			ctx := 204800
-			cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
-			Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
-		})
 		It("scales the slot count down with VRAM", func() {
 			cfg := &ModelConfig{}
 			ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1204,6 +1204,11 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)

+	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
+	// Uses the local GPU here; in distributed mode the router re-applies the same
+	// heuristics for the selected node's GPU before loading. Explicit config wins.
+	ApplyHardwareDefaults(cfg, localGPU())
+
 	// Apply serving-policy defaults (device-independent): cross-request prefix
 	// caching. Propagates to distributed nodes via the model options.
 	ApplyServingDefaults(cfg)
@@ -1242,16 +1247,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.ContextSize = &ctx
 	}
 	runBackendHooks(cfg, lo.modelPath)
-
-	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell)
-	// LAST, after the context size is fully resolved (explicit config, LoadOptions,
-	// then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes
-	// the per-device compute buffer against this model's context, so it must see
-	// the final value, not a pre-guess nil. Uses the local GPU here; in distributed
-	// mode the router re-applies the same heuristics for the selected node's GPU
-	// before loading. Explicit config always wins.
-	ApplyHardwareDefaults(cfg, localGPU())
-
 	cfg.syncKnownUsecasesFromString()
 }

--- a/core/http/app.go
+++ b/core/http/app.go
@@ -149,18 +149,6 @@ func API(application *application.Application) (*echo.Echo, error) {
 	// Middleware - StripPathPrefix must be registered early as it uses Rewrite which runs before routing
 	e.Pre(httpMiddleware.StripPathPrefix())

-	// Stamp the configured external base URL into each request context so
-	// middleware.BaseURL can treat it as authoritative for self-referential
-	// links. Registered as Pre so it runs before routing and handlers.
-	if extBaseURL := application.ApplicationConfig().ExternalBaseURL; extBaseURL != "" {
-		e.Pre(func(next echo.HandlerFunc) echo.HandlerFunc {
-			return func(c echo.Context) error {
-				c.Set("_external_base_url", extBaseURL)
-				return next(c)
-			}
-		})
-	}
-
 	e.Pre(middleware.RemoveTrailingSlash())

 	if application.ApplicationConfig().MachineTag != "" {
--- a/core/http/middleware/baseurl.go
+++ b/core/http/middleware/baseurl.go
@@ -55,70 +55,17 @@ func BasePathPrefix(c echo.Context) string {
 // The returned URL is guaranteed to end with `/`.
 // The method should be used in conjunction with the StripPathPrefix middleware.
 func BaseURL(c echo.Context) string {
-	// An explicit external base URL (LOCALAI_BASE_URL) is authoritative for
-	// the origin. The proxy-derived path prefix is still appended so a
-	// reverse-proxy mount point keeps working. Trailing slashes are
-	// normalized via BasePathPrefix, which always starts and ends with "/".
-	if ext, ok := c.Get("_external_base_url").(string); ok && ext != "" {
-		return strings.TrimRight(ext, "/") + BasePathPrefix(c)
-	}
-
-	fwdProto, fwdHost := parseForwarded(c.Request().Header.Get("Forwarded"))
-
 	scheme := "http"
-	switch {
-	case c.Request().TLS != nil:
+	if c.Request().Header.Get("X-Forwarded-Proto") == "https" {
 		scheme = "https"
-	case strings.EqualFold(firstToken(c.Request().Header.Get("X-Forwarded-Proto")), "https"):
-		scheme = "https"
-	case strings.EqualFold(fwdProto, "https"):
+	} else if c.Request().TLS != nil {
 		scheme = "https"
 	}

 	host := c.Request().Host
 	if forwardedHost := c.Request().Header.Get("X-Forwarded-Host"); forwardedHost != "" {
 		host = forwardedHost
-	} else if fwdHost != "" {
-		host = fwdHost
 	}

 	return scheme + "://" + host + BasePathPrefix(c)
 }
-
-// firstToken returns the first comma-separated token of v, trimmed of spaces.
-// Reverse-proxy chains can emit X-Forwarded-Proto as "https,http"; only the
-// first hop (closest to the client) is meaningful for scheme detection.
-func firstToken(v string) string {
-	if i := strings.IndexByte(v, ','); i >= 0 {
-		v = v[:i]
-	}
-	return strings.TrimSpace(v)
-}
-
-// parseForwarded extracts the proto and host directives from the first element
-// of an RFC 7239 Forwarded header (e.g. `for=x;proto=https;host=h, for=y`).
-// Values may be quoted. Returns empty strings when absent or malformed so the
-// caller can fall through to other signals.
-func parseForwarded(header string) (proto, host string) {
-	if header == "" {
-		return "", ""
-	}
-	// Only the first element (closest proxy to the client) matters here.
-	if i := strings.IndexByte(header, ','); i >= 0 {
-		header = header[:i]
-	}
-	for _, directive := range strings.Split(header, ";") {
-		key, value, ok := strings.Cut(strings.TrimSpace(directive), "=")
-		if !ok {
-			continue
-		}
-		value = strings.Trim(strings.TrimSpace(value), `"`)
-		switch strings.ToLower(strings.TrimSpace(key)) {
-		case "proto":
-			proto = value
-		case "host":
-			host = value
-		}
-	}
-	return proto, host
-}
--- a/core/http/middleware/baseurl_test.go
+++ b/core/http/middleware/baseurl_test.go
@@ -135,138 +135,4 @@ var _ = Describe("BaseURL", func() {
 			Entry("missing leading slash", "evil"),
 		)
 	})
-
-	Context("scheme detection hardening", func() {
-		It("treats comma-separated X-Forwarded-Proto as https when first token is https", func() {
-			app := echo.New()
-			actualURL := ""
-			app.GET("/x", func(c echo.Context) error {
-				actualURL = BaseURL(c)
-				return nil
-			})
-			req := httptest.NewRequest("GET", "/x", nil)
-			req.Header.Set("X-Forwarded-Proto", "https,http")
-			rec := httptest.NewRecorder()
-			app.ServeHTTP(rec, req)
-			Expect(actualURL).To(Equal("https://example.com/"))
-		})
-
-		It("derives https from the RFC 7239 Forwarded proto directive", func() {
-			app := echo.New()
-			actualURL := ""
-			app.GET("/x", func(c echo.Context) error {
-				actualURL = BaseURL(c)
-				return nil
-			})
-			req := httptest.NewRequest("GET", "/x", nil)
-			req.Header.Set("Forwarded", "for=192.0.2.1;proto=https;host=proxy.example")
-			rec := httptest.NewRecorder()
-			app.ServeHTTP(rec, req)
-			Expect(actualURL).To(Equal("https://proxy.example/"))
-		})
-
-		It("prefers X-Forwarded-Host over the Forwarded host directive", func() {
-			app := echo.New()
-			actualURL := ""
-			app.GET("/x", func(c echo.Context) error {
-				actualURL = BaseURL(c)
-				return nil
-			})
-			req := httptest.NewRequest("GET", "/x", nil)
-			req.Header.Set("X-Forwarded-Host", "xfh.example")
-			req.Header.Set("Forwarded", "host=fwd.example;proto=https")
-			rec := httptest.NewRecorder()
-			app.ServeHTTP(rec, req)
-			Expect(actualURL).To(Equal("https://xfh.example/"))
-		})
-	})
-
-	Context("explicit external base URL override", func() {
-		It("uses the configured origin over conflicting forwarded headers", func() {
-			app := echo.New()
-			actualURL := ""
-			app.GET("/x", func(c echo.Context) error {
-				c.Set("_external_base_url", "https://192.168.0.13:34567")
-				actualURL = BaseURL(c)
-				return nil
-			})
-			req := httptest.NewRequest("GET", "/x", nil)
-			req.Header.Set("X-Forwarded-Proto", "http")
-			req.Header.Set("X-Forwarded-Host", "internal:8080")
-			rec := httptest.NewRecorder()
-			app.ServeHTTP(rec, req)
-			Expect(actualURL).To(Equal("https://192.168.0.13:34567/"))
-		})
-
-		It("combines the configured origin with a detected path prefix", func() {
-			app := echo.New()
-			actualURL := ""
-			app.GET("/hello", func(c echo.Context) error {
-				c.Set("_original_path", "/localai/hello")
-				c.Set("_external_base_url", "https://ext.example")
-				actualURL = BaseURL(c)
-				return nil
-			})
-			req := httptest.NewRequest("GET", "/hello", nil)
-			rec := httptest.NewRecorder()
-			app.ServeHTTP(rec, req)
-			Expect(actualURL).To(Equal("https://ext.example/localai/"))
-		})
-
-		It("ignores an empty override", func() {
-			app := echo.New()
-			actualURL := ""
-			app.GET("/x", func(c echo.Context) error {
-				c.Set("_external_base_url", "")
-				actualURL = BaseURL(c)
-				return nil
-			})
-			req := httptest.NewRequest("GET", "/x", nil)
-			rec := httptest.NewRecorder()
-			app.ServeHTTP(rec, req)
-			Expect(actualURL).To(Equal("http://example.com/"))
-		})
-	})
-
-	Context("parseForwarded helper", func() {
-		It("parses unquoted proto and host", func() {
-			proto, host := parseForwarded("for=192.0.2.1;proto=https;host=h.example")
-			Expect(proto).To(Equal("https"))
-			Expect(host).To(Equal("h.example"))
-		})
-
-		It("strips quotes around values", func() {
-			proto, host := parseForwarded(`proto="https";host="h.example"`)
-			Expect(proto).To(Equal("https"))
-			Expect(host).To(Equal("h.example"))
-		})
-
-		It("uses only the first element of a multi-element header", func() {
-			proto, host := parseForwarded("proto=https;host=first.example, proto=http;host=second.example")
-			Expect(proto).To(Equal("https"))
-			Expect(host).To(Equal("first.example"))
-		})
-
-		It("returns empty strings for an empty header", func() {
-			proto, host := parseForwarded("")
-			Expect(proto).To(BeEmpty())
-			Expect(host).To(BeEmpty())
-		})
-
-		It("skips directives without a value", func() {
-			proto, host := parseForwarded("proto;host=h.example")
-			Expect(proto).To(BeEmpty())
-			Expect(host).To(Equal("h.example"))
-		})
-	})
-
-	Context("firstToken helper", func() {
-		It("returns the whole trimmed string when there is no comma", func() {
-			Expect(firstToken("  https  ")).To(Equal("https"))
-		})
-
-		It("returns the first trimmed token when there is a comma", func() {
-			Expect(firstToken("https , http")).To(Equal("https"))
-		})
-	})
 })
--- a/core/http/react-ui/public/locales/en/chat.json
+++ b/core/http/react-ui/public/locales/en/chat.json
@@ -86,7 +86,6 @@
  "input": {
    "placeholder": "Message...",
    "attachFile": "Attach file",
-    "send": "Send message",
    "stopGenerating": "Stop generating",
    "canvasTitle": "Canvas — extract code blocks and media into a side panel for preview, copy, and download",
    "canvasLabel": "Canvas",
--- a/core/http/react-ui/public/locales/en/home.json
+++ b/core/http/react-ui/public/locales/en/home.json
@@ -77,21 +77,6 @@
    "noModelsTitle": "No Models Available",
    "noModelsBody": "There are no models installed yet. Ask your administrator to set up models so you can start chatting."
  },
-  "starters": {
-    "title": "Recommended for your hardware",
-    "tier": {
-      "cpu": "CPU-only",
-      "gpu-small": "GPU",
-      "gpu-mid": "GPU",
-      "gpu-large": "GPU"
-    },
-    "cpuNote": "No GPU detected — these small models stay responsive on CPU.",
-    "gpuNote": "Picked to fit your available VRAM with room for context.",
-    "install": "Install",
-    "installing": "Installing",
-    "installStarted": "Installing {{model}}…",
-    "installFailed": "Install failed: {{message}}"
-  },
  "connect": {
    "title": "One endpoint, every API",
    "subtitle": "LocalAI serves its own full API — image & video generation, depth, object detection, reranking, audio, face & voice recognition, and realtime voice over WebRTC and WebSocket. On top of that, a drop-in compatibility layer lets any app built for OpenAI, Anthropic, Ollama or OpenAI Responses talk to it unchanged.",
--- a/core/http/react-ui/public/locales/en/models.json
+++ b/core/http/react-ui/public/locales/en/models.json
@@ -2,16 +2,6 @@
  "title": "Install Models",
  "subtitle": "Browse and install AI models from the gallery",
  "models": "Models",
-  "recommended": {
-    "title": "Recommended for your hardware",
-    "cpuNote": "No GPU detected - small models that stay responsive on CPU.",
-    "gpuNote": "Sized to fit your available VRAM with room for context.",
-    "install": "Install",
-    "installing": "Installing",
-    "installStarted": "Installing {{model}}…",
-    "installFailed": "Install failed: {{message}}",
-    "dismiss": "Dismiss recommendations"
-  },
  "stats": {
    "available": "Available",
    "installed": "Installed"
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -6363,130 +6363,6 @@ select.input {
  justify-content: center;
 }

-/* ──────────────────── Home: hardware-aware starter models ──────────────────── */
-
-.home-starters {
-  margin: var(--spacing-lg) 0;
-  padding: var(--spacing-lg);
-}
-.home-starters-head {
-  display: flex;
-  align-items: center;
-  justify-content: space-between;
-  gap: var(--spacing-md);
-}
-.home-starters-head strong {
-  font-size: 0.9375rem;
-}
-.home-starters-tier {
-  display: inline-flex;
-  align-items: center;
-  gap: var(--spacing-xs);
-  font-size: 0.75rem;
-  color: var(--color-text-muted);
-}
-.home-starters-sub {
-  margin: var(--spacing-xs) 0 var(--spacing-md);
-  font-size: 0.8125rem;
-  color: var(--color-text-secondary);
-}
-.home-starters-list {
-  list-style: none;
-  margin: 0;
-  padding: 0;
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-}
-.home-starters-item {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-md);
-  padding: var(--spacing-xs) 0;
-}
-.home-starters-name {
-  font-weight: 500;
-  font-size: 0.875rem;
-  word-break: break-all;
-}
-.home-starters-badge {
-  font-size: 0.625rem;
-}
-.home-starters-size {
-  margin-left: auto;
-  font-size: 0.75rem;
-  color: var(--color-text-muted);
-  white-space: nowrap;
-}
-
-/* ──────────────────── Models gallery: recommended-for-your-hardware strip ──────────────────── */
-
-.rec-models {
-  margin-bottom: var(--spacing-md);
-  padding: var(--spacing-md) var(--spacing-lg);
-}
-.rec-models-head {
-  display: flex;
-  align-items: flex-start;
-  justify-content: space-between;
-  gap: var(--spacing-md);
-}
-.rec-models-title {
-  display: flex;
-  align-items: center;
-  gap: var(--spacing-sm);
-  flex-wrap: wrap;
-}
-.rec-models-title i {
-  color: var(--color-primary);
-}
-.rec-models-note {
-  font-size: 0.8125rem;
-  color: var(--color-text-secondary);
-}
-.rec-models-dismiss {
-  background: none;
-  border: none;
-  color: var(--color-text-muted);
-  cursor: pointer;
-  padding: 4px;
-  flex-shrink: 0;
-}
-.rec-models-dismiss:hover {
-  color: var(--color-text-primary);
-}
-.rec-models-grid {
-  display: grid;
-  grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
-  gap: var(--spacing-sm);
-  margin-top: var(--spacing-md);
-}
-.rec-models-item {
-  display: flex;
-  flex-direction: column;
-  gap: var(--spacing-xs);
-  padding: var(--spacing-sm) var(--spacing-md);
-  border: 1px solid var(--color-border-subtle);
-  border-radius: var(--radius-md);
-  background: var(--color-bg-primary);
-}
-.rec-models-item-name {
-  font-weight: 500;
-  font-size: 0.8125rem;
-  word-break: break-all;
-}
-.rec-models-item-meta {
-  display: flex;
-  gap: var(--spacing-sm);
-  font-size: 0.75rem;
-  color: var(--color-text-muted);
-}
-.rec-models-item-fit {
-  display: inline-flex;
-  align-items: center;
-  gap: 4px;
-}
-
 /* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */

 .home-connect {
--- a/core/http/react-ui/src/components/ModelSelector.jsx
+++ b/core/http/react-ui/src/components/ModelSelector.jsx
@@ -1,25 +1,8 @@
-import { useEffect, useMemo, useCallback } from 'react'
+import { useEffect, useMemo } from 'react'
 import { useModels } from '../hooks/useModels'
 import SearchableSelect from './SearchableSelect'
 import { useTranslation } from 'react-i18next'

-// Remember the last model the user picked, keyed by capability, so returning to
-// a page (Home chat box, Image, TTS, Talk...) defaults to that model instead of
-// whatever happens to sort first. Only persisted when a capability key exists —
-// `externalOptions` callers pass no capability and get the old first-item
-// behaviour. localStorage access is wrapped because private-browsing modes throw.
-const LAST_MODEL_PREFIX = 'localai_last_model:'
-
-function readLastModel(capability) {
-  if (!capability) return null
-  try { return localStorage.getItem(LAST_MODEL_PREFIX + capability) } catch { return null }
-}
-
-function writeLastModel(capability, model) {
-  if (!capability || !model) return
-  try { localStorage.setItem(LAST_MODEL_PREFIX + capability, model) } catch { /* ignore */ }
-}
-
 export default function ModelSelector({
  value, onChange, capability, className = '',
  options: externalOptions, loading: externalLoading,
@@ -36,27 +19,16 @@ export default function ModelSelector({
  const isLoading = externalOptions ? (externalLoading || false) : hookLoading
  const isDisabled = isLoading || (externalDisabled || false)

-  // Persist genuine selections so the next visit can restore them.
-  const handleChange = useCallback((next) => {
-    writeLastModel(capability, next)
-    onChange(next)
-  }, [capability, onChange])
-
  useEffect(() => {
    if (modelNames.length > 0 && (!value || !modelNames.includes(value))) {
-      // Prefer the remembered model when it's still available; otherwise fall
-      // back to the first option. Don't re-persist here — auto-select is not a
-      // user choice, and writing back the stored value would be a harmless but
-      // pointless round-trip.
-      const remembered = readLastModel(capability)
-      onChange(remembered && modelNames.includes(remembered) ? remembered : modelNames[0])
+      onChange(modelNames[0])
    }
-  }, [modelNames, value, onChange, capability])
+  }, [modelNames, value, onChange])

  return (
    <SearchableSelect
      value={value || ''}
-      onChange={handleChange}
+      onChange={onChange}
      options={modelNames}
      placeholder={isLoading ? t('selector.loading') : (modelNames.length === 0 ? t('selector.noModels') : t('selector.selectModel'))}
      searchPlaceholder={searchPlaceholder || t('selector.searchPlaceholder')}
--- a/core/http/react-ui/src/components/RecommendedModels.jsx
+++ b/core/http/react-ui/src/components/RecommendedModels.jsx
@@ -1,86 +0,0 @@
-import { useState } from 'react'
-import { useTranslation } from 'react-i18next'
-import { modelsApi } from '../utils/api'
-import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
-
-const DISMISS_KEY = 'localai_rec_models_dismissed'
-
-// "Recommended for your hardware" strip at the top of the Models gallery. Shares
-// the hardware-fit ranking with the empty-state starter widget via
-// useRecommendedModels, but styled for the gallery page and dismissible (the
-// gallery is a repeat-visit surface, so it shouldn't nag).
-export default function RecommendedModels({ addToast }) {
-  const { t } = useTranslation('models')
-  const { recommended, tier, loading } = useRecommendedModels({ count: 4 })
-  const [installing, setInstalling] = useState(() => new Set())
-  const [dismissed, setDismissed] = useState(() => {
-    try { return localStorage.getItem(DISMISS_KEY) === '1' } catch { return false }
-  })
-
-  if (loading || dismissed) return null
-  if (!recommended || recommended.length === 0) return null
-
-  const dismiss = () => {
-    try { localStorage.setItem(DISMISS_KEY, '1') } catch { /* ignore */ }
-    setDismissed(true)
-  }
-
-  const install = async (name) => {
-    setInstalling(prev => new Set(prev).add(name))
-    try {
-      await modelsApi.install(name)
-      addToast?.(t('recommended.installStarted', { model: name }), 'success')
-    } catch (err) {
-      addToast?.(t('recommended.installFailed', { message: err.message }), 'error')
-      setInstalling(prev => {
-        const next = new Set(prev)
-        next.delete(name)
-        return next
-      })
-    }
-  }
-
-  const isGpu = tier.id !== 'cpu'
-
-  return (
-    <div className="rec-models card">
-      <div className="rec-models-head">
-        <div className="rec-models-title">
-          <i className={`fas ${isGpu ? 'fa-microchip' : 'fa-memory'}`} aria-hidden="true" />
-          <strong>{t('recommended.title')}</strong>
-          <span className="rec-models-note">{isGpu ? t('recommended.gpuNote') : t('recommended.cpuNote')}</span>
-        </div>
-        <button type="button" className="rec-models-dismiss" onClick={dismiss} aria-label={t('recommended.dismiss')} title={t('recommended.dismiss')}>
-          <i className="fas fa-times" aria-hidden="true" />
-        </button>
-      </div>
-      <div className="rec-models-grid">
-        {recommended.map(m => {
-          const busy = installing.has(m.name)
-          return (
-            <div key={m.name} className="rec-models-item">
-              <div className="rec-models-item-name">{m.name}</div>
-              <div className="rec-models-item-meta">
-                {isNvfp4Name(m.name) && <span className="badge badge-info">NVFP4</span>}
-                {m.sizeDisplay && <span>{m.sizeDisplay}</span>}
-                {isGpu && m.vramDisplay && (
-                  <span className="rec-models-item-fit"><i className="fas fa-microchip" aria-hidden="true" /> {m.vramDisplay}</span>
-                )}
-              </div>
-              <button
-                type="button"
-                className="btn btn-primary btn-sm"
-                disabled={busy}
-                onClick={() => install(m.name)}
-              >
-                {busy
-                  ? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('recommended.installing')}</>)
-                  : (<><i className="fas fa-download" aria-hidden="true" /> {t('recommended.install')}</>)}
-              </button>
-            </div>
-          )
-        })}
-      </div>
-    </div>
-  )
-}
--- a/core/http/react-ui/src/components/StarterModels.jsx
+++ b/core/http/react-ui/src/components/StarterModels.jsx
@@ -1,129 +0,0 @@
-import { useState } from 'react'
-import { useTranslation } from 'react-i18next'
-import { modelsApi } from '../utils/api'
-import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
-
-// Static fallback used only when the live gallery / estimates can't be reached
-// (offline, trimmed gallery). The hook is the primary, data-driven path; these
-// are real gallery names kept as a safety net so onboarding never shows nothing.
-// Gemma picks use the QAT (quantization-aware-trained) Q4 builds. NVIDIA boxes
-// get NVFP4 + MTP variants at the mid/large tiers (see NVIDIA below).
-const BASE = {
-  cpu: [
-    { name: 'gemma-4-e2b-it-qat-q4_0', size: '~1.5 GB' },
-    { name: 'qwen3.5-4b-claude-4.6-opus-reasoning-distilled', size: '~2.5 GB' },
-    { name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
-    { name: 'lfm2.5-1.2b-instruct', size: '~0.8 GB' },
-  ],
-  'gpu-small': [
-    { name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
-    { name: 'lfm2.5-8b-a1b', size: '~5 GB' },
-    { name: 'qwen3.5-9b', size: '~5.5 GB' },
-    { name: 'gemma-4-12b-it-qat-q4_0', size: '~7 GB' },
-  ],
-  'gpu-mid': [
-    { name: 'qwen3.6-27b', size: '~16 GB' },
-    { name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
-    { name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
-    { name: 'qwen3.5-27b', size: '~16 GB' },
-  ],
-  'gpu-large': [
-    { name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
-    { name: 'qwen3.6-35b-a3b-claude-4.6-opus-reasoning-distilled', size: '~20 GB' },
-    { name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
-    { name: 'qwen3.5-35b-a3b-apex', size: '~20 GB' },
-  ],
-}
-
-// NVIDIA-only overrides: NVFP4 is a Blackwell-optimised 4-bit format paired with
-// MTP (multi-token prediction) for speed. Only the mid/large tiers have these.
-const NVIDIA = {
-  'gpu-mid': [
-    { name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
-    { name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
-    { name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
-    { name: 'qwen3.6-27b', size: '~16 GB' },
-  ],
-  'gpu-large': [
-    { name: 'qwen3.6-35b-a3b-nvfp4-mtp', size: '~18 GB' },
-    { name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
-    { name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
-    { name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
-  ],
-}
-
-function fallbackFor(tierId, isNvidia) {
-  if (isNvidia && NVIDIA[tierId]) return NVIDIA[tierId]
-  return BASE[tierId] || BASE.cpu
-}
-
-export default function StarterModels({ addToast, onInstallStarted }) {
-  const { t } = useTranslation('home')
-  const { recommended, tier, isNvidia, loading } = useRecommendedModels({ count: 4 })
-  const [installing, setInstalling] = useState(() => new Set())
-
-  // While the hardware probe + gallery query are in flight, render nothing
-  // rather than flashing fallback content that may be replaced a moment later.
-  if (loading) return null
-
-  // Prefer live recommendations; fall back to the static list only when the
-  // gallery yielded nothing.
-  const items = (recommended && recommended.length > 0)
-    ? recommended.map(r => ({ name: r.name, size: r.sizeDisplay }))
-    : fallbackFor(tier.id, isNvidia)
-
-  if (items.length === 0) return null
-
-  const install = async (name) => {
-    setInstalling(prev => new Set(prev).add(name))
-    try {
-      await modelsApi.install(name)
-      addToast?.(t('starters.installStarted', { model: name }), 'success')
-      onInstallStarted?.(name)
-    } catch (err) {
-      addToast?.(t('starters.installFailed', { message: err.message }), 'error')
-      setInstalling(prev => {
-        const next = new Set(prev)
-        next.delete(name)
-        return next
-      })
-    }
-  }
-
-  return (
-    <section className="home-starters card">
-      <div className="home-starters-head">
-        <strong>{t('starters.title')}</strong>
-        <span className="home-starters-tier">
-          <i className={`fas ${tier.id === 'cpu' ? 'fa-memory' : 'fa-microchip'}`} aria-hidden="true" />
-          {t(`starters.tier.${tier.id}`)}
-        </span>
-      </div>
-      <p className="home-starters-sub">
-        {tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
-      </p>
-      <ul className="home-starters-list">
-        {items.map(c => {
-          const busy = installing.has(c.name)
-          return (
-            <li key={c.name} className="home-starters-item">
-              <span className="home-starters-name">{c.name}</span>
-              {isNvfp4Name(c.name) && <span className="badge badge-info home-starters-badge">NVFP4</span>}
-              {c.size && <span className="home-starters-size">{c.size}</span>}
-              <button
-                type="button"
-                className="btn btn-primary btn-sm"
-                disabled={busy}
-                onClick={() => install(c.name)}
-              >
-                {busy
-                  ? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('starters.installing')}</>)
-                  : (<><i className="fas fa-download" aria-hidden="true" /> {t('starters.install')}</>)}
-              </button>
-            </li>
-          )
-        })}
-      </ul>
-    </section>
-  )
-}
--- a/core/http/react-ui/src/hooks/usePolling.js
+++ b/core/http/react-ui/src/hooks/usePolling.js
@@ -1,66 +0,0 @@
-import { useEffect, useRef, useCallback } from 'react'
-
-// usePolling runs `fn` immediately and then on a fixed interval, with two
-// behaviours every hand-rolled setInterval in this app was missing:
-//
-//   1. Visibility-aware: the timer pauses while the tab is hidden
-//      (document.hidden) and fires an immediate catch-up poll when the tab
-//      becomes visible again. A backgrounded dashboard no longer hammers the
-//      server every few seconds for data nobody is looking at.
-//   2. Non-overlapping: if `fn` returns a promise that takes longer than the
-//      interval, the next tick waits for it instead of stacking requests.
-//
-// `enabled: false` stops polling entirely (one-shot or gated polls). The
-// returned `refetch` runs `fn` on demand and is stable across renders.
-export function usePolling(fn, intervalMs = 5000, { enabled = true, immediate = true } = {}) {
-  const fnRef = useRef(fn)
-  fnRef.current = fn
-
-  const runningRef = useRef(false)
-  const refetch = useCallback(async () => {
-    // Guard against overlap: a slow poll shouldn't pile up behind a fast timer.
-    if (runningRef.current) return
-    runningRef.current = true
-    try {
-      return await fnRef.current()
-    } finally {
-      runningRef.current = false
-    }
-  }, [])
-
-  useEffect(() => {
-    if (!enabled) return
-    let timer = null
-
-    const tick = () => { refetch() }
-
-    const start = () => {
-      if (timer != null) return
-      timer = setInterval(tick, intervalMs)
-    }
-    const stop = () => {
-      if (timer != null) { clearInterval(timer); timer = null }
-    }
-
-    const onVisibility = () => {
-      if (document.hidden) {
-        stop()
-      } else {
-        // Catch up immediately on return, then resume the cadence.
-        tick()
-        start()
-      }
-    }
-
-    if (immediate) tick()
-    if (!document.hidden) start()
-    document.addEventListener('visibilitychange', onVisibility)
-
-    return () => {
-      stop()
-      document.removeEventListener('visibilitychange', onVisibility)
-    }
-  }, [enabled, intervalMs, immediate, refetch])
-
-  return { refetch }
-}
--- a/core/http/react-ui/src/hooks/useRecommendedModels.js
+++ b/core/http/react-ui/src/hooks/useRecommendedModels.js
@@ -1,108 +0,0 @@
-import { useState, useEffect } from 'react'
-import { modelsApi } from '../utils/api'
-import { useResources } from './useResources'
-
-// Data-driven "recommended for your hardware" model picks. The gallery exposes
-// no popularity/download signal and the list response carries no size, so we:
-//   1. ask the server for chat-capable models in their natural (curated) order,
-//   2. estimate size/VRAM for the top candidates (same endpoint the Models page
-//      uses), and
-//   3. rank by hardware fit — smallest on CPU-only boxes, largest-that-fits on
-//      GPUs (bigger == better quality while still fitting VRAM).
-//
-// Returns `recommended === null` while loading, `[]` when nothing could be
-// resolved (gallery/estimates unavailable) so callers can fall back.
-
-const GB = 1024 * 1024 * 1024
-const DEFAULT_CTX = 4096
-
-// NVFP4 is a Blackwell/NVIDIA-specific 4-bit format — only worth suggesting on
-// NVIDIA hardware, and to be filtered out elsewhere.
-export const isNvfp4Name = (name) => /nvfp4/i.test(name || '')
-
-export function hasNvidiaGpu(resources) {
-  return Array.isArray(resources?.gpus) &&
-    resources.gpus.some(g => (g?.vendor || '').toLowerCase() === 'nvidia')
-}
-
-export function recommendTier(resources) {
-  const isGpu = resources?.type === 'gpu'
-  const vram = resources?.aggregate?.total_memory || 0
-  if (!isGpu || vram <= 0) return { id: 'cpu', vram: 0 }
-  if (vram < 8 * GB) return { id: 'gpu-small', vram }
-  if (vram < 24 * GB) return { id: 'gpu-mid', vram }
-  return { id: 'gpu-large', vram }
-}
-
-function rank(candidates, tier, count, isNvidia) {
-  // NVFP4 only runs on NVIDIA (Blackwell) — drop it everywhere else, and prefer
-  // it on NVIDIA boxes where it's the fastest path.
-  const pool = candidates.filter(c => c.sizeBytes != null && (isNvidia || !isNvfp4Name(c.name)))
-  if (tier.id === 'cpu') {
-    // No GPU: smallest models stay responsive on CPU.
-    return [...pool].sort((a, b) => a.sizeBytes - b.sizeBytes).slice(0, count)
-  }
-  const limit = tier.vram * 0.95
-  const fits = pool.filter(c => c.vramBytes != null && c.vramBytes <= limit)
-  const base = fits.length > 0 ? fits : pool // tiny GPU where nothing fits → fall through to smallest
-  const byPreference = (a, b) => {
-    // On NVIDIA, surface NVFP4 first; then largest-that-fits (best quality).
-    if (isNvidia) {
-      const an = isNvfp4Name(a.name), bn = isNvfp4Name(b.name)
-      if (an !== bn) return an ? -1 : 1
-    }
-    return fits.length > 0 ? b.sizeBytes - a.sizeBytes : a.sizeBytes - b.sizeBytes
-  }
-  return [...base].sort(byPreference).slice(0, count)
-}
-
-export function useRecommendedModels({ count = 4, candidatePool = 10 } = {}) {
-  const { resources } = useResources()
-  const [recommended, setRecommended] = useState(null)
-  const [error, setError] = useState(null)
-
-  const resReady = resources !== null
-  const tier = recommendTier(resources)
-  const isNvidia = hasNvidiaGpu(resources)
-
-  useEffect(() => {
-    if (!resReady) return
-    let cancelled = false
-    setRecommended(null)
-    setError(null)
-    ;(async () => {
-      try {
-        const data = await modelsApi.list({ tag: 'chat', items: candidatePool, page: 1 })
-        // Recommend models the user hasn't installed yet.
-        const models = (data?.models || []).filter(m => !m.installed)
-        const estimated = await Promise.all(models.map(async (m) => {
-          const name = m.name || m.id
-          try {
-            const e = await modelsApi.estimate(name, [DEFAULT_CTX])
-            const ctx = e?.estimates?.[String(DEFAULT_CTX)]
-            return {
-              name,
-              description: m.description,
-              sizeBytes: e?.sizeBytes ?? null,
-              sizeDisplay: e?.sizeDisplay ?? null,
-              vramBytes: ctx?.vramBytes ?? null,
-              vramDisplay: ctx?.vramDisplay ?? null,
-            }
-          } catch {
-            return { name, sizeBytes: null }
-          }
-        }))
-        if (cancelled) return
-        setRecommended(rank(estimated, tier, count, isNvidia))
-      } catch (e) {
-        if (cancelled) return
-        setError(e.message)
-        setRecommended([])
-      }
-    })()
-    return () => { cancelled = true }
-    // tier.id / tier.vram / isNvidia are primitives, so resource polling doesn't re-run this.
-  }, [resReady, tier.id, tier.vram, isNvidia, count, candidatePool])
-
-  return { recommended, tier, isNvidia, error, loading: recommended === null }
-}
--- a/core/http/react-ui/src/hooks/useResources.js
+++ b/core/http/react-ui/src/hooks/useResources.js
@@ -1,11 +1,11 @@
-import { useState, useCallback } from 'react'
+import { useState, useEffect, useCallback, useRef } from 'react'
 import { resourcesApi } from '../utils/api'
-import { usePolling } from './usePolling'

 export function useResources(pollInterval = 5000) {
  const [resources, setResources] = useState(null)
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState(null)
+  const intervalRef = useRef(null)

  const fetchResources = useCallback(async () => {
    try {
@@ -19,10 +19,13 @@ export function useResources(pollInterval = 5000) {
    }
  }, [])

-  // Visibility-aware polling: pauses while the tab is hidden and catches up on
-  // return (see usePolling). Resource stats are pure dashboard data, so there's
-  // no reason to keep fetching them for a backgrounded tab.
-  const { refetch } = usePolling(fetchResources, pollInterval)
+  useEffect(() => {
+    fetchResources()
+    intervalRef.current = setInterval(fetchResources, pollInterval)
+    return () => {
+      if (intervalRef.current) clearInterval(intervalRef.current)
+    }
+  }, [fetchResources, pollInterval])

-  return { resources, loading, error, refetch }
+  return { resources, loading, error, refetch: fetchResources }
 }
--- a/core/http/react-ui/src/pages/AgentChat.jsx
+++ b/core/http/react-ui/src/pages/AgentChat.jsx
@@ -765,10 +765,8 @@ export default function AgentChat() {
            className="chat-send-btn"
            onClick={handleSend}
            disabled={processing || !input.trim()}
-            aria-label="Send message"
-            title="Send message"
          >
-            <i className="fas fa-paper-plane" aria-hidden="true" />
+            <i className="fas fa-paper-plane" />
          </button>
        </div>
      </div>
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -1427,10 +1427,8 @@ export default function Chat() {
                className="chat-send-btn"
                onClick={handleSend}
                disabled={!input.trim() && files.length === 0}
-                aria-label={t('input.send')}
-                title={t('input.send')}
              >
-                <i className="fas fa-paper-plane" aria-hidden="true" />
+                <i className="fas fa-paper-plane" />
              </button>
            )}
          </div>
--- a/core/http/react-ui/src/pages/Home.jsx
+++ b/core/http/react-ui/src/pages/Home.jsx
@@ -10,7 +10,6 @@ import UnifiedMCPDropdown from '../components/UnifiedMCPDropdown'
 import ConfirmDialog from '../components/ConfirmDialog'
 import HomeConnect from '../components/HomeConnect'
 import { useResources } from '../hooks/useResources'
-import { usePolling } from '../hooks/usePolling'
 import { fileToBase64, backendControlApi, systemApi, modelsApi, mcpApi, nodesApi } from '../utils/api'
 import { API_CONFIG } from '../utils/config'
 import { greetingKey } from '../utils/greeting'
@@ -18,7 +17,6 @@ import StatusPill from '../components/StatusPill'
 import Skeleton from '../components/Skeleton'
 import SectionHeading from '../components/SectionHeading'
 import EmptyState from '../components/EmptyState'
-import StarterModels from '../components/StarterModels'
 import { staggerStyle } from '../hooks/useStagger'

 export default function Home() {
@@ -70,36 +68,40 @@ export default function Home() {
      .catch(() => {})
  }, [])

-  // Poll cluster node data in distributed mode. Visibility-aware + gated on
-  // distributedMode so a non-distributed or backgrounded tab makes no calls.
-  const fetchCluster = useCallback(async () => {
-    try {
-      const data = await nodesApi.list()
-      const nodes = Array.isArray(data) ? data : []
-      const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
-      const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
-      const usedVRAM = backendNodes.reduce((sum, n) => {
-        if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
-        return sum
-      }, 0)
-      const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
-      const usedRAM = backendNodes.reduce((sum, n) => {
-        if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
-        return sum
-      }, 0)
-      const isGPU = totalVRAM > 0
-      const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
-      const totalCount = backendNodes.length
-      setClusterData({
-        totalMem: isGPU ? totalVRAM : totalRAM,
-        usedMem: isGPU ? usedVRAM : usedRAM,
-        isGPU,
-        healthyCount,
-        totalCount,
-      })
-    } catch { setClusterData(null) }
-  }, [])
-  usePolling(fetchCluster, 5000, { enabled: distributedMode })
+  // Poll cluster node data in distributed mode
+  useEffect(() => {
+    if (!distributedMode) return
+    const fetchCluster = async () => {
+      try {
+        const data = await nodesApi.list()
+        const nodes = Array.isArray(data) ? data : []
+        const backendNodes = nodes.filter(n => !n.node_type || n.node_type === 'backend')
+        const totalVRAM = backendNodes.reduce((sum, n) => sum + (n.total_vram || 0), 0)
+        const usedVRAM = backendNodes.reduce((sum, n) => {
+          if (n.total_vram && n.available_vram != null) return sum + (n.total_vram - n.available_vram)
+          return sum
+        }, 0)
+        const totalRAM = backendNodes.reduce((sum, n) => sum + (n.total_ram || 0), 0)
+        const usedRAM = backendNodes.reduce((sum, n) => {
+          if (n.total_ram && n.available_ram != null) return sum + (n.total_ram - n.available_ram)
+          return sum
+        }, 0)
+        const isGPU = totalVRAM > 0
+        const healthyCount = backendNodes.filter(n => n.status === 'healthy').length
+        const totalCount = backendNodes.length
+        setClusterData({
+          totalMem: isGPU ? totalVRAM : totalRAM,
+          usedMem: isGPU ? usedVRAM : usedRAM,
+          isGPU,
+          healthyCount,
+          totalCount,
+        })
+      } catch { setClusterData(null) }
+    }
+    fetchCluster()
+    const interval = setInterval(fetchCluster, 5000)
+    return () => clearInterval(interval)
+  }, [distributedMode])

  // Fetch configured models (to know if any exist) and loaded models (currently running)
  const fetchSystemInfo = useCallback(async () => {
@@ -121,7 +123,11 @@ export default function Home() {
    }
  }, [])

-  usePolling(fetchSystemInfo, 5000)
+  useEffect(() => {
+    fetchSystemInfo()
+    const interval = setInterval(fetchSystemInfo, 5000)
+    return () => clearInterval(interval)
+  }, [fetchSystemInfo])

  // Check MCP availability when selected model changes
  useEffect(() => {
@@ -517,8 +523,6 @@ export default function Home() {
            </div>
          </div>

-          <StarterModels addToast={addToast} onInstallStarted={fetchSystemInfo} />
-
          <div className="home-wizard-actions">
            <button className="btn btn-primary" onClick={() => navigate('/app/models')}>
              <i className="fas fa-store" /> {t('wizard.browseGallery')}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ettore Di Giacinto	c23fc5fb42	docs(llama-cpp): correct run.sh comment for arm64/darwin cpu-all arm64 and darwin CPU images now also ship llama-cpp-cpu-all (not fallback-only); only GPU images ship fallback-only. Fix the stale comment to match. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]	2026-06-25 07:05:06 +00:00
Ettore Di Giacinto	292c1cab94	fix(llama-cpp,turboquant): only CPU_ALL_VARIANTS for pure-CPU builds, GPU uses fallback The previous gate sent every non-hipblas build through llama-cpp-cpu-all, so the GPU image builds (cublas, sycl_f16/f32, vulkan, nvidia l4t) compiled the whole CPU microarch variant matrix on top of their already-huge GPU backend - blowing the build time (the sycl job was only 59% done after 2h11m) - and the arm64 l4t build failed at `apt-get install gcc-14` (exit 100) on the Jetson base. Gate on an empty BUILD_TYPE instead: only the pure CPU image (build-type: '' in .github/backend-matrix.yml) builds the CPU_ALL_VARIANTS set; every GPU build gets a single fallback CPU grpc-server, since the accelerator does the compute. This also confines the arm64 gcc-14 step (needed for the armv9.2 SME variants) to the CPU build, away from the GPU base images. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]	2026-06-25 07:04:06 +00:00
Ettore Di Giacinto	4e9bb4f879	fix(llama-cpp-darwin): distribute ggml backends by suffix (.so root, .dylib lib) ggml emits its loadable backends (per-microarch CPU variants, metal, blas) with a .so suffix even on darwin, while the core libraries (ggml-base/ggml/llama/ llama-common/mtmd) use .dylib. Split the distribution by suffix: .so DL backends go in the package root for ggml's executable-directory scan, .dylib core libs go in lib/ for DYLD_LIBRARY_PATH. The previous .dylib name-pattern matched none of the variants. Verified on an M4: ggml loads the apple_m4 CPU variant (SME=1) and Metal, model loads and generates correct tokens. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]	2026-06-24 21:59:29 +00:00
Ettore Di Giacinto	3b47122e54	feat(llama-cpp,turboquant): arm64 gcc-14 for SME variants + darwin cpu-all packaging - arm64: ggml CPU_ALL_VARIANTS builds armv9.2 SME variants whose -march=...+sme is rejected by the Ubuntu 24.04 default gcc-13. Build the arm64 variants with gcc-14 (installed in the compile step). The host only selects a variant it actually supports at runtime, but every variant must still compile. - darwin: scripts/build/llama-cpp-darwin.sh builds llama-cpp-cpu-all instead of the fallback binary, keeps Metal (GGML_METAL stays ON; --target ggml also builds ggml-metal). The per-microarch libggml-cpu-*.dylib are placed in the package root next to the binary (darwin has no bundled ld.so, so ggml's executable-dir scan looks there), while the other shared dylibs go in lib/ for DYLD_LIBRARY_PATH. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]	2026-06-24 21:50:29 +00:00
Ettore Di Giacinto	379fa3e525	feat(llama-cpp,turboquant): extend CPU_ALL_VARIANTS to arm64 + turboquant - llama-cpp: x86 AND arm64 now use the single llama-cpp-cpu-all build (only hipblas keeps the fallback build). ggml's arm64 variant table (armv8.x / armv9.x, plus apple_m* on darwin) is selected at runtime. - turboquant: same recipe via a turboquant-cpu-all target. turboquant copies backend/cpp/llama-cpp's CMakeLists.txt + Makefile per flavor, so the hw_grpc_proto STATIC fix and the SHARED_LIBS / EXTRA_CMAKE_ARGS make-vars are inherited; the target just passes SHARED_LIBS=ON, the DL flags and --target ggml through, then collects the .so set. run.sh and package.sh updated to ship/select turboquant-cpu-all. - Makefile lib-collection find now also matches *.dylib (for the darwin build, which emits dylibs rather than .so). ik-llama-cpp is intentionally left unchanged: its pinned ggml has no CPU_ALL_VARIANTS support and its IQK kernels require AVX2, so the per-microarch dynamic backend set does not apply. Scope still excludes the darwin packaging wiring (separate change). Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]	2026-06-24 21:33:32 +00:00
Ettore Di Giacinto	e47c58656f	feat(llama-cpp): single x86 CPU build via ggml CPU_ALL_VARIANTS Replace the per-microarch avx/avx2/avx512/fallback multi-binary build on x86 with a single grpc-server plus the dlopen-able libggml-cpu-*.so set that ggml's backend registry selects at runtime by probing host CPU features. One build instead of four, broader microarch coverage (adds alderlake AVX-VNNI, zen4 AVX512-BF16, sapphirerapids AMX), and the shell-side /proc/cpuinfo probing in run.sh goes away. Build/link notes: - CPU_ALL_VARIANTS requires GGML_BACKEND_DL + BUILD_SHARED_LIBS=ON, so ggml/llama become shared objects. SHARED_LIBS is now a make variable (default OFF) so the override survives the recursive sub-make into the VARIANT build dir instead of being re-clobbered by the base flags. - The cpu-all target also builds "--target ggml": the per-microarch backends are runtime-dlopened, not link deps, so they only compile via ggml's add_dependencies(). - hw_grpc_proto is pinned STATIC. Under BUILD_SHARED_LIBS=ON it would otherwise become a DSO referencing hidden-visibility symbols in the static libprotobuf.a, which fails to link ("hidden symbol ... is referenced by DSO"). Keeping it static links gRPC/protobuf into the executable while only ggml/llama stay shared, so no PIC or base-image change is required. - package.sh bundles the libggml-*.so set into package/lib; ggml finds them by scanning the bundled ld.so directory (/proc/self/exe), which run.sh launches from. Scope: x86 only. arm64/darwin keep the single fallback build. The ik-llama-cpp / turboquant forks and the other ggml C++ backends are unchanged; the same recipe applies but is out of scope here. Validated with a full docker build plus a live inference smoke test: the model loads, ggml selects the AVX512_BF16 variant on a Zen-class host, and tokens generate correctly. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]	2026-06-24 21:21:03 +00:00