fix(backend): don't let a client disconnect cancel the model load

Image generation (and the tts/transcript/embeddings/vad/rerank/llm helpers) passes the request context to loader.Load so distributed routing decisions reach the request's X-LocalAI-Node holder. That context also governs cancellation of the load, so when a client disconnects mid-load the LoadModel RPC is aborted, stopLoadProcess tears down the backend process, and every retry restarts from scratch. Heavy diffusers/LLM models on a slow host (e.g. a shared-memory iGPU) take long enough to load that the request routinely ends first, so the model never finishes loading and the UI shows "NetworkError when attempting to fetch resource".

Wrap the load context with context.WithoutCancel: the routing holder value still propagates, but the request's cancellation no longer aborts the load, so it runs to completion and caches for the next request. Inference keeps the cancellable request context, so a disconnect still stops generation.

Adds a regression spec asserting a canceled request context does not cancel the model load while the routing holder still reaches the router.

Fixes #10636

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-07-04 21:37:02 -04:00 · 2026-07-02 20:52:51 +00:00
101 changed files with 548 additions and 2993 deletions
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=bbc7de475178dd0535c16ad85f204a2529806c9d
+IK_LLAMA_VERSION?=068b173649f2fd8dc96b35ada5a0b76d8985105d
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=d4cff114c0084f1fbc9b4c62717eca8fb2ae494a
+LLAMA_VERSION?=4fc4ec5541b243957ae5099edb67372f8f3b550e
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -36,12 +36,6 @@ else
 	if [ -d "$CURDIR/lib/rocblas/library" ]; then
 		export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
 	fi
-	# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
-	# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
-	# the bundled data or it falls back to slow generic kernels (issue #10660).
-	if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
-		export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
-	fi
 fi

 # If there is a lib/ld.so, use it
--- a/backend/cpp/privacy-filter/Makefile
+++ b/backend/cpp/privacy-filter/Makefile
@@ -8,7 +8,7 @@
 # Local development: point at a working checkout instead of cloning, e.g.
 #   make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server

-PRIVACY_FILTER_VERSION?=735a6c28607ee82afc3a670383f41b55266a3b9a
+PRIVACY_FILTER_VERSION?=595f59630c69d361b5196f2aba2c71c873d0c13c
 PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
 PRIVACY_FILTER_SRC?=

--- a/backend/cpp/turboquant/run.sh
+++ b/backend/cpp/turboquant/run.sh
@@ -34,12 +34,6 @@ else
 	if [ -d "$CURDIR/lib/rocblas/library" ]; then
 		export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
 	fi
-	# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
-	# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
-	# the bundled data or it falls back to slow generic kernels (issue #10660).
-	if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
-		export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
-	fi
 fi

 # If there is a lib/ld.so, use it
--- a/backend/go/acestep-cpp/CMakeLists.txt
+++ b/backend/go/acestep-cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ target_include_directories(goacestepcpp PRIVATE ${ACESTEP_DIR}/src ${ACESTEP_DIR
 target_include_directories(goacestepcpp SYSTEM PRIVATE ${ACESTEP_DIR}/ggml/include)

 # Link GPU backends if available (mirrors link_ggml_backends macro)
-foreach(backend blas cuda hip metal vulkan)
+foreach(backend blas cuda metal vulkan)
    if(TARGET ggml-${backend})
        target_link_libraries(goacestepcpp PRIVATE ggml-${backend})
        string(TOUPPER ${backend} BACKEND_UPPER)
--- a/backend/go/acestep-cpp/Makefile
+++ b/backend/go/acestep-cpp/Makefile
@@ -24,14 +24,7 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
-	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
-	ROCM_HOME ?= /opt/rocm
-	ROCM_PATH ?= /opt/rocm
-	export CXX=$(ROCM_HOME)/llvm/bin/clang++
-	export CC=$(ROCM_HOME)/llvm/bin/clang
-	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=f35185b876fc482fcb2053a81a2697936ed5fcc0
+CRISPASR_VERSION?=fcbc8718e654995e3bd2d0c98bcb8e55e297d23c
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/omnivoice-cpp/CMakeLists.txt
+++ b/backend/go/omnivoice-cpp/CMakeLists.txt
@@ -30,7 +30,7 @@ target_include_directories(gomnivoicecpp PRIVATE ${OMNIVOICE_DIR}/src)
 target_include_directories(gomnivoicecpp SYSTEM PRIVATE ${OMNIVOICE_DIR}/ggml/include)

 # Link GPU backends if the upstream ggml created them.
-foreach(backend blas cuda hip metal vulkan sycl)
+foreach(backend blas cuda metal vulkan sycl)
    if(TARGET ggml-${backend})
        target_link_libraries(gomnivoicecpp PRIVATE ggml-${backend})
        if(backend STREQUAL "cuda")
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -24,14 +24,7 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
-	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
-	ROCM_HOME ?= /opt/rocm
-	ROCM_PATH ?= /opt/rocm
-	export CXX=$(ROCM_HOME)/llvm/bin/clang++
-	export CC=$(ROCM_HOME)/llvm/bin/clang
-	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/qwen3-tts-cpp/CMakeLists.txt
+++ b/backend/go/qwen3-tts-cpp/CMakeLists.txt
@@ -30,7 +30,7 @@ target_include_directories(goqwen3ttscpp PRIVATE ${QWENTTS_DIR}/src)
 target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWENTTS_DIR}/ggml/include)

 # Link GPU backends if the upstream ggml created them.
-foreach(backend blas cuda hip metal vulkan sycl)
+foreach(backend blas cuda metal vulkan sycl)
    if(TARGET ggml-${backend})
        target_link_libraries(goqwen3ttscpp PRIVATE ggml-${backend})
        if(backend STREQUAL "cuda")
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -24,14 +24,7 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
-	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
-	ROCM_HOME ?= /opt/rocm
-	ROCM_PATH ?= /opt/rocm
-	export CXX=$(ROCM_HOME)/llvm/bin/clang++
-	export CC=$(ROCM_HOME)/llvm/bin/clang
-	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=2574f5936571645f784b77623e1f09bad97d948a
+STABLEDIFFUSION_GGML_VERSION?=3590aa8d626e671a1b1dc84506ea2932a243a480

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/vibevoice-cpp/CMakeLists.txt
+++ b/backend/go/vibevoice-cpp/CMakeLists.txt
@@ -50,7 +50,7 @@ target_include_directories(govibevoicecpp SYSTEM PRIVATE ${VIBEVOICE_DIR}/third_
 # Link GPU backends if available — vibevoice's own CMake already links
 # these to the libvibevoice STATIC library, but we re-link them on the
 # MODULE so resolved symbols include all backend kernels.
-foreach(backend blas cuda hip metal vulkan)
+foreach(backend blas cuda metal vulkan)
    if(TARGET ggml-${backend})
        target_link_libraries(govibevoicecpp PRIVATE ggml-${backend})
        string(TOUPPER ${backend} BACKEND_UPPER)
--- a/backend/go/vibevoice-cpp/Makefile
+++ b/backend/go/vibevoice-cpp/Makefile
@@ -29,14 +29,7 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
-	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
-	ROCM_HOME ?= /opt/rocm
-	ROCM_PATH ?= /opt/rocm
-	export CXX=$(ROCM_HOME)/llvm/bin/clang++
-	export CC=$(ROCM_HOME)/llvm/bin/clang
-	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DVIBEVOICE_GGML_HIPBLAS=ON
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON -DVIBEVOICE_GGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/python/common/mlx_utils.py
+++ b/backend/python/common/mlx_utils.py
@@ -20,15 +20,7 @@ def split_reasoning(text, think_start, think_end):
    Returns ``(reasoning_content, remaining_text)``. When ``think_start`` is
    empty or not found, returns ``("", text)`` unchanged.
    """
-    if not think_start or not text:
-        return "", text
-    if think_start not in text:
-        # Models like Qwen3.5 open assistant turns already INSIDE thinking, so
-        # the generated text carries only the closing tag. Everything before it
-        # is reasoning that would otherwise leak into the content.
-        if think_end and think_end in text:
-            head, _, tail = text.partition(think_end)
-            return head.strip(), tail.strip()
+    if not think_start or not text or think_start not in text:
        return "", text
    pattern = re.compile(
        re.escape(think_start) + r"(.*?)" + re.escape(think_end or ""),
--- a/backend/python/common/mlx_utils_test.py
+++ b/backend/python/common/mlx_utils_test.py
@@ -1,75 +0,0 @@
-"""Unit tests for the mlx/mlx-vlm shared helpers (mlx_utils.py).
-
-Run standalone (Python standard library only, no backend venv needed):
-    python3 -m unittest mlx_utils_test
-
-These mirror the server-less helper tests in backend/python/mlx/test.py
-(TestSharedHelpers), but live here so they run on any platform: the mlx
-test module imports grpc/backend_pb2 at import time and needs the MLX venv,
-whereas mlx_utils only needs the standard library.
-"""
-
-import types
-import unittest
-
-from mlx_utils import parse_tool_calls, split_reasoning
-
-
-class TestSplitReasoning(unittest.TestCase):
-    def test_both_tags(self):
-        r, c = split_reasoning(
-            "<think>step 1\nstep 2</think>The answer is 42.", "<think>", "</think>"
-        )
-        self.assertEqual(r, "step 1\nstep 2")
-        self.assertEqual(c, "The answer is 42.")
-
-    def test_implicit_opener_only_closing_tag(self):
-        # Qwen3.5 opens the assistant turn already inside thinking, so the
-        # output carries only the closing tag; everything before it is reasoning.
-        r, c = split_reasoning(
-            "The user is asking about the weather.\n</think>\n\nThe weather in Rome is sunny.",
-            "<think>",
-            "</think>",
-        )
-        self.assertEqual(r, "The user is asking about the weather.")
-        self.assertEqual(c, "The weather in Rome is sunny.")
-
-    def test_no_tags_at_all(self):
-        r, c = split_reasoning("just text", "<think>", "</think>")
-        self.assertEqual(r, "")
-        self.assertEqual(c, "just text")
-
-    def test_empty_think_end_and_no_opener_match(self):
-        # No think_end to anchor on, and the opener is absent → return unchanged.
-        r, c = split_reasoning("no opener here", "<think>", "")
-        self.assertEqual(r, "")
-        self.assertEqual(c, "no opener here")
-
-    def test_empty_text(self):
-        r, c = split_reasoning("", "<think>", "</think>")
-        self.assertEqual(r, "")
-        self.assertEqual(c, "")
-
-
-class TestParseToolCalls(unittest.TestCase):
-    def test_with_shim(self):
-        tm = types.SimpleNamespace(
-            tool_call_start="<tool_call>",
-            tool_call_end="</tool_call>",
-            parse_tool_call=lambda body, tools: {
-                "name": "get_weather",
-                "arguments": {"location": body.strip()},
-            },
-        )
-        calls, remaining = parse_tool_calls(
-            "Sure: <tool_call>Paris</tool_call>", tm, tools=None
-        )
-        self.assertEqual(len(calls), 1)
-        self.assertEqual(calls[0]["name"], "get_weather")
-        self.assertEqual(calls[0]["arguments"], '{"location": "Paris"}')
-        self.assertEqual(calls[0]["index"], 0)
-        self.assertNotIn("<tool_call>", remaining)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/backend/python/common/python_utils.py
+++ b/backend/python/common/python_utils.py
@@ -58,18 +58,7 @@ def messages_to_dicts(proto_messages):
            d["reasoning_content"] = msg.reasoning_content
        if msg.tool_calls:
            try:
-                tool_calls = json.loads(msg.tool_calls)
-                # Chat templates (e.g. Qwen) iterate function.arguments as a
-                # mapping, but the OpenAI wire format carries it as a JSON
-                # string — decode it back so the template's .items() works.
-                for tc in tool_calls:
-                    fn = tc.get("function") if isinstance(tc, dict) else None
-                    if isinstance(fn, dict) and isinstance(fn.get("arguments"), str):
-                        try:
-                            fn["arguments"] = json.loads(fn["arguments"])
-                        except json.JSONDecodeError:
-                            pass
-                d["tool_calls"] = tool_calls
+                d["tool_calls"] = json.loads(msg.tool_calls)
            except json.JSONDecodeError:
                pass
        result.append(d)
--- a/backend/python/common/python_utils_test.py
+++ b/backend/python/common/python_utils_test.py
@@ -1,122 +0,0 @@
-"""Unit tests for the shared python backend helpers (python_utils.py).
-
-Run standalone (Python standard library only, no backend venv needed):
-    python3 -m unittest python_utils_test
-
-These mirror the server-less helper tests in backend/python/mlx/test.py
-(TestSharedHelpers), but live here so they run on any platform: the mlx
-test module imports grpc/backend_pb2 at import time and needs the MLX venv,
-whereas python_utils has no third-party dependency. Proto Message objects
-are faked with types.SimpleNamespace (real proto fields default to "").
-"""
-
-import json
-import types
-import unittest
-
-from python_utils import messages_to_dicts, parse_options
-
-
-def _msg(**fields):
-    """Fake a proto Message: every unset field is the empty string, as protobuf."""
-    defaults = {
-        "role": "",
-        "content": "",
-        "name": "",
-        "tool_call_id": "",
-        "reasoning_content": "",
-        "tool_calls": "",
-    }
-    defaults.update(fields)
-    return types.SimpleNamespace(**defaults)
-
-
-class TestParseOptions(unittest.TestCase):
-    def test_type_inference(self):
-        opts = parse_options(
-            ["temperature:0.7", "max_tokens:128", "trust:true", "name:hello", "no_colon_skipped"]
-        )
-        self.assertEqual(opts["temperature"], 0.7)
-        self.assertEqual(opts["max_tokens"], 128)
-        self.assertIs(opts["trust"], True)
-        self.assertEqual(opts["name"], "hello")
-        self.assertNotIn("no_colon_skipped", opts)
-
-
-class TestMessagesToDicts(unittest.TestCase):
-    def test_basic_fields(self):
-        out = messages_to_dicts(
-            [
-                _msg(role="user", content="hi"),
-                _msg(role="tool", content="42", tool_call_id="call_1", name="f"),
-            ]
-        )
-        self.assertEqual(out[0], {"role": "user", "content": "hi"})
-        self.assertEqual(out[1]["tool_call_id"], "call_1")
-        self.assertEqual(out[1]["name"], "f")
-
-    def test_tool_call_arguments_string_decoded_to_mapping(self):
-        # OpenAI wire format ships function.arguments as a JSON *string*; chat
-        # templates iterate it as a mapping, so it must come back as a dict.
-        out = messages_to_dicts(
-            [
-                _msg(
-                    role="assistant",
-                    tool_calls=json.dumps(
-                        [
-                            {
-                                "id": "call_1",
-                                "type": "function",
-                                "function": {
-                                    "name": "get_weather",
-                                    "arguments": '{"location": "Rome"}',
-                                },
-                            }
-                        ]
-                    ),
-                )
-            ]
-        )
-        args = out[0]["tool_calls"][0]["function"]["arguments"]
-        self.assertEqual(args, {"location": "Rome"})
-        self.assertEqual(dict(args.items()), {"location": "Rome"})
-
-    def test_tool_call_arguments_already_mapping_is_idempotent(self):
-        out = messages_to_dicts(
-            [
-                _msg(
-                    role="assistant",
-                    tool_calls=json.dumps(
-                        [{"function": {"name": "f", "arguments": {"a": 1}}}]
-                    ),
-                )
-            ]
-        )
-        self.assertEqual(out[0]["tool_calls"][0]["function"]["arguments"], {"a": 1})
-
-    def test_tool_call_arguments_invalid_json_left_as_string(self):
-        out = messages_to_dicts(
-            [
-                _msg(
-                    role="assistant",
-                    tool_calls=json.dumps(
-                        [{"function": {"name": "f", "arguments": "not-json"}}]
-                    ),
-                )
-            ]
-        )
-        self.assertEqual(out[0]["tool_calls"][0]["function"]["arguments"], "not-json")
-
-    def test_tool_call_without_function_key(self):
-        out = messages_to_dicts(
-            [_msg(role="assistant", tool_calls=json.dumps([{"id": "call_1"}]))]
-        )
-        self.assertEqual(out[0]["tool_calls"], [{"id": "call_1"}])
-
-    def test_tool_calls_invalid_json_dropped(self):
-        out = messages_to_dicts([_msg(role="assistant", tool_calls="{not json")])
-        self.assertNotIn("tool_calls", out[0])
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/backend/python/vllm/install.sh
+++ b/backend/python/vllm/install.sh
@@ -35,21 +35,6 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
 fi

-# AMD ROCm: vLLM ships prebuilt ROCm wheels, but on a DEDICATED index
-# (https://wheels.vllm.ai/rocm/), NOT PyPI, and ONLY for CPython 3.12. On any
-# other Python the installer silently falls back to the CUDA-only PyPI wheel,
-# which is unusable on an AMD GPU (import fails, so the backend never finds the
-# vllm module). Force Python 3.12 before the venv is created (matches the
-# intel/l4t13 cp312 bump); the hipblas branch below pulls vllm from the ROCm
-# wheel index. unsafe-best-match lets uv consult that index and PyPI together.
-# https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html?device=rocm
-if [ "x${BUILD_TYPE}" == "xhipblas" ]; then
-    PYTHON_VERSION="3.12"
-    PYTHON_PATCH="12"
-    PY_STANDALONE_TAG="20251120"
-    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
-fi
-
 # cublas13 pulls the vLLM wheel from a per-tag cu130 index (PyPI's vllm wheel
 # is built against CUDA 12 and won't load on cu130). uv's default per-package
 # first-match strategy would still pick the PyPI wheel, so allow it to consult
@@ -119,7 +104,7 @@ if [ "$(uname -s)" = "Darwin" ]; then
    # can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
    # vllm pin (requirements-cublas13-after.txt, bumped independently against
    # vllm/vllm) until vllm-metal supports a newer vLLM.
-    VLLM_METAL_VERSION="v0.3.0.dev20260701212152"
+    VLLM_METAL_VERSION="v0.3.0.dev20260701132215"

    # The coupled vLLM source version is whatever this vllm-metal release builds
    # against -- it declares it in its own installer as `vllm_v=`. Derive it from
@@ -209,22 +194,6 @@ elif [ "x${BUILD_TYPE}" == "xintel" ]; then
        export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH:-}"
        VLLM_TARGET_DEVICE=xpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
    popd
-# AMD ROCm: install vllm from its dedicated ROCm wheel index instead of the
-# CUDA-only PyPI wheel. installRequirements brings the base ROCm
-# torch/transformers (requirements-hipblas.txt), then we pull vllm (plus the
-# matching ROCm torch, via --upgrade) from wheels.vllm.ai/rocm. This is the
-# method upstream prescribes for AMD; the Python-3.12 pin is set above.
-# There is intentionally no requirements-hipblas-after.txt: a bare `vllm`
-# there would resolve to the CUDA wheel, and installRequirements never loads
-# a ${BUILD_TYPE}-after file for hipblas anyway (BUILD_TYPE == BUILD_PROFILE).
-# https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html?device=rocm
-elif [ "x${BUILD_TYPE}" == "xhipblas" ]; then
-    installRequirements
-
-    # --upgrade reconciles the base ROCm torch to whatever the vllm ROCm wheel
-    # pins; --extra-index-url adds the ROCm wheel repository on top of PyPI.
-    uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} \
-        --extra-index-url https://wheels.vllm.ai/rocm/ --upgrade vllm
 # FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
 # requirements-cpu-after.txt and compiles vllm locally against the host's
 # actual CPU. Not used by default because it takes ~30-40 minutes, but
--- a/backend/python/vllm/requirements-hipblas-after.txt
+++ b/backend/python/vllm/requirements-hipblas-after.txt
@@ -0,0 +1 @@
+vllm
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -473,13 +473,20 @@ func New(opts ...config.AppOption) (*Application, error) {

 	if options.LoadToMemory != nil && !options.SingleBackend {
 		for _, m := range options.LoadToMemory {
-			xlog.Debug("Auto loading model into memory from file", "model", m)
-			// Same path as POST /backend/load: a realtime pipeline model expands
-			// to its sub-models, and load failures are recorded as model_load
-			// traces.
-			if _, err := backend.PreloadModelByName(options.Context, application.ModelConfigLoader(), application.ModelLoader(), options, m); err != nil {
+			cfg, err := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(m, options)
+			if err != nil {
 				return nil, err
 			}
+
+			xlog.Debug("Auto loading model into memory from file", "model", m, "file", cfg.Model)
+
+			o := backend.ModelOptions(*cfg, options)
+
+			var backendErr error
+			_, backendErr = application.ModelLoader().Load(o...)
+			if backendErr != nil {
+				return nil, backendErr
+			}
 		}
 	}

--- a/core/backend/ctx_propagation_test.go
+++ b/core/backend/ctx_propagation_test.go
@@ -157,6 +157,33 @@ var _ = Describe("X-LocalAI-Node ctx propagation contract", func() {
 		stampViaRouterCtx()
 	})

+	// Regression for #10636: a canceled request context must NOT cancel the
+	// model LOAD. The heavy image/audio backends bind the load to the request
+	// context so the routing holder reaches the SmartRouter; but a large
+	// diffusers/LLM model on a slow (e.g. shared-memory iGPU) host can take
+	// far longer to load than the client stays connected. If the request's
+	// cancellation propagates to the load, the LoadModel RPC is aborted, the
+	// backend process is torn down, and every retry restarts from scratch and
+	// never converges. The load must instead run to completion and cache while
+	// still carrying the request's routing holder value.
+	It("ImageGeneration does not propagate request cancellation to the model load", func() {
+		canceledCtx, cancel := context.WithCancel(reqCtx)
+		cancel() // client disconnected while the (slow) load was still running
+
+		_, err := backend.ImageGeneration(canceledCtx, 64, 64, 1, 0, "p", "", "", "/tmp/out.png", loader, modelCfg, appCfg, nil)
+		// The load reached the router (short-circuit sentinel), i.e. it was
+		// NOT aborted early by the already-canceled request context.
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("router short-circuit (test)"))
+
+		routerCtx := routerCtxOf()
+		Expect(routerCtx).ToNot(BeNil(), "router callback must have been invoked")
+		Expect(routerCtx.Err()).To(BeNil(),
+			"a canceled request must not cancel the model load")
+		// The routing holder value still propagates despite the decoupling.
+		stampViaRouterCtx()
+	})
+
 	It("does NOT leak the holder when the app context is used instead", func() {
 		// Sanity: the bug being fixed manifests as the router getting
 		// appCfg.Context (no holder) instead of reqCtx (holder). A direct
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -40,10 +40,14 @@ func (e *modelEmbedder) Embed(ctx context.Context, text string) ([]float32, erro

 func ModelEmbedding(ctx context.Context, s string, tokens []int, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {

-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))

 	inferenceModel, err := loader.Load(opts...)
 	if err != nil {
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -13,10 +13,14 @@ import (

 func ImageGeneration(ctx context.Context, height, width, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, refImages []string) (func() error, error) {

-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	inferenceModel, err := loader.Load(
 		opts...,
 	)
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -111,7 +111,12 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 	}
 	ctx = distributedhdr.MaybeWithPrefixChain(ctx, c.ModelID(), chainSource)

-	opts := ModelOptions(*c, o, model.WithContext(ctx))
+	// context.WithoutCancel decouples the model load from the request's
+	// cancellation while preserving its routing values, so a slow load still
+	// completes and caches if the client disconnects instead of aborting the
+	// LoadModel RPC mid-load (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(*c, o, model.WithContext(context.WithoutCancel(ctx)))
 	inferenceModel, err := loader.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(o, c.Name, c.Backend, err, map[string]any{"model_file": modelFile})
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -52,22 +52,6 @@ func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.Back
 	}
 }

-// PreloadModel warms a model into memory without running any inference, so the
-// first real request doesn't pay the backend's cold-start load cost. It uses
-// the same ModelOptions + ml.Load path the modality functions use, so a
-// subsequent inference call hits the loader cache instead of reloading. Load
-// failures are recorded and returned; callers that warm models opportunistically
-// (e.g. realtime session warm-up) typically log and continue, since the lazy
-// path will retry on first use.
-func PreloadModel(ctx context.Context, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) error {
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
-	if _, err := ml.Load(opts...); err != nil {
-		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
-		return err
-	}
-	return nil
-}
-
 // recordModelLoadFailure records a backend trace when model loading fails.
 func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
 	if !appConfig.EnableTracing {
--- a/core/backend/preload.go
+++ b/core/backend/preload.go
@@ -1,122 +0,0 @@
-package backend
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"sync"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/xlog"
-)
-
-// PreloadModelByName loads the named model into memory so the first request
-// that uses it pays no cold-start load cost — the inverse of shutting a model
-// down. If the model is a realtime pipeline (its config declares a `pipeline:`
-// block), each configured sub-model (VAD, transcription, LLM, TTS,
-// sound_detection, voice_recognition) is loaded concurrently instead of the
-// pipeline stub, which has no backend of its own. It returns the model names
-// actually loaded and a joined error naming each sub-model that failed (nil on
-// full success); a partial pipeline load reports both the loaded names and the
-// failures so the caller can surface exactly what is and isn't resident.
-// Compaction's summary_model is deliberately left cold: it is only invoked off
-// the response path, so it can stay lazy.
-func PreloadModelByName(ctx context.Context, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, name string) ([]string, error) {
-	cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(name, appConfig)
-	if err != nil {
-		return nil, err
-	}
-
-	stages, err := pipelineStages(cl, &cfg.Pipeline, ml.ModelPath)
-	if err != nil {
-		return nil, err
-	}
-	if len(stages) == 0 {
-		// Not a pipeline: load the model's own backend directly.
-		if err := PreloadModel(ctx, ml, *cfg, appConfig); err != nil {
-			return nil, err
-		}
-		return []string{cfg.Name}, nil
-	}
-	return PreloadStages(ctx, ml, appConfig, stages)
-}
-
-// PreloadStage names one pipeline sub-model to preload and the resolved config
-// to load it from (nil = stage absent, skipped). Role labels the pipeline slot
-// in errors and logs.
-type PreloadStage struct {
-	Role string
-	Cfg  *config.ModelConfig
-}
-
-// loadStage is PreloadModel behind a seam so PreloadStages can be unit-tested
-// without spawning real backends.
-var loadStage = PreloadModel
-
-// pipelineStages resolves each populated pipeline stage to its concrete model
-// config, following a single alias hop — the same resolution the realtime
-// pipeline itself uses. A stage that fails to resolve is a misconfiguration,
-// so it fails fast rather than being deferred to load. A pipeline with no
-// stages set returns nil, which callers treat as "not a pipeline".
-func pipelineStages(cl *config.ModelConfigLoader, p *config.Pipeline, modelPath string) ([]PreloadStage, error) {
-	voiceRec := ""
-	if p.VoiceRecognition != nil {
-		voiceRec = p.VoiceRecognition.Model
-	}
-	var stages []PreloadStage
-	for _, s := range []struct{ role, name string }{
-		{"vad", p.VAD},
-		{"transcription", p.Transcription},
-		{"llm", p.LLM},
-		{"tts", p.TTS},
-		{"sound_detection", p.SoundDetection},
-		{"voice_recognition", voiceRec},
-	} {
-		if s.name == "" {
-			continue
-		}
-		cfg, err := cl.LoadResolvedModelConfig(s.name, modelPath)
-		if err != nil {
-			return nil, fmt.Errorf("%s (%s): %w", s.role, s.name, err)
-		}
-		stages = append(stages, PreloadStage{Role: s.role, Cfg: cfg})
-	}
-	return stages, nil
-}
-
-// PreloadStages loads every present stage at once and waits for all of them, so
-// a pipeline warms in the time of its slowest stage rather than the sum. Absent
-// (nil-config) stages are skipped. A failed stage does not cancel the others —
-// they all run to completion so the joined error names every broken stage at
-// once, alongside the names that did load.
-func PreloadStages(ctx context.Context, ml *model.ModelLoader, appConfig *config.ApplicationConfig, stages []PreloadStage) ([]string, error) {
-	var (
-		wg     sync.WaitGroup
-		mu     sync.Mutex
-		loaded []string
-		errs   []error
-	)
-	for _, s := range stages {
-		if s.Cfg == nil {
-			continue
-		}
-		wg.Add(1)
-		go func(s PreloadStage) {
-			defer wg.Done()
-			if err := loadStage(ctx, ml, *s.Cfg, appConfig); err != nil {
-				xlog.Warn("preload: failed to load pipeline sub-model", "stage", s.Role, "model", s.Cfg.Name, "error", err)
-				mu.Lock()
-				errs = append(errs, fmt.Errorf("%s (%s): %w", s.Role, s.Cfg.Name, err))
-				mu.Unlock()
-				return
-			}
-			xlog.Debug("preload: loaded pipeline sub-model", "stage", s.Role, "model", s.Cfg.Name)
-			mu.Lock()
-			loaded = append(loaded, s.Cfg.Name)
-			mu.Unlock()
-		}(s)
-	}
-	wg.Wait()
-	return loaded, errors.Join(errs...)
-}
--- a/core/backend/preload_internal_test.go
+++ b/core/backend/preload_internal_test.go
@@ -1,146 +0,0 @@
-package backend
-
-import (
-	"context"
-	"errors"
-	"os"
-	"path/filepath"
-	"sync"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/model"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("pipelineStages", func() {
-	seed := func(dir string, names ...string) *config.ModelConfigLoader {
-		for _, n := range names {
-			yaml := "name: " + n + "\nbackend: fake-backend\n"
-			Expect(os.WriteFile(filepath.Join(dir, n+".yaml"), []byte(yaml), 0o644)).To(Succeed())
-		}
-		cl := config.NewModelConfigLoader(dir)
-		Expect(cl.LoadModelConfigsFromPath(dir)).To(Succeed())
-		return cl
-	}
-
-	It("resolves only the populated stages, in load order", func() {
-		dir := GinkgoT().TempDir()
-		cl := seed(dir, "vad-m", "stt-m", "llm-m", "tts-m")
-
-		stages, err := pipelineStages(cl, &config.Pipeline{
-			VAD:           "vad-m",
-			Transcription: "stt-m",
-			LLM:           "llm-m",
-			TTS:           "tts-m",
-		}, dir)
-		Expect(err).ToNot(HaveOccurred())
-
-		roles := make([]string, len(stages))
-		names := make([]string, len(stages))
-		for i, s := range stages {
-			roles[i] = s.Role
-			names[i] = s.Cfg.Name
-		}
-		Expect(roles).To(Equal([]string{"vad", "transcription", "llm", "tts"}))
-		Expect(names).To(Equal([]string{"vad-m", "stt-m", "llm-m", "tts-m"}))
-	})
-
-	It("skips unset stages and includes sound_detection and voice_recognition when set", func() {
-		dir := GinkgoT().TempDir()
-		cl := seed(dir, "stt-m", "ced", "spk")
-
-		stages, err := pipelineStages(cl, &config.Pipeline{
-			Transcription:    "stt-m",
-			SoundDetection:   "ced",
-			VoiceRecognition: &config.PipelineVoiceRecognition{Model: "spk"},
-		}, dir)
-		Expect(err).ToNot(HaveOccurred())
-
-		roles := make([]string, len(stages))
-		for i, s := range stages {
-			roles[i] = s.Role
-		}
-		Expect(roles).To(ConsistOf("transcription", "sound_detection", "voice_recognition"))
-	})
-
-	It("returns nil for a pipeline with no stages (not a pipeline)", func() {
-		dir := GinkgoT().TempDir()
-		cl := seed(dir)
-
-		stages, err := pipelineStages(cl, &config.Pipeline{}, dir)
-		Expect(err).ToNot(HaveOccurred())
-		Expect(stages).To(BeNil())
-	})
-})
-
-var _ = Describe("PreloadStages", func() {
-	var (
-		mu   sync.Mutex
-		seen []string
-	)
-
-	// stubLoader swaps the loadStage seam for a recorder so no real backends
-	// are spawned; errFor injects per-model failures.
-	stubLoader := func(errFor map[string]error) {
-		loadStage = func(_ context.Context, _ *model.ModelLoader, cfg config.ModelConfig, _ *config.ApplicationConfig) error {
-			mu.Lock()
-			seen = append(seen, cfg.Name)
-			mu.Unlock()
-			return errFor[cfg.Name]
-		}
-	}
-
-	BeforeEach(func() {
-		seen = nil
-	})
-	AfterEach(func() {
-		loadStage = PreloadModel
-	})
-
-	mkStage := func(role, name string) PreloadStage {
-		return PreloadStage{Role: role, Cfg: &config.ModelConfig{Name: name}}
-	}
-
-	It("loads every present stage, skips absent (nil-config) ones, and returns the loaded names", func() {
-		stubLoader(nil)
-
-		loaded, err := PreloadStages(context.Background(), nil, nil, []PreloadStage{
-			mkStage("vad", "vad-m"),
-			{Role: "transcription"}, // absent stage
-			mkStage("llm", "llm-m"),
-		})
-
-		Expect(err).ToNot(HaveOccurred())
-		Expect(loaded).To(ConsistOf("vad-m", "llm-m"))
-		// Barrier: every stage has run by the time PreloadStages returns, so
-		// reading seen without the lock here is safe.
-		Expect(seen).To(ConsistOf("vad-m", "llm-m"))
-	})
-
-	It("reports a joined error naming each failed stage while still loading the rest", func() {
-		stubLoader(map[string]error{
-			"vad-m": errors.New("vad boom"),
-			"tts-m": errors.New("tts boom"),
-		})
-
-		loaded, err := PreloadStages(context.Background(), nil, nil, []PreloadStage{
-			mkStage("vad", "vad-m"),
-			mkStage("llm", "llm-m"),
-			mkStage("tts", "tts-m"),
-		})
-
-		// Every stage ran (a failure does not cancel the others)...
-		Expect(seen).To(ConsistOf("vad-m", "llm-m", "tts-m"))
-		// ...the stage that loaded fine is reported as loaded...
-		Expect(loaded).To(ConsistOf("llm-m"))
-		// ...and the joined error names every broken stage and its cause.
-		Expect(err).To(HaveOccurred())
-		Expect(err.Error()).To(ContainSubstring("vad (vad-m)"))
-		Expect(err.Error()).To(ContainSubstring("vad boom"))
-		Expect(err.Error()).To(ContainSubstring("tts (tts-m)"))
-		Expect(err.Error()).To(ContainSubstring("tts boom"))
-		Expect(err.Error()).ToNot(ContainSubstring("llm"))
-	})
-})
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -57,10 +57,14 @@ func (r *modelReranker) Rerank(ctx context.Context, query string, documents []st
 }

 func Rerank(ctx context.Context, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig) (*proto.RerankResult, error) {
-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	rerankModel, err := loader.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -45,10 +45,14 @@ func loadTranscriptionModel(ctx context.Context, ml *model.ModelLoader, modelCon
 	if modelConfig.Backend == "" {
 		modelConfig.Backend = model.WhisperBackend
 	}
-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	transcriptionModel, err := ml.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -50,10 +50,14 @@ func ModelTTS(
 	appConfig *config.ApplicationConfig,
 	modelConfig config.ModelConfig,
 ) (string, *proto.Result, error) {
-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	ttsModel, err := loader.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
@@ -153,7 +157,9 @@ func ModelTTSStream(
 	modelConfig config.ModelConfig,
 	audioCallback func([]byte) error,
 ) error {
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// See ModelTTS above: WithoutCancel decouples the load from request
+	// cancellation while preserving routing values (issue #10636).
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	ttsModel, err := loader.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
--- a/core/backend/vad.go
+++ b/core/backend/vad.go
@@ -14,10 +14,14 @@ func VAD(request *schema.VADRequest,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	modelConfig config.ModelConfig) (*schema.VADResponse, error) {
-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	vadModel, err := ml.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -67,6 +67,16 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
 		ApplyMTPDefaults(cfg, n)
 	}

+	// For sliding-window-attention models (Gemma 2/3, Cohere2, Llama 4, ...)
+	// llama.cpp defaults to a reduced SWA KV cache, which cannot reuse a prompt
+	// prefix across requests and so defeats the cross-request prefix cache
+	// (cache_reuse) we enable in serving_defaults.go. Enable the full SWA cache
+	// for these models so the prefix survives; skipped for dense models and
+	// when the user already pinned an SWA cache option.
+	if w, ok := HasSlidingWindowAttention(f); ok {
+		ApplySWAFullDefault(cfg, w)
+	}
+
 	// Thinking support detection is done after model load via DetectThinkingSupportFromBackend

 	// template estimations
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -599,13 +599,6 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "toggle",
 			Order:       89,
 		},
-		"pipeline.disable_warmup": {
-			Section:     "pipeline",
-			Label:       "Disable Warmup",
-			Description: "Turn off eager pre-loading of the pipeline's sub-models at realtime session start. By default LocalAI loads every configured sub-model backend (VAD, transcription, LLM, TTS, sound detection, voice recognition) before the session starts and blocks until they are ready, so the first turn pays no cold-start cost and a model that fails to load is reported at session start instead of mid-call. Enable this to restore the lazy 'load on first use' behavior — session start no longer waits on loading and load errors surface on the first turn instead. Useful to keep idle sessions from holding model memory they may never use.",
-			Component:   "toggle",
-			Order:       90,
-		},

 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/core/config/model_capabilities.go
+++ b/core/config/model_capabilities.go
@@ -1,197 +0,0 @@
-package config
-
-// This file is the single source of truth for deriving a model's user-facing
-// capabilities and input/output modalities from its ModelConfig. Both the
-// OpenAI-compatible /v1/models/capabilities endpoint and the Ollama-compatible
-// /api/tags|/api/show surface consume these, so the vocabulary stays consistent
-// across clients. Keep the detection heuristics here rather than duplicating
-// them per endpoint.
-
-// VisionSupported reports whether the model can accept image inputs.
-//
-// We deliberately avoid HasUsecases(FLAG_VISION): GuessUsecases has no
-// FLAG_VISION branch and reports true for any chat model, so it would paint
-// vision onto text-only models. Instead we look for explicit signals: the
-// declared KnownUsecases bit, a multimodal projector, or a template/backend
-// multimodal marker.
-func (c *ModelConfig) VisionSupported() bool {
-	if c.KnownUsecases != nil && (*c.KnownUsecases&FLAG_VISION) == FLAG_VISION {
-		return true
-	}
-	if c.MMProj != "" {
-		return true
-	}
-	if c.TemplateConfig.Multimodal != "" {
-		return true
-	}
-	if c.MediaMarker != "" {
-		return true
-	}
-	return false
-}
-
-// ToolSupported reports whether the model is wired up for tool / function
-// calling. We look for any of the explicit knobs LocalAI uses to drive
-// function-call extraction (regex match, response regex, grammar triggers, XML
-// format) or the auto-detected tool-format markers the llama.cpp backend
-// populates during model load.
-func (c *ModelConfig) ToolSupported() bool {
-	fc := c.FunctionsConfig
-	if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" {
-		return true
-	}
-	if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 {
-		return true
-	}
-	if fc.XMLFormatPreset != "" || fc.XMLFormat != nil {
-		return true
-	}
-	if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" {
-		return true
-	}
-	return false
-}
-
-// ThinkingSupported reports whether the model has reasoning / thinking enabled.
-// LocalAI sets DisableReasoning=false (or leaves thinking markers configured)
-// when the backend probe reports that the model supports thinking.
-func (c *ModelConfig) ThinkingSupported() bool {
-	rc := c.ReasoningConfig
-	if rc.DisableReasoning != nil && !*rc.DisableReasoning {
-		return true
-	}
-	if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 {
-		// Explicit thinking markers imply support unless explicitly disabled.
-		return rc.DisableReasoning == nil || !*rc.DisableReasoning
-	}
-	return false
-}
-
-// AudioInputSupported reports whether a chat/generation model accepts audio as
-// input (e.g. vLLM omni models). The signal is the vLLM per-prompt audio limit;
-// there is no FLAG_* for "chat model that hears audio", which is exactly why a
-// plain usecase list can't express it. Transcription models are handled
-// separately in InputModalities via FLAG_TRANSCRIPT.
-func (c *ModelConfig) AudioInputSupported() bool {
-	return c.LimitMMPerPrompt.LimitAudioPerPrompt > 0
-}
-
-// VideoInputSupported reports whether a chat/generation model accepts video as
-// input. The signal is the vLLM per-prompt video limit. Note this is distinct
-// from FLAG_VIDEO, which denotes video *generation* (diffusers) — an output
-// modality, not an input one.
-func (c *ModelConfig) VideoInputSupported() bool {
-	return c.LimitMMPerPrompt.LimitVideoPerPrompt > 0
-}
-
-// Capabilities returns the ordered list of capability strings the model
-// supports, using the canonical usecase vocabulary (chat, vision, transcript,
-// tts, embeddings, image, video, ...) plus the modifier capabilities "tools"
-// and "thinking". Vision is resolved via VisionSupported (not HasUsecases) to
-// avoid the guess-heuristic false positive.
-func (c *ModelConfig) Capabilities() []string {
-	chat := c.HasUsecases(FLAG_CHAT)
-	completion := c.HasUsecases(FLAG_COMPLETION)
-
-	var caps []string
-	add := func(cond bool, name string) {
-		if cond {
-			caps = append(caps, name)
-		}
-	}
-
-	add(chat, UsecaseChat)
-	add(completion, UsecaseCompletion)
-	add(c.HasUsecases(FLAG_EDIT), UsecaseEdit)
-	add(c.HasUsecases(FLAG_EMBEDDINGS), UsecaseEmbeddings)
-	add(c.HasUsecases(FLAG_RERANK), UsecaseRerank)
-	// Vision is only meaningful as an image-understanding modifier on a chat/
-	// completion model. Gating on (chat||completion) matches the Ollama surface
-	// and avoids a false positive when config defaults hydrate a MediaMarker on
-	// a non-chat model (e.g. a pure ASR/TTS backend).
-	add((chat || completion) && c.VisionSupported(), UsecaseVision)
-	// tools/thinking are modifiers on the chat/completion surface.
-	add((chat || completion) && c.ToolSupported(), "tools")
-	add((chat || completion) && c.ThinkingSupported(), "thinking")
-	add(c.HasUsecases(FLAG_TRANSCRIPT), UsecaseTranscript)
-	add(c.HasUsecases(FLAG_TTS), UsecaseTTS)
-	add(c.HasUsecases(FLAG_SOUND_GENERATION), UsecaseSoundGeneration)
-	add(c.HasUsecases(FLAG_IMAGE), UsecaseImage)
-	add(c.HasUsecases(FLAG_VIDEO), UsecaseVideo)
-	add(c.HasUsecases(FLAG_VAD), UsecaseVAD)
-	add(c.HasUsecases(FLAG_DETECTION), UsecaseDetection)
-	add(c.HasUsecases(FLAG_DEPTH), UsecaseDepth)
-	add(c.HasUsecases(FLAG_AUDIO_TRANSFORM), UsecaseAudioTransform)
-	add(c.HasUsecases(FLAG_DIARIZATION), UsecaseDiarization)
-	add(c.HasUsecases(FLAG_SOUND_CLASSIFICATION), UsecaseSoundClassification)
-	add(c.HasUsecases(FLAG_REALTIME_AUDIO), UsecaseRealtimeAudio)
-	add(c.HasUsecases(FLAG_FACE_RECOGNITION), UsecaseFaceRecognition)
-	add(c.HasUsecases(FLAG_SPEAKER_RECOGNITION), UsecaseSpeakerRecognition)
-	return caps
-}
-
-// InputModalities returns the set of modalities (text, image, audio, video) the
-// model accepts as input, ordered text→image→audio→video. This is what an
-// attachment router consults to decide whether an image/audio/video file can be
-// handed to the active model directly.
-func (c *ModelConfig) InputModalities() []string {
-	imageGen := c.HasUsecases(FLAG_IMAGE)
-	videoGen := c.HasUsecases(FLAG_VIDEO)
-	chatish := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION)
-
-	textIn := chatish || c.HasUsecases(FLAG_EDIT) ||
-		c.HasUsecases(FLAG_EMBEDDINGS) || c.HasUsecases(FLAG_RERANK) || c.HasUsecases(FLAG_TOKENIZE) ||
-		c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) || imageGen || videoGen
-
-	// Image input via a chat model requires vision (gated on chat, like the
-	// Ollama surface); detection/depth/face models consume images directly.
-	imageIn := (chatish && c.VisionSupported()) || c.LimitMMPerPrompt.LimitImagePerPrompt > 0 ||
-		c.HasUsecases(FLAG_DETECTION) || c.HasUsecases(FLAG_DEPTH) || c.HasUsecases(FLAG_FACE_RECOGNITION)
-
-	audioIn := c.AudioInputSupported() || c.HasUsecases(FLAG_TRANSCRIPT) || c.HasUsecases(FLAG_AUDIO_TRANSFORM) ||
-		c.HasUsecases(FLAG_REALTIME_AUDIO) || c.HasUsecases(FLAG_VAD) || c.HasUsecases(FLAG_DIARIZATION) ||
-		c.HasUsecases(FLAG_SOUND_CLASSIFICATION) || c.HasUsecases(FLAG_SPEAKER_RECOGNITION)
-
-	videoIn := c.VideoInputSupported()
-
-	var mods []string
-	if textIn {
-		mods = append(mods, "text")
-	}
-	if imageIn {
-		mods = append(mods, "image")
-	}
-	if audioIn {
-		mods = append(mods, "audio")
-	}
-	if videoIn {
-		mods = append(mods, "video")
-	}
-	return mods
-}
-
-// OutputModalities returns the set of modalities (text, image, audio, video)
-// the model produces, ordered text→image→audio→video.
-func (c *ModelConfig) OutputModalities() []string {
-	textOut := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION) || c.HasUsecases(FLAG_EDIT) ||
-		c.HasUsecases(FLAG_TRANSCRIPT)
-	imageOut := c.HasUsecases(FLAG_IMAGE)
-	audioOut := c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) ||
-		c.HasUsecases(FLAG_AUDIO_TRANSFORM) || c.HasUsecases(FLAG_REALTIME_AUDIO)
-	videoOut := c.HasUsecases(FLAG_VIDEO)
-
-	var mods []string
-	if textOut {
-		mods = append(mods, "text")
-	}
-	if imageOut {
-		mods = append(mods, "image")
-	}
-	if audioOut {
-		mods = append(mods, "audio")
-	}
-	if videoOut {
-		mods = append(mods, "video")
-	}
-	return mods
-}
--- a/core/config/model_capabilities_test.go
+++ b/core/config/model_capabilities_test.go
@@ -1,103 +0,0 @@
-package config
-
-import (
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func usecaseBits(flags ModelConfigUsecase) *ModelConfigUsecase {
-	return &flags
-}
-
-var _ = Describe("Model capabilities derivation", func() {
-	Describe("VisionSupported", func() {
-		It("is false for a plain text chat model", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
-			Expect(cfg.VisionSupported()).To(BeFalse())
-		})
-
-		It("is true when the FLAG_VISION bit is declared", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"}
-			Expect(cfg.VisionSupported()).To(BeTrue())
-		})
-
-		It("is true when an mmproj projector is set", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
-			cfg.MMProj = "mmproj.gguf" // promoted field from the embedded options struct
-			Expect(cfg.VisionSupported()).To(BeTrue())
-		})
-
-		It("does not fall for the GuessUsecases FLAG_VISION false positive", func() {
-			// A chat model with a chat template would make HasUsecases(FLAG_VISION)
-			// return true via the guess heuristic; VisionSupported must not.
-			cfg := &ModelConfig{Backend: "llama.cpp"}
-			cfg.TemplateConfig.Chat = "{{.Input}}"
-			Expect(cfg.VisionSupported()).To(BeFalse())
-		})
-	})
-
-	Describe("AudioInputSupported / VideoInputSupported", func() {
-		It("detects vLLM omni audio input via limit_mm_per_prompt", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
-			cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1
-			Expect(cfg.AudioInputSupported()).To(BeTrue())
-			Expect(cfg.VideoInputSupported()).To(BeFalse())
-		})
-
-		It("detects vLLM omni video input via limit_mm_per_prompt", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
-			cfg.LimitMMPerPrompt.LimitVideoPerPrompt = 2
-			Expect(cfg.VideoInputSupported()).To(BeTrue())
-		})
-	})
-
-	Describe("Capabilities + modalities", func() {
-		It("a text-only chat model exposes chat and text-only modalities", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
-			Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat))
-			Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseVision))
-			Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseTranscript))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
-		})
-
-		It("a vision chat model accepts text+image input", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"}
-			Expect(cfg.Capabilities()).To(ContainElements(UsecaseChat, UsecaseVision))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text", "image"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
-		})
-
-		It("an omni chat model accepts text+audio input without an audio capability flag", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
-			cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1
-			// audio-in is a modality, not a usecase string — this is exactly the
-			// case a plain capability list cannot express.
-			Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text", "audio"}))
-		})
-
-		It("a transcription model reads audio and writes text", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TRANSCRIPT), Backend: "parakeet-cpp"}
-			Expect(cfg.Capabilities()).To(Equal([]string{UsecaseTranscript}))
-			Expect(cfg.InputModalities()).To(Equal([]string{"audio"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
-		})
-
-		It("an image-generation model reads text and writes an image", func() {
-			// stablediffusion-ggml is image-only; plain "stablediffusion" is also
-			// in GuessUsecases' video-backend list, so it would report video too.
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_IMAGE), Backend: "stablediffusion-ggml"}
-			Expect(cfg.Capabilities()).To(Equal([]string{UsecaseImage}))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"image"}))
-		})
-
-		It("a TTS model reads text and writes audio", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TTS), Backend: "piper"}
-			Expect(cfg.Capabilities()).To(ContainElement(UsecaseTTS))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"audio"}))
-		})
-	})
-})
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -656,18 +656,6 @@ type Pipeline struct {
 	// to benefit. A client session.update still overrides type and eagerness
 	// per session; retranscribe is server-side only. Unset keeps server_vad.
 	TurnDetection PipelineTurnDetection `yaml:"turn_detection,omitempty" json:"turn_detection,omitempty"`
-
-	// DisableWarmup turns off eager pre-loading of the pipeline's sub-models at
-	// realtime session start. By default (false) LocalAI loads every configured
-	// sub-model backend (VAD, transcription, LLM, TTS, sound detection, voice
-	// recognition) into memory (concurrently) before the
-	// session is announced and blocks until they are ready, so the first turn
-	// pays no cold-start cost and a model that fails to load surfaces as an error
-	// at session start rather than mid-call. Set true to restore the lazy "load
-	// on first use" behavior — session start no longer blocks on loading and
-	// load errors surface on first use instead (e.g. to keep idle sessions from
-	// holding model memory they may never use).
-	DisableWarmup bool `yaml:"disable_warmup,omitempty" json:"disable_warmup,omitempty"`
 }

 // PipelineCompaction configures summarize-then-drop for a realtime pipeline.
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -155,25 +155,6 @@ func (bcl *ModelConfigLoader) LoadModelConfigFileByNameDefaultOptions(modelName
 		ModelPath(appConfig.SystemState.Model.ModelsPath))
 }

-// LoadResolvedModelConfig loads a model config by name and follows a single
-// alias hop, so a caller that references an alias (e.g. a pipeline with
-// `llm: default`) gets the alias target's full config (Backend, Model, ...)
-// rather than the alias stub with an empty Backend. Without this the alias
-// survives unresolved into model loading and fails downstream — notably in
-// distributed mode with "backend name is empty". Mirrors the top-level alias
-// resolution in core/http/middleware/request.go.
-func (bcl *ModelConfigLoader) LoadResolvedModelConfig(modelName, modelPath string) (*ModelConfig, error) {
-	cfg, err := bcl.LoadModelConfigFileByName(modelName, modelPath)
-	if err != nil {
-		return nil, err
-	}
-	resolved, _, err := bcl.ResolveAlias(cfg)
-	if err != nil {
-		return nil, err
-	}
-	return resolved, nil
-}
-
 // This format is currently only used when reading a single file at startup, passed in via ApplicationConfig.ConfigFile
 func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
 	bcl.Lock()
--- a/core/config/swa.go
+++ b/core/config/swa.go
@@ -0,0 +1,56 @@
+package config
+
+import (
+	gguf "github.com/gpustack/gguf-parser-go"
+	"github.com/mudler/xlog"
+)
+
+// swaCacheOptionNames lists the backend option keys that control the
+// sliding-window-attention KV cache. If the user pinned any of these we leave
+// the SWA cache alone instead of forcing swa_full.
+var swaCacheOptionNames = []string{"swa_full", "n_swa"}
+
+// HasSlidingWindowAttention reports whether the parsed GGUF describes a
+// sliding-window-attention (SWA) model — Gemma 2/3, Cohere2, Llama 4 and the
+// like. The gguf-parser library normalizes the per-architecture
+// `<arch>.attention.sliding_window` metadata key into
+// GGUFArchitecture.AttentionSlidingWindow, applying the same family-specific
+// rules llama.cpp uses (e.g. Phi-3 carries the key but does not actually run
+// SWA, and is normalized to 0). A non-zero window means the model interleaves
+// SWA layers, so the returned size is also the diagnostic value we log.
+func HasSlidingWindowAttention(f *gguf.GGUFFile) (uint64, bool) {
+	if f == nil {
+		return 0, false
+	}
+	w := f.Architecture().AttentionSlidingWindow
+	return w, w > 0
+}
+
+// ApplySWAFullDefault enables the full-size SWA KV cache (swa_full:true) for a
+// sliding-window model, unless the user already pinned an SWA cache option.
+//
+// Why: llama.cpp defaults to a reduced SWA KV cache sized to the sliding window
+// (memory-light), but that reduced cache cannot preserve a prompt prefix across
+// requests. So for SWA models the cross-request prefix cache we enable in
+// serving_defaults.go (cache_reuse) is silently defeated — every turn
+// reprocesses the entire prompt. Setting swa_full:true makes llama.cpp keep the
+// full KV cache so the shared prefix is actually reused.
+//
+// The tradeoff is memory: the full SWA cache scales with context_size, so this
+// is gated to models that are genuinely SWA (never applied to dense models,
+// where it would only waste memory) and never overrides an explicit user
+// choice. `slidingWindow` is the value read from the GGUF and is used only for
+// the diagnostic log line.
+func ApplySWAFullDefault(cfg *ModelConfig, slidingWindow uint64) {
+	if cfg == nil || slidingWindow == 0 {
+		return
+	}
+	if backendOptionSet(cfg.Options, swaCacheOptionNames...) {
+		xlog.Debug("[swa] sliding-window model but an SWA cache option is already set; leaving user choice intact",
+			"name", cfg.Name, "sliding_window", slidingWindow)
+		return
+	}
+	cfg.Options = append(cfg.Options, "swa_full:true")
+	xlog.Debug("[swa] enabling swa_full for sliding-window model so the cross-request prompt-prefix cache survives (reduced SWA cache cannot reuse a prefix across requests)",
+		"name", cfg.Name, "sliding_window", slidingWindow)
+}
--- a/core/config/swa_test.go
+++ b/core/config/swa_test.go
@@ -0,0 +1,120 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+
+	gguf "github.com/gpustack/gguf-parser-go"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// ggufWithSlidingWindow fabricates a minimal in-memory GGUF carrying the given
+// `general.architecture` and `<arch>.attention.sliding_window` so the SWA
+// detection can be exercised without a real model file. A window of 0 omits the
+// key, modelling a dense (non-SWA) model.
+func ggufWithSlidingWindow(arch string, window uint32) *gguf.GGUFFile {
+	kvs := gguf.GGUFMetadataKVs{
+		{
+			Key:       "general.architecture",
+			ValueType: gguf.GGUFMetadataValueTypeString,
+			Value:     arch,
+		},
+	}
+	if window > 0 {
+		kvs = append(kvs, gguf.GGUFMetadataKV{
+			Key:       arch + ".attention.sliding_window",
+			ValueType: gguf.GGUFMetadataValueTypeUint32,
+			Value:     window,
+		})
+	}
+	return &gguf.GGUFFile{
+		Header: gguf.GGUFHeader{MetadataKV: kvs},
+	}
+}
+
+var _ = Describe("SWA full-cache auto-default", func() {
+	Context("HasSlidingWindowAttention", func() {
+		It("returns false on a nil GGUF file", func() {
+			w, ok := HasSlidingWindowAttention(nil)
+			Expect(ok).To(BeFalse())
+			Expect(w).To(BeZero())
+		})
+
+		It("detects a sliding-window model (Gemma 3 style)", func() {
+			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma3", 1024))
+			Expect(ok).To(BeTrue())
+			Expect(w).To(Equal(uint64(1024)))
+		})
+
+		It("detects Gemma 2 even without an explicit key (family default window)", func() {
+			// gguf-parser applies llama.cpp's family rules: gemma2 defaults the
+			// sliding window to 4096 when the metadata key is absent.
+			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("gemma2", 0))
+			Expect(ok).To(BeTrue())
+			Expect(w).To(Equal(uint64(4096)))
+		})
+
+		It("reports a dense model as non-SWA", func() {
+			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("llama", 0))
+			Expect(ok).To(BeFalse())
+			Expect(w).To(BeZero())
+		})
+
+		It("treats Phi-3 as non-SWA even when the key is present", func() {
+			// Phi-3 carries attention.sliding_window but does not actually run
+			// SWA; gguf-parser normalizes it to 0 to match llama.cpp.
+			w, ok := HasSlidingWindowAttention(ggufWithSlidingWindow("phi3", 2048))
+			Expect(ok).To(BeFalse())
+			Expect(w).To(BeZero())
+		})
+	})
+
+	Context("ApplySWAFullDefault", func() {
+		It("enables swa_full for a sliding-window model when unset", func() {
+			cfg := &ModelConfig{Name: "gemma3"}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(ContainElement("swa_full:true"))
+		})
+
+		It("is a no-op for a dense model (window 0)", func() {
+			cfg := &ModelConfig{Name: "llama"}
+			ApplySWAFullDefault(cfg, 0)
+			Expect(cfg.Options).To(BeEmpty())
+		})
+
+		It("preserves an explicit swa_full:false", func() {
+			cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:false"}}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(Equal([]string{"swa_full:false"}))
+		})
+
+		It("preserves an explicit swa_full:true without duplicating it", func() {
+			cfg := &ModelConfig{Name: "gemma3", Options: []string{"swa_full:true"}}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(Equal([]string{"swa_full:true"}))
+		})
+
+		It("respects the n_swa alias", func() {
+			cfg := &ModelConfig{Name: "gemma3", Options: []string{"n_swa:512"}}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(Equal([]string{"n_swa:512"}))
+		})
+
+		It("preserves unrelated options already on the config", func() {
+			cfg := &ModelConfig{
+				Name:    "gemma3",
+				Options: []string{"use_jinja:true", "cache_reuse:256"},
+			}
+			ApplySWAFullDefault(cfg, 1024)
+			Expect(cfg.Options).To(Equal([]string{
+				"use_jinja:true",
+				"cache_reuse:256",
+				"swa_full:true",
+			}))
+		})
+
+		It("tolerates a nil config", func() {
+			Expect(func() { ApplySWAFullDefault(nil, 1024) }).ToNot(Panic())
+		})
+	})
+})
--- a/core/gallery/gallery.go
+++ b/core/gallery/gallery.go
@@ -15,35 +15,14 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/system"
-	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/LocalAI/pkg/xsync"
 	"github.com/mudler/xlog"

 	"gopkg.in/yaml.v3"
 )

-// validateGalleryConfigURL guards the gallery config fetch against SSRF. A
-// gallery config URL can be attacker-controlled (e.g. POST /models/apply with
-// an empty id fetches it directly), so a plain http(s) URL must not be allowed
-// to reach private, loopback, link-local or cloud-metadata addresses. Other
-// schemes (huggingface://, github:, oci://, ollama://, file://) resolve to
-// fixed public services or local files and are not a network-SSRF vector, so
-// they are left untouched.
-// See https://github.com/mudler/LocalAI/issues/10665
-func validateGalleryConfigURL(rawURL string) error {
-	lower := strings.ToLower(strings.TrimSpace(rawURL))
-	if strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") {
-		return utils.ValidateExternalURL(rawURL)
-	}
-	return nil
-}
-
 func GetGalleryConfigFromURL[T any](url string, basePath string) (T, error) {
 	var config T
-	if err := validateGalleryConfigURL(url); err != nil {
-		xlog.Error("refusing to fetch gallery config", "error", err, "url", url)
-		return config, err
-	}
 	uri := downloader.URI(url)
 	err := uri.ReadWithCallback(basePath, func(url string, d []byte) error {
 		return yaml.Unmarshal(d, &config)
@@ -57,10 +36,6 @@ func GetGalleryConfigFromURL[T any](url string, basePath string) (T, error) {

 func GetGalleryConfigFromURLWithContext[T any](ctx context.Context, url string, basePath string) (T, error) {
 	var config T
-	if err := validateGalleryConfigURL(url); err != nil {
-		xlog.Error("refusing to fetch gallery config", "error", err, "url", url)
-		return config, err
-	}
 	uri := downloader.URI(url)
 	err := uri.ReadWithAuthorizationAndCallback(ctx, basePath, "", func(url string, d []byte) error {
 		return yaml.Unmarshal(d, &config)
--- a/core/gallery/request_test.go
+++ b/core/gallery/request_test.go
@@ -1,10 +1,6 @@
 package gallery_test

 import (
-	"context"
-	"net/http"
-	"net/http/httptest"
-
 	. "github.com/mudler/LocalAI/core/gallery"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -23,49 +19,4 @@ var _ = Describe("Gallery API tests", func() {
 			Expect(e.Name).To(Equal("gpt4all-j"))
 		})
 	})
-
-	// SSRF guard: a user-supplied gallery config URL (e.g. POST /models/apply
-	// with an empty id) must not be able to reach internal network addresses.
-	// See https://github.com/mudler/LocalAI/issues/10665
-	Context("SSRF protection on config URLs", func() {
-		var server *httptest.Server
-
-		BeforeEach(func() {
-			// A reachable internal server that would happily serve a valid
-			// gallery config. Without the SSRF guard the fetch succeeds; the
-			// guard must block it before the request ever leaves the process.
-			server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				w.WriteHeader(http.StatusOK)
-				_, _ = w.Write([]byte("name: internal-ssrf\nfiles: []\n"))
-			}))
-		})
-
-		AfterEach(func() {
-			server.Close()
-		})
-
-		It("blocks fetching a config from a loopback address", func() {
-			_, err := GetGalleryConfigFromURL[ModelConfig](server.URL, "")
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("not allowed"))
-		})
-
-		It("blocks fetching a config from a loopback address (context variant)", func() {
-			_, err := GetGalleryConfigFromURLWithContext[ModelConfig](context.Background(), server.URL, "")
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("not allowed"))
-		})
-
-		It("blocks well-known internal hostnames and metadata endpoints", func() {
-			for _, u := range []string{
-				"http://localhost/secret",
-				"http://10.0.0.1/config.yaml",
-				"http://192.168.1.1/config.yaml",
-				"http://169.254.169.254/latest/meta-data/",
-			} {
-				_, err := GetGalleryConfigFromURL[ModelConfig](u, "")
-				Expect(err).To(HaveOccurred(), "expected %s to be rejected", u)
-			}
-		})
-	})
 })
--- a/core/http/endpoints/localai/backend_load.go
+++ b/core/http/endpoints/localai/backend_load.go
@@ -1,54 +0,0 @@
-package localai
-
-import (
-	"net/http"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/xlog"
-)
-
-// LoadModelEndpoint pre-loads a model into memory by name — the inverse of
-// /backend/shutdown. For a realtime pipeline model every configured sub-model
-// (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded; for a regular
-// model its own backend is loaded. The call blocks until loading finishes so
-// clients can drive warm-up explicitly and learn up front whether a model
-// fails to load.
-// @Summary Pre-load a model into memory
-// @Description Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown.
-// @Tags monitoring
-// @Accept json
-// @Produce json
-// @Param request body schema.ModelLoadRequest true "Model to load"
-// @Success 200 {object} schema.ModelLoadResponse "Model loaded"
-// @Failure 400 {object} schema.ModelLoadResponse "Missing model name"
-// @Failure 500 {object} schema.ModelLoadResponse "Load failed (Loaded lists any sub-models that did load)"
-// @Router /backend/load [post]
-func LoadModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
-	return func(c echo.Context) error {
-		input := new(schema.ModelLoadRequest)
-		if err := c.Bind(input); err != nil {
-			return err
-		}
-		if input.Model == "" {
-			return c.JSON(http.StatusBadRequest, schema.ModelLoadResponse{Message: "model is required"})
-		}
-
-		loaded, err := backend.PreloadModelByName(c.Request().Context(), cl, ml, appConfig, input.Model)
-		if err != nil {
-			xlog.Error("failed to pre-load model", "model", input.Model, "loaded", loaded, "error", err)
-			return c.JSON(http.StatusInternalServerError, schema.ModelLoadResponse{
-				Loaded:  loaded,
-				Message: "failed to load model: " + err.Error(),
-			})
-		}
-
-		return c.JSON(http.StatusOK, schema.ModelLoadResponse{
-			Loaded:  loaded,
-			Message: "model loaded",
-		})
-	}
-}
--- a/core/http/endpoints/localai/backend_load_test.go
+++ b/core/http/endpoints/localai/backend_load_test.go
@@ -1,102 +0,0 @@
-package localai_test
-
-import (
-	"bytes"
-	"encoding/json"
-	"net/http"
-	"net/http/httptest"
-	"os"
-	"path/filepath"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/config"
-	. "github.com/mudler/LocalAI/core/http/endpoints/localai"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("LoadModelEndpoint (/backend/load)", func() {
-	var (
-		app          *echo.Echo
-		tempDir      string
-		configLoader *config.ModelConfigLoader
-		modelLoader  *model.ModelLoader
-		appConfig    *config.ApplicationConfig
-	)
-
-	post := func(body string) *httptest.ResponseRecorder {
-		req := httptest.NewRequest(http.MethodPost, "/backend/load", bytes.NewBufferString(body))
-		req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
-		rec := httptest.NewRecorder()
-		app.ServeHTTP(rec, req)
-		return rec
-	}
-
-	decode := func(rec *httptest.ResponseRecorder) schema.ModelLoadResponse {
-		var resp schema.ModelLoadResponse
-		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
-		return resp
-	}
-
-	writeConfig := func(name, contents string) {
-		Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(contents), 0o600)).To(Succeed())
-	}
-
-	BeforeEach(func() {
-		var err error
-		tempDir, err = os.MkdirTemp("", "backend-load-test-*")
-		Expect(err).NotTo(HaveOccurred())
-
-		systemState, err := system.GetSystemState(system.WithModelPath(tempDir))
-		Expect(err).NotTo(HaveOccurred())
-
-		appConfig = config.NewApplicationConfig(config.WithSystemState(systemState))
-		configLoader = config.NewModelConfigLoader(tempDir)
-		modelLoader = model.NewModelLoader(systemState) // no backends installed
-
-		app = echo.New()
-		app.POST("/backend/load", LoadModelEndpoint(configLoader, modelLoader, appConfig))
-	})
-
-	AfterEach(func() {
-		_ = os.RemoveAll(tempDir)
-	})
-
-	It("rejects a request with no model name", func() {
-		rec := post(`{}`)
-		Expect(rec.Code).To(Equal(http.StatusBadRequest))
-		Expect(decode(rec).Message).To(ContainSubstring("model is required"))
-	})
-
-	It("reports a load failure for a regular model with nothing loaded", func() {
-		writeConfig("solo", "name: solo\n")
-
-		rec := post(`{"model":"solo"}`)
-		Expect(rec.Code).To(Equal(http.StatusInternalServerError))
-
-		resp := decode(rec)
-		Expect(resp.Loaded).To(BeEmpty())
-		Expect(resp.Message).To(ContainSubstring("failed to load model"))
-	})
-
-	It("expands a pipeline model and reports each sub-model that failed to load", func() {
-		writeConfig("voicebot", "name: voicebot\npipeline:\n  vad: vad-m\n  transcription: stt-m\n  llm: llm-m\n  tts: tts-m\n")
-		writeConfig("vad-m", "name: vad-m\n")
-		writeConfig("stt-m", "name: stt-m\n")
-		writeConfig("llm-m", "name: llm-m\n")
-		writeConfig("tts-m", "name: tts-m\n")
-
-		rec := post(`{"model":"voicebot"}`)
-		Expect(rec.Code).To(Equal(http.StatusInternalServerError))
-
-		resp := decode(rec)
-		Expect(resp.Message).To(ContainSubstring("failed to load model"))
-		// The pipeline stub itself is never loaded; its sub-models are what the
-		// endpoint tries, so the error names them rather than "voicebot".
-		Expect(resp.Message).To(ContainSubstring("vad-m"))
-		Expect(resp.Message).ToNot(ContainSubstring("voicebot"))
-	})
-})
--- a/core/http/endpoints/mcp/localai_assistant_test.go
+++ b/core/http/endpoints/mcp/localai_assistant_test.go
@@ -51,9 +51,6 @@ func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any)
 	return nil
 }
 func (stubClient) ReloadModels(_ context.Context) error { return nil }
-func (stubClient) LoadModel(_ context.Context, model string) ([]string, error) {
-	return []string{model}, nil
-}
 func (stubClient) SetAlias(_ context.Context, _, _ string) error {
 	return nil
 }
--- a/core/http/endpoints/ollama/capabilities.go
+++ b/core/http/endpoints/ollama/capabilities.go
@@ -49,23 +49,62 @@ func modelCapabilities(cfg *config.ModelConfig) []string {
 	return caps
 }

-// hasVisionSupport reports whether the model can accept image inputs.
-// The detection heuristic is the canonical config.ModelConfig.VisionSupported —
-// kept as a thin wrapper here so the Ollama capability mapping reads cleanly.
+// hasVisionSupport reports whether the model can accept image inputs. We avoid
+// cfg.HasUsecases(FLAG_VISION) because GuessUsecases has no FLAG_VISION case
+// and returns true for any chat model — see core/config/model_config.go. Instead
+// we look for explicit signals: KnownUsecases bit, multimodal projector, or
+// template/backend-reported multimodal markers.
 func hasVisionSupport(cfg *config.ModelConfig) bool {
-	return cfg.VisionSupported()
+	if cfg.KnownUsecases != nil && (*cfg.KnownUsecases&config.FLAG_VISION) == config.FLAG_VISION {
+		return true
+	}
+	if cfg.MMProj != "" {
+		return true
+	}
+	if cfg.TemplateConfig.Multimodal != "" {
+		return true
+	}
+	if cfg.MediaMarker != "" {
+		return true
+	}
+	return false
 }

-// hasToolSupport reports whether the model is wired up for tool / function
-// calling. Delegates to the canonical config.ModelConfig.ToolSupported.
+// hasToolSupport reports whether the model is wired up for tool / function calling.
+// We look for any of the explicit configuration knobs LocalAI uses to drive
+// function-call extraction (regex match, response regex, grammar triggers, XML
+// format) or for the auto-detected tool-format markers populated by the
+// llama.cpp backend during model load.
 func hasToolSupport(cfg *config.ModelConfig) bool {
-	return cfg.ToolSupported()
+	fc := cfg.FunctionsConfig
+	if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" {
+		return true
+	}
+	if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 {
+		return true
+	}
+	if fc.XMLFormatPreset != "" || fc.XMLFormat != nil {
+		return true
+	}
+	if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" {
+		return true
+	}
+	return false
 }

 // hasThinkingSupport reports whether the model has reasoning / thinking enabled.
-// Delegates to the canonical config.ModelConfig.ThinkingSupported.
+// LocalAI sets DisableReasoning=false (or leaves thinking markers configured)
+// when the backend probe reports that the model supports thinking.
 func hasThinkingSupport(cfg *config.ModelConfig) bool {
-	return cfg.ThinkingSupported()
+	rc := cfg.ReasoningConfig
+	if rc.DisableReasoning != nil && !*rc.DisableReasoning {
+		return true
+	}
+	if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 {
+		// Explicit thinking markers imply support unless explicitly disabled.
+		return rc.DisableReasoning == nil || !*rc.DisableReasoning
+	}
+	return false
 }

 // quantRegex matches GGUF-style quantization suffixes (Q4_K_M, Q8_0, IQ3_XS, F16, ...).
--- a/core/http/endpoints/openai/list.go
+++ b/core/http/endpoints/openai/list.go
@@ -21,11 +21,48 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap
 		authDB = db[0]
 	}
 	return func(c echo.Context) error {
-		modelNames, err := listVisibleModelNames(c, bcl, ml, authDB)
+		// If blank, no filter is applied.
+		filter := c.QueryParam("filter")
+
+		// By default, exclude any loose files that are already referenced by a configuration file.
+		var policy galleryop.LooseFilePolicy
+		excludeConfigured := c.QueryParam("excludeConfigured")
+		if excludeConfigured == "" || excludeConfigured == "true" {
+			policy = galleryop.SKIP_IF_CONFIGURED
+		} else {
+			policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
+		}
+
+		filterFn, err := config.BuildNameFilterFn(filter)
 		if err != nil {
 			return err
 		}

+		modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy)
+		if err != nil {
+			return err
+		}
+
+		// Filter models by user's allowlist if auth is enabled
+		if authDB != nil {
+			if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin {
+				perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID)
+				if err == nil && perm.AllowedModels.Enabled {
+					allowed := map[string]bool{}
+					for _, m := range perm.AllowedModels.Models {
+						allowed[m] = true
+					}
+					filtered := make([]string, 0, len(modelNames))
+					for _, m := range modelNames {
+						if allowed[m] {
+							filtered = append(filtered, m)
+						}
+					}
+					modelNames = filtered
+				}
+			}
+		}
+
 		// Map from a slice of names to a slice of OpenAIModel response objects
 		dataModels := []schema.OpenAIModel{}
 		for _, m := range modelNames {
@@ -38,53 +75,3 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap
 		})
 	}
 }
-
-// listVisibleModelNames resolves the model names visible to the caller, applying
-// the same query filters (filter, excludeConfigured) and per-user allowlist as
-// the OpenAI models listing. Shared by ListModelsEndpoint and
-// ListModelCapabilitiesEndpoint so both stay consistent.
-func listVisibleModelNames(c echo.Context, bcl *config.ModelConfigLoader, ml *model.ModelLoader, authDB *gorm.DB) ([]string, error) {
-	// If blank, no filter is applied.
-	filter := c.QueryParam("filter")
-
-	// By default, exclude any loose files that are already referenced by a configuration file.
-	var policy galleryop.LooseFilePolicy
-	excludeConfigured := c.QueryParam("excludeConfigured")
-	if excludeConfigured == "" || excludeConfigured == "true" {
-		policy = galleryop.SKIP_IF_CONFIGURED
-	} else {
-		policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
-	}
-
-	filterFn, err := config.BuildNameFilterFn(filter)
-	if err != nil {
-		return nil, err
-	}
-
-	modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy)
-	if err != nil {
-		return nil, err
-	}
-
-	// Filter models by user's allowlist if auth is enabled
-	if authDB != nil {
-		if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin {
-			perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID)
-			if err == nil && perm.AllowedModels.Enabled {
-				allowed := map[string]bool{}
-				for _, m := range perm.AllowedModels.Models {
-					allowed[m] = true
-				}
-				filtered := make([]string, 0, len(modelNames))
-				for _, m := range modelNames {
-					if allowed[m] {
-						filtered = append(filtered, m)
-					}
-				}
-				modelNames = filtered
-			}
-		}
-	}
-
-	return modelNames, nil
-}
--- a/core/http/endpoints/openai/list_capabilities.go
+++ b/core/http/endpoints/openai/list_capabilities.go
@@ -1,50 +0,0 @@
-package openai
-
-import (
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-	model "github.com/mudler/LocalAI/pkg/model"
-	"gorm.io/gorm"
-)
-
-// ListModelCapabilitiesEndpoint is a LocalAI-specific extension of the OpenAI
-// models listing. It returns the same set of models as /v1/models but enriches
-// each entry with the capabilities and input/output modalities the model
-// supports, so clients can decide whether an image/audio/video attachment can be
-// handed to a given model directly (or must be converted/transcribed first).
-//
-// It is purely additive: clients that don't know about it keep using /v1/models
-// and see no change.
-// @Summary List available models enriched with capabilities and input/output modalities.
-// @Tags models
-// @Success 200 {object} schema.ModelCapabilitiesResponse "Response"
-// @Router /v1/models/capabilities [get]
-func ListModelCapabilitiesEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, db ...*gorm.DB) echo.HandlerFunc {
-	var authDB *gorm.DB
-	if len(db) > 0 {
-		authDB = db[0]
-	}
-	return func(c echo.Context) error {
-		modelNames, err := listVisibleModelNames(c, bcl, ml, authDB)
-		if err != nil {
-			return err
-		}
-
-		dataModels := []schema.ModelCapabilities{}
-		for _, m := range modelNames {
-			entry := schema.ModelCapabilities{ID: m, Object: "model"}
-			if cfg, ok := bcl.GetModelConfig(m); ok {
-				entry.Capabilities = cfg.Capabilities()
-				entry.InputModalities = cfg.InputModalities()
-				entry.OutputModalities = cfg.OutputModalities()
-			}
-			dataModels = append(dataModels, entry)
-		}
-
-		return c.JSON(200, schema.ModelCapabilitiesResponse{
-			Object: "list",
-			Data:   dataModels,
-		})
-	}
-}
--- a/core/http/endpoints/openai/list_capabilities_test.go
+++ b/core/http/endpoints/openai/list_capabilities_test.go
@@ -1,119 +0,0 @@
-package openai
-
-import (
-	"encoding/json"
-	"net/http"
-	"net/http/httptest"
-	"os"
-	"path/filepath"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("ListModelCapabilitiesEndpoint", func() {
-	var (
-		e       *echo.Echo
-		tmpDir  string
-		bcl     *config.ModelConfigLoader
-		ml      *model.ModelLoader
-		appConf *config.ApplicationConfig
-	)
-
-	BeforeEach(func() {
-		var err error
-		e = echo.New()
-		tmpDir, err = os.MkdirTemp("", "models-caps-test-*")
-		Expect(err).NotTo(HaveOccurred())
-
-		st, err := system.GetSystemState(system.WithModelPath(tmpDir))
-		Expect(err).NotTo(HaveOccurred())
-		ml = model.NewModelLoader(st)
-		bcl = config.NewModelConfigLoader(tmpDir)
-		appConf = config.NewApplicationConfig()
-	})
-
-	AfterEach(func() {
-		_ = os.RemoveAll(tmpDir)
-	})
-
-	writeConfig := func(name, yaml string) {
-		path := filepath.Join(tmpDir, name+".yaml")
-		Expect(os.WriteFile(path, []byte(yaml), 0o644)).To(Succeed())
-		Expect(bcl.ReadModelConfig(path)).To(Succeed())
-	}
-
-	// call exercises the endpoint with auth disabled (no auth DB), which is the
-	// standard deployment path. The per-user allowlist branch is shared verbatim
-	// with ListModelsEndpoint (listVisibleModelNames) and covered there.
-	call := func() schema.ModelCapabilitiesResponse {
-		req := httptest.NewRequest(http.MethodGet, "/v1/models/capabilities", nil)
-		rec := httptest.NewRecorder()
-		c := e.NewContext(req, rec)
-
-		handler := ListModelCapabilitiesEndpoint(bcl, ml, appConf)
-		Expect(handler(c)).To(Succeed())
-		Expect(rec.Code).To(Equal(http.StatusOK))
-
-		var resp schema.ModelCapabilitiesResponse
-		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
-		return resp
-	}
-
-	entryFor := func(resp schema.ModelCapabilitiesResponse, id string) *schema.ModelCapabilities {
-		for i := range resp.Data {
-			if resp.Data[i].ID == id {
-				return &resp.Data[i]
-			}
-		}
-		return nil
-	}
-
-	It("returns the list envelope even with no models", func() {
-		resp := call()
-		Expect(resp.Object).To(Equal("list"))
-	})
-
-	It("enriches a vision chat model with capabilities and image input modality", func() {
-		writeConfig("vlm", `
-name: vlm
-backend: llama-cpp
-known_usecases:
-  - FLAG_CHAT
-  - FLAG_VISION
-template:
-  chat: "{{ .Input }}"
-parameters:
-  model: qwen2.5-vl-Q4_K_M.gguf
-`)
-		entry := entryFor(call(), "vlm")
-		Expect(entry).NotTo(BeNil())
-		Expect(entry.Object).To(Equal("model"))
-		Expect(entry.Capabilities).To(ContainElements("chat", "vision"))
-		Expect(entry.InputModalities).To(ContainElements("text", "image"))
-		Expect(entry.OutputModalities).To(ContainElement("text"))
-	})
-
-	It("marks a parakeet model as an audio-in/text-out transcription model", func() {
-		writeConfig("parakeet", `
-name: parakeet
-backend: parakeet-cpp
-known_usecases:
-  - FLAG_TRANSCRIPT
-parameters:
-  model: parakeet-tdt-0.6b
-`)
-		entry := entryFor(call(), "parakeet")
-		Expect(entry).NotTo(BeNil())
-		Expect(entry.Capabilities).To(ContainElement("transcript"))
-		Expect(entry.InputModalities).To(Equal([]string{"audio"}))
-		Expect(entry.OutputModalities).To(Equal([]string{"text"}))
-		Expect(entry.Capabilities).NotTo(ContainElement("chat"))
-	})
-})
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -7,7 +7,6 @@ import (
 	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -267,12 +266,6 @@ type Model interface {
 	// grpcerrors.IsLiveTranscriptionUnsupported.
 	TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error)
 	PredictConfig() *config.ModelConfig
-	// Warmup eagerly loads the pipeline's sub-model backends into memory so the
-	// first realtime turn doesn't pay each backend's cold-start load cost. Loads
-	// run concurrently; Warmup blocks until they all finish and returns a joined
-	// error naming every stage that failed to load (nil if all succeeded), so a
-	// caller can surface model-load failures at session start instead of mid-call.
-	Warmup(ctx context.Context) error
 }

 var upgrader = websocket.Upgrader{
@@ -590,8 +583,18 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.ModelInterface = m

-	// The voice gate is built before the warm-up below so its
-	// speaker-recognition model can warm alongside the pipeline stages.
+	if session.SummaryModel != "" {
+		summaryModelName := session.SummaryModel
+		sid := sessionID
+		session.summarizerFactory = func() (Model, error) {
+			summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig())
+			if lerr != nil {
+				return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr)
+			}
+			return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid))
+		}
+	}
+
 	if cfg.Pipeline.VoiceGateEnabled() {
 		gate, gerr := newVoiceGate(
 			*cfg.Pipeline.VoiceRecognition,
@@ -609,47 +612,6 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		xlog.Info("realtime voice recognition gate enabled", "mode", gate.cfg.Mode, "when", gate.cfg.When)
 	}

-	// Warm the pipeline's sub-model backends before announcing the session.
-	// Loads run concurrently but we block here until they all finish, so a model
-	// that fails to load (missing weights, bad backend, OOM) surfaces as an error
-	// at session start rather than stalling — or failing — mid-call on the first
-	// turn (VAD on the first audio chunk, STT at end-of-speech, LLM on the first
-	// reply, TTS on the first spoken output). On success the backends are already
-	// resident, so the first turn pays no cold-start cost. Opt out per pipeline
-	// with `pipeline.disable_warmup: true` to restore lazy load-on-first-use
-	// (errors then surface on first use instead of at session start).
-	if !cfg.Pipeline.DisableWarmup {
-		warmErr := make(chan error, 1)
-		go func() { warmErr <- m.Warmup(context.Background()) }()
-		// The voice-gate model warms concurrently with the pipeline stages: an
-		// enforced gate blocks each utterance on speaker resolution, so its
-		// cold-start would otherwise land on the first turn too. (Compaction's
-		// summary_model stays lazy — it only runs off the response path.)
-		var gateErr error
-		if session.voiceGate != nil {
-			_, gateErr = backend.PreloadStages(context.Background(), application.ModelLoader(), application.ApplicationConfig(), []backend.PreloadStage{
-				{Role: "voice_recognition", Cfg: session.voiceGate.recCfg},
-			})
-		}
-		if err := errors.Join(<-warmErr, gateErr); err != nil {
-			xlog.Error("realtime warmup failed", "model", model, "error", err)
-			sendError(t, "model_load_error", "Failed to load pipeline models: "+err.Error(), "", "")
-			return
-		}
-	}
-
-	if session.SummaryModel != "" {
-		summaryModelName := session.SummaryModel
-		sid := sessionID
-		session.summarizerFactory = func() (Model, error) {
-			summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig())
-			if lerr != nil {
-				return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr)
-			}
-			return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid))
-		}
-	}
-
 	// Store the session and notify the transport (for WebRTC audio track handling)
 	sessionLock.Lock()
 	sessions[sessionID] = session
@@ -1163,21 +1125,6 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 			return err
 		}
 		session.ModelInterface = m
-		// A session.update that swaps the model/voice rebuilds the pipeline, so
-		// warm the new backends too (unless opted out) — otherwise the next turn
-		// pays the cold-start load the original session warm-up already avoided.
-		// Unlike session start this stays non-blocking: updateSession runs under
-		// the global sessionLock, so blocking on a multi-second load here would
-		// stall every other session. Load errors are logged (and still surface on
-		// first use); per-stage failures are already warned inside
-		// backend.PreloadStages.
-		if !session.ModelConfig.Pipeline.DisableWarmup {
-			go func() {
-				if err := m.Warmup(context.Background()); err != nil {
-					xlog.Error("realtime warmup failed after session.update", "error", err)
-				}
-			}()
-		}
 	}

 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.TurnDetectionSet {
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -174,8 +174,6 @@ func (m *fakeModel) TranscribeLive(_ context.Context, _ string, onEvent func(bac

 func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg }

-func (m *fakeModel) Warmup(ctx context.Context) error { return nil }
-
 // fakeLiveSession records what semantic_vad fed and closed; closeEvents are
 // replayed through onEvent during Close, mimicking the backend's finalize
 // flush (trailing delta + Final) landing before Close returns.
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -110,15 +110,6 @@ func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
 	return nil
 }

-func (m *transcriptOnlyModel) Warmup(ctx context.Context) error {
-	_, err := backend.PreloadStages(ctx, m.modelLoader, m.appConfig, []backend.PreloadStage{
-		{Role: "vad", Cfg: m.VADConfig},
-		{Role: "transcription", Cfg: m.TranscriptionConfig},
-		{Role: "sound_detection", Cfg: m.SoundDetectionConfig},
-	})
-	return err
-}
-
 func (m *wrappedModel) VAD(ctx context.Context, request *schema.VADRequest) (*schema.VADResponse, error) {
 	return backend.VAD(request, ctx, m.modelLoader, m.appConfig, *m.VADConfig)
 }
@@ -369,17 +360,6 @@ func (m *wrappedModel) PredictConfig() *config.ModelConfig {
 	return m.LLMConfig
 }

-func (m *wrappedModel) Warmup(ctx context.Context) error {
-	_, err := backend.PreloadStages(ctx, m.modelLoader, m.appConfig, []backend.PreloadStage{
-		{Role: "vad", Cfg: m.VADConfig},
-		{Role: "transcription", Cfg: m.TranscriptionConfig},
-		{Role: "llm", Cfg: m.LLMConfig},
-		{Role: "tts", Cfg: m.TTSConfig},
-		{Role: "sound_detection", Cfg: m.SoundDetectionConfig},
-	})
-	return err
-}
-
 // wavStreamHeaderBytes is the size of the WAV header that backend.ModelTTSStream
 // emits as its first audio callback; the sample rate lives at byte offset 24.
 const wavStreamHeaderBytes = 44
@@ -460,7 +440,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 	if pipeline.SoundDetection == "" {
 		return nil, nil
 	}
-	cfg, err := cl.LoadResolvedModelConfig(pipeline.SoundDetection, ml.ModelPath)
+	cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load sound detection config: %w", err)
 	}
@@ -471,7 +451,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 }

 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
-	cfgVAD, err := cl.LoadResolvedModelConfig(pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
 	if err != nil {

 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -481,7 +461,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		return nil, nil, fmt.Errorf("failed to validate config: %w", err)
 	}

-	cfgSST, err := cl.LoadResolvedModelConfig(pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
 	if err != nil {

 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -570,11 +550,30 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
 	}
 }

+// loadPipelineSubModel loads the pipeline sub-model config called name and
+// follows a single alias hop via cl.ResolveAlias, so a pipeline that references
+// an alias (e.g. `llm: default`) gets the alias target's full config (Backend,
+// Model, ...) rather than the alias stub with an empty Backend, which fails
+// downstream — notably in distributed mode with "backend name is empty".
+// Mirrors core/http/middleware/request.go. On a load or resolve error it returns
+// nil and that error unwrapped; ResolveAlias's second result is discarded.
+func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) {
+	cfg, err := cl.LoadModelConfigFileByName(name, modelPath)
+	if err != nil {
+		return nil, err
+	}
+	resolved, _, err := cl.ResolveAlias(cfg)
+	if err != nil {
+		return nil, err
+	}
+	return resolved, nil
+}
+
 // returns and loads either a wrapped model or a model that support audio-to-audio
 func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) {
 	xlog.Debug("Creating new model pipeline model", "pipeline", pipeline)

-	cfgVAD, err := cl.LoadResolvedModelConfig(pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -585,7 +584,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	}

 	// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
-	cfgSST, err := cl.LoadResolvedModelConfig(pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -617,7 +616,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	xlog.Debug("Loading a wrapped model")

 	// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
-	cfgLLM, err := cl.LoadResolvedModelConfig(pipeline.LLM, ml.ModelPath)
+	cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -632,7 +631,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	applyPipelineReasoning(cfgLLM, *pipeline)
 	applyPipelineThinking(cfgLLM, *pipeline)

-	cfgTTS, err := cl.LoadResolvedModelConfig(pipeline.TTS, ml.ModelPath)
+	cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
--- a/core/http/endpoints/openai/realtime_model_alias_test.go
+++ b/core/http/endpoints/openai/realtime_model_alias_test.go
@@ -1,4 +1,4 @@
-package config_test
+package openai

 import (
 	"os"
@@ -10,14 +10,14 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 )

-// LoadResolvedModelConfig must resolve a model that references an alias
-// (e.g. a pipeline with `llm: default`) one hop to the alias target's full
-// config — so the effective backend is the target's backend, not the empty
-// backend of the alias stub. This mirrors the top-level alias resolution done
-// in core/http/middleware/request.go, which the realtime pipeline previously
+// loadPipelineSubModel must resolve a pipeline sub-model that references an
+// alias (e.g. `llm: default`) one hop to the alias target's full config — so
+// the effective backend is the target's backend, not the empty backend of the
+// alias stub. This mirrors the top-level alias resolution done in
+// core/http/middleware/request.go, which the realtime pipeline previously
 // skipped (failing in distributed mode with "backend name is empty").
-var _ = Describe("LoadResolvedModelConfig", func() {
-	It("resolves an alias one hop to the target's config", func() {
+var _ = Describe("loadPipelineSubModel", func() {
+	It("resolves a sub-model alias one hop to the target's config", func() {
 		tmpDir := GinkgoT().TempDir()

 		// A real model config with a concrete backend.
@@ -38,13 +38,13 @@ alias: real-llm
 		Expect(cl.LoadModelConfigsFromPath(tmpDir)).To(Succeed())

 		// Resolving the alias must follow the hop to the target's full config.
-		resolved, err := cl.LoadResolvedModelConfig("default", tmpDir)
+		resolved, err := loadPipelineSubModel(cl, "default", tmpDir)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(resolved.IsAlias()).To(BeFalse())
 		Expect(resolved.Backend).To(Equal("llama-cpp"))

 		// A non-alias name must load unchanged.
-		direct, err := cl.LoadResolvedModelConfig("real-llm", tmpDir)
+		direct, err := loadPipelineSubModel(cl, "real-llm", tmpDir)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(direct.Backend).To(Equal("llama-cpp"))
 		Expect(direct.Name).To(Equal("real-llm"))
--- a/core/http/endpoints/openai/realtime_voicegate.go
+++ b/core/http/endpoints/openai/realtime_voicegate.go
@@ -21,7 +21,6 @@ type namedEmbedding struct {
 // drive the realtime pipeline.
 type voiceGate struct {
 	cfg       config.PipelineVoiceRecognition // normalized
-	recCfg    *config.ModelConfig             // resolved speaker-recognition model, for warm-up
 	registry  voicerecognition.Registry       // identify mode (nil otherwise)
 	refEmbeds []namedEmbedding                // verify mode, pre-embedded refs
 	refAudios []config.VoiceReference         // verify + anti-spoofing: ref paths
@@ -73,9 +72,7 @@ func newVoiceGate(
 		return nil, err
 	}

-	// Resolved like every other pipeline sub-model (one alias hop), so an
-	// aliased voice_recognition model gets its target's backend.
-	recCfg, err := cl.LoadResolvedModelConfig(cfg.Model, ml.ModelPath)
+	recCfg, err := cl.LoadModelConfigFileByName(cfg.Model, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("voice_recognition: failed to load model %q: %w", cfg.Model, err)
 	}
@@ -85,7 +82,6 @@ func newVoiceGate(

 	g := &voiceGate{
 		cfg:      cfg,
-		recCfg:   recCfg,
 		registry: registry,
 		embedFn: func(ctx context.Context, wavPath string) ([]float32, error) {
 			res, err := backend.VoiceEmbed(ctx, wavPath, ml, appConfig, *recCfg)
--- a/core/http/endpoints/openai/realtime_warmup_test.go
+++ b/core/http/endpoints/openai/realtime_warmup_test.go
@@ -1,64 +0,0 @@
-package openai
-
-import (
-	"context"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-)
-
-// Warmup delegates to backend.PreloadStages (its concurrency, nil-skipping and
-// error-joining semantics are pinned in core/backend). These specs pin the
-// wiring instead: each realtime model type must warm exactly its configured
-// stages under the right pipeline-role labels. No backends are installed, so
-// every attempted stage fails to load — the joined error is the proof of which
-// stages were attempted and how they were labeled.
-var _ = Describe("realtime model Warmup wiring", func() {
-	newLoader := func() (*model.ModelLoader, *config.ApplicationConfig) {
-		systemState, err := system.GetSystemState(system.WithModelPath(GinkgoT().TempDir()))
-		Expect(err).ToNot(HaveOccurred())
-		appConfig := config.NewApplicationConfig(config.WithSystemState(systemState))
-		return model.NewModelLoader(systemState), appConfig
-	}
-
-	It("wrappedModel warms every configured stage under its pipeline role", func() {
-		ml, appConfig := newLoader()
-		m := &wrappedModel{
-			VADConfig:            &config.ModelConfig{Name: "vad-m"},
-			TranscriptionConfig:  &config.ModelConfig{Name: "stt-m"},
-			LLMConfig:            &config.ModelConfig{Name: "llm-m"},
-			TTSConfig:            &config.ModelConfig{Name: "tts-m"},
-			SoundDetectionConfig: &config.ModelConfig{Name: "ced-m"},
-			modelLoader:          ml,
-			appConfig:            appConfig,
-		}
-
-		err := m.Warmup(context.Background())
-		Expect(err).To(HaveOccurred())
-		for _, stage := range []string{"vad (vad-m)", "transcription (stt-m)", "llm (llm-m)", "tts (tts-m)", "sound_detection (ced-m)"} {
-			Expect(err.Error()).To(ContainSubstring(stage))
-		}
-	})
-
-	It("transcriptOnlyModel warms its stages and skips absent ones", func() {
-		ml, appConfig := newLoader()
-		m := &transcriptOnlyModel{
-			VADConfig:           &config.ModelConfig{Name: "vad-m"},
-			TranscriptionConfig: &config.ModelConfig{Name: "stt-m"},
-			// SoundDetectionConfig nil: an absent stage must be skipped, not
-			// fail the warm-up.
-			modelLoader: ml,
-			appConfig:   appConfig,
-		}
-
-		err := m.Warmup(context.Background())
-		Expect(err).To(HaveOccurred())
-		Expect(err.Error()).To(ContainSubstring("vad (vad-m)"))
-		Expect(err.Error()).To(ContainSubstring("transcription (stt-m)"))
-		Expect(err.Error()).ToNot(ContainSubstring("sound_detection"))
-	})
-})
--- a/core/http/openresponses_test.go
+++ b/core/http/openresponses_test.go
@@ -7,7 +7,6 @@ import (
 	"io"
 	"net/http"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"

@@ -30,8 +29,6 @@ const testModel = "Qwen3-VL-2B-Instruct-Q4_K_M"

 var _ = Describe("Open Responses API", func() {
 	var app *echo.Echo
-	var localApp *application.Application
-	var localModelDir string
 	var c context.Context
 	var cancel context.CancelFunc

@@ -41,47 +38,28 @@ var _ = Describe("Open Responses API", func() {

 	Context("API with ephemeral models", func() {
 		BeforeEach(func(sc SpecContext) {
-			// This suite exercises the /v1/responses HTTP/protocol contract
-			// (Content-Type, SSE framing, response envelope, error shapes),
-			// not real inference — so it runs against the same prebuilt
-			// mock-backend the rest of the http suite uses instead of
-			// downloading a real model. Skip cleanly when it isn't built.
-			if mockBackendPath == "" {
-				Skip("mock-backend binary not built; run 'make build-mock-backend'")
-			}
-
 			var err error

+			backendPath := os.Getenv("BACKENDS_PATH")
+
 			c, cancel = context.WithCancel(context.Background())

-			// Isolated model dir carrying a single config named after testModel
-			// but served by the mock backend, so the responses endpoint can
-			// resolve and load the model without any real backend build.
-			localModelDir, err = os.MkdirTemp("", "openresponses-models-")
-			Expect(err).ToNot(HaveOccurred())
-
-			mockModelYAML := "name: " + testModel + "\n" +
-				"backend: mock-backend\n" +
-				"parameters:\n" +
-				"  model: mock-model.bin\n"
-			Expect(os.WriteFile(filepath.Join(localModelDir, testModel+".yaml"), []byte(mockModelYAML), 0644)).To(Succeed())
-
 			systemState, err := system.GetSystemState(
-				system.WithBackendPath(backendDir),
-				system.WithModelPath(localModelDir),
+				system.WithBackendPath(backendPath),
+				system.WithModelPath(modelDir),
 			)
 			Expect(err).ToNot(HaveOccurred())

-			localApp, err = application.New(
+			application, err := application.New(
 				append(commonOpts,
 					config.WithContext(c),
 					config.WithSystemState(systemState),
 					config.WithApiKeys([]string{apiKey}),
+					config.WithModelsURL("https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-GGUF"),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
-			localApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)

-			app, err = API(localApp)
+			app, err = API(application)
 			Expect(err).ToNot(HaveOccurred())

 			go func() {
@@ -102,24 +80,14 @@ var _ = Describe("Open Responses API", func() {
 		})

 		AfterEach(func(sc SpecContext) {
-			// Synchronous app shutdown first — context-cancel cleanup is async
-			// and races test-binary exit, orphaning mock-backend children.
-			if localApp != nil {
-				_ = localApp.Shutdown()
-				localApp = nil
-			}
 			cancel()
 			if app != nil {
 				ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 				defer cancel()
 				err := app.Shutdown(ctx)
 				Expect(err).ToNot(HaveOccurred())
-				app = nil
-			}
-			if localModelDir != "" {
-				_ = os.RemoveAll(localModelDir)
-				localModelDir = ""
 			}
+
 		})

 		Context("HTTP Protocol Compliance", func() {
@@ -1001,16 +969,13 @@ var _ = Describe("Open Responses API", func() {
 				Expect(ok).To(BeTrue())
 				Expect(itemID).ToNot(BeEmpty())

-				// Now create a new response with item_reference. Per the OpenAI
-				// Responses spec (and this server's parser in
-				// endpoints/openresponses/responses.go) an item_reference carries
-				// the referenced item in the "id" field, not "item_id".
+				// Now create a new response with item_reference
 				reqBody2 := map[string]any{
 					"model": testModel,
 					"input": []any{
 						map[string]any{
-							"type": "item_reference",
-							"id":   itemID,
+							"type":    "item_reference",
+							"item_id": itemID,
 						},
 						map[string]any{
 							"type":    "message",
@@ -1040,8 +1005,8 @@ var _ = Describe("Open Responses API", func() {
 					"model": testModel,
 					"input": []any{
 						map[string]any{
-							"type": "item_reference",
-							"id":   "nonexistent_item_id",
+							"type":    "item_reference",
+							"item_id": "nonexistent_item_id",
 						},
 					},
 				}
--- a/core/http/react-ui/e2e/forking-chat.spec.js
+++ b/core/http/react-ui/e2e/forking-chat.spec.js
@@ -1,133 +0,0 @@
-import { test, expect } from './coverage-fixtures.js'
-
-// Seeds two-message chat into localStorage so we don't need a live model.
-async function seedChat(page, history) {
-  await page.addInitScript((h) => {
-    const chat = {
-      id: 'seed1', name: 'Seeded Chat', model: 'test-model',
-      history: h, systemPrompt: '', mcpMode: false, mcpServers: [],
-      clientMCPServers: [], temperature: null, topP: null, topK: null,
-      tokenUsage: { prompt: 0, completion: 0, total: 0 },
-      contextSize: null, createdAt: Date.now(), updatedAt: Date.now(),
-    }
-    localStorage.setItem('localai_chats_data', JSON.stringify({
-      chats: [chat], activeChatId: 'seed1', lastSaved: Date.now(),
-    }))
-  }, history)
-}
-
-async function mockModels(page) {
-  await page.route('**/api/models/capabilities', (route) => route.fulfill({
-    contentType: 'application/json',
-    body: JSON.stringify({ data: [{ id: 'test-model', capabilities: ['FLAG_CHAT'] }] }),
-  }))
-  await page.route('**/api/operations', (route) => route.fulfill({
-    contentType: 'application/json', body: JSON.stringify({ operations: [] }),
-  }))
-}
-
-const TWO_TURNS = [
-  { role: 'user', content: 'first question' },
-  { role: 'assistant', content: 'first answer' },
-  { role: 'user', content: 'second question' },
-  { role: 'assistant', content: 'second answer' },
-]
-
-test('duplicate creates an independent copy and switches to it', async ({ page }) => {
-  await mockModels(page)
-  await seedChat(page, TWO_TURNS)
-  await page.goto('/app/chat')
-
-  // Open the chats menu (Ctrl/Cmd+K) and duplicate the seeded chat.
-  // Wait for the menu trigger to mount so its global keydown listener is armed
-  // before we dispatch the shortcut.
-  await page.getByTitle('Conversations (Ctrl/Cmd+K)').waitFor()
-  await page.keyboard.press('Control+k')
-  await page.getByTitle('Duplicate chat').first().click()
-
-  // A new active chat named "Seeded Chat (fork)" with the same 4 messages.
-  await expect(page.locator('.chat-header-title')).toHaveText('Seeded Chat (fork)')
-  await expect(page.locator('.chat-message-user')).toHaveCount(2)
-  await expect(page.locator('.chat-message-assistant')).toHaveCount(2)
-})
-
-async function mockCompletion(page, replyText) {
-  await page.route('**/v1/chat/completions', (route) => {
-    const sse =
-      `data: ${JSON.stringify({ choices: [{ delta: { content: replyText } }] })}\n\n` +
-      `data: ${JSON.stringify({ choices: [{ delta: {}, finish_reason: 'stop' }], usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 } })}\n\n` +
-      `data: [DONE]\n\n`
-    route.fulfill({ status: 200, contentType: 'text/event-stream', body: sse })
-  })
-}
-
-test('retry regenerates the first answer and drops the later turn', async ({ page }) => {
-  await mockModels(page)
-  // Capture the outbound request body so we can assert the model receives the
-  // truncated history (not the stale downstream turns).
-  let sentMessages = null
-  await page.route('**/v1/chat/completions', (route) => {
-    sentMessages = route.request().postDataJSON()?.messages || []
-    const sse =
-      `data: ${JSON.stringify({ choices: [{ delta: { content: 'REGENERATED first answer' } }] })}\n\n` +
-      `data: ${JSON.stringify({ choices: [{ delta: {}, finish_reason: 'stop' }], usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 } })}\n\n` +
-      `data: [DONE]\n\n`
-    route.fulfill({ status: 200, contentType: 'text/event-stream', body: sse })
-  })
-  await seedChat(page, TWO_TURNS)
-  await page.goto('/app/chat')
-
-  // Hover the FIRST assistant message and click its retry button.
-  const firstAssistant = page.locator('.chat-message-assistant').first()
-  await firstAssistant.hover()
-  await firstAssistant.getByTitle('Regenerate').click()
-
-  // History is truncated to the first user turn, then the new answer streams in;
-  // the second Q/A turn is gone.
-  await expect(page.locator('.chat-message-assistant')).toContainText(['REGENERATED first answer'])
-  await expect(page.locator('.chat-message-user')).toHaveCount(1)
-  await expect(page.locator('.chat-message-assistant')).toHaveCount(1)
-
-  // The OUTBOUND payload must also be truncated: the resent user turn is present,
-  // but the downstream turn and the stale first answer must be gone.
-  const contents = (sentMessages || []).map(m =>
-    typeof m.content === 'string' ? m.content : JSON.stringify(m.content)
-  )
-  expect(contents.join('\n')).toContain('first question')
-  expect(contents.join('\n')).not.toContain('second question')
-  expect(contents.join('\n')).not.toContain('first answer')
-})
-
-test('copy chat puts the whole conversation on the clipboard', async ({ page, context }) => {
-  await context.grantPermissions(['clipboard-read', 'clipboard-write'])
-  await mockModels(page)
-  await seedChat(page, TWO_TURNS)
-  await page.goto('/app/chat')
-
-  // Wait for the menu trigger to mount so its global keydown listener is armed
-  // before we dispatch the shortcut (same mount-race guard as the duplicate test).
-  await page.getByTitle('Conversations (Ctrl/Cmd+K)').waitFor()
-  await page.keyboard.press('Control+k')
-  await page.getByTitle('Copy chat').first().click()
-
-  const clip = await page.evaluate(() => navigator.clipboard.readText())
-  expect(clip).toContain('# Seeded Chat')
-  expect(clip).toContain('first answer')
-  expect(clip).toContain('second answer')
-})
-
-test('branch from the first answer forks history up to that point', async ({ page }) => {
-  await mockModels(page)
-  await seedChat(page, TWO_TURNS)
-  await page.goto('/app/chat')
-
-  const firstAssistant = page.locator('.chat-message-assistant').first()
-  await firstAssistant.hover()
-  await firstAssistant.getByTitle('Branch from here').click()
-
-  // New active chat "Seeded Chat (fork)" contains only the first Q/A turn.
-  await expect(page.locator('.chat-header-title')).toHaveText('Seeded Chat (fork)')
-  await expect(page.locator('.chat-message-user')).toHaveCount(1)
-  await expect(page.locator('.chat-message-assistant')).toHaveCount(1)
-  await expect(page.locator('.chat-message-assistant')).toContainText(['first answer'])
-})
--- a/core/http/react-ui/public/locales/en/chat.json
+++ b/core/http/react-ui/public/locales/en/chat.json
@@ -72,7 +72,6 @@
  "actions": {
    "copy": "Copy",
    "regenerate": "Regenerate",
-    "branch": "Branch from here",
    "jumpToLatest": "Jump to latest"
  },
  "streaming": {
@@ -101,9 +100,7 @@
  "toasts": {
    "selectModel": "Please select a model",
    "copied": "Copied to clipboard",
-    "copyFailed": "Could not copy to clipboard",
-    "chatCopied": "Chat copied to clipboard",
-    "forked": "Created a new chat"
+    "copyFailed": "Could not copy to clipboard"
  },
  "menu": {
    "trigger": "Chats",
@@ -113,8 +110,6 @@
    "noMatch": "No conversations match your search",
    "noConversations": "No conversations yet",
    "rename": "Rename",
-    "duplicate": "Duplicate chat",
-    "copyChat": "Copy chat",
    "exportMarkdown": "Export as Markdown",
    "deleteChat": "Delete chat",
    "newChat": "New chat",
--- a/core/http/react-ui/src/components/ChatsMenu.jsx
+++ b/core/http/react-ui/src/components/ChatsMenu.jsx
@@ -24,8 +24,6 @@ const ChatsMenu = forwardRef(function ChatsMenu({
  onDeleteAll,
  onRename,
  onExport,
-  onCopyChat,
-  onDuplicate,
 }, ref) {
  const { t } = useTranslation('chat')
  const [open, setOpen] = useState(false)
@@ -232,24 +230,6 @@ const ChatsMenu = forwardRef(function ChatsMenu({
                  >
                    <i className="fas fa-pen" />
                  </button>
-                  {onDuplicate && (
-                    <button
-                      type="button"
-                      onClick={(e) => { e.stopPropagation(); onDuplicate(chat); setOpen(false) }}
-                      title={t('menu.duplicate')}
-                    >
-                      <i className="fas fa-clone" />
-                    </button>
-                  )}
-                  {(chat.history?.length || 0) > 0 && onCopyChat && (
-                    <button
-                      type="button"
-                      onClick={(e) => { e.stopPropagation(); onCopyChat(chat) }}
-                      title={t('menu.copyChat')}
-                    >
-                      <i className="fas fa-clipboard" />
-                    </button>
-                  )}
                  {(chat.history?.length || 0) > 0 && onExport && (
                    <button
                      type="button"
--- a/core/http/react-ui/src/hooks/useChat.js
+++ b/core/http/react-ui/src/hooks/useChat.js
@@ -141,24 +141,6 @@ export function useChat(initialModel = '') {
    return chat
  }, [])

-  const forkChat = useCallback((chatId, uptoIndex) => {
-    const src = chats.find(c => c.id === chatId)
-    if (!src) return null
-    const end = typeof uptoIndex === 'number' ? uptoIndex : src.history.length
-    const forked = {
-      ...src,
-      id: generateId(),
-      name: `${src.name} (fork)`,
-      history: structuredClone(src.history.slice(0, end)),
-      tokenUsage: { prompt: 0, completion: 0, total: 0 },
-      createdAt: Date.now(),
-      updatedAt: Date.now(),
-    }
-    setChats(prev => [forked, ...prev])
-    setActiveChatId(forked.id)
-    return forked
-  }, [chats])
-
  const switchChat = useCallback((chatId) => {
    setActiveChatId(chatId)
    setStreamingContent('')
@@ -278,12 +260,8 @@ export function useChat(initialModel = '') {
    if (chat?.systemPrompt) {
      messages.push({ role: 'system', content: chat.systemPrompt })
    }
-    // Filter out thinking/reasoning/tool_call/tool_result messages.
-    // options.baseHistory lets callers (e.g. mid-conversation retry) pass the
-    // intended truncated history synchronously; the closure `chat` still holds
-    // the stale pre-truncation state because setChats only schedules an update.
-    const baseHistory = options.baseHistory || chat?.history || []
-    const historyForApi = baseHistory.filter(m =>
+    // Filter out thinking/reasoning/tool_call/tool_result messages
+    const historyForApi = (chat?.history || []).filter(m =>
      m.role !== 'thinking' && m.role !== 'reasoning' && m.role !== 'tool_call' && m.role !== 'tool_result'
    )
    messages.push(...historyForApi, { role: 'user', content: messageContent })
@@ -815,7 +793,6 @@ export function useChat(initialModel = '') {
    tokensPerSecond,
    maxTokensPerSecond,
    addChat,
-    forkChat,
    switchChat,
    deleteChat,
    deleteAllChats,
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -33,7 +33,7 @@ function getLastMessagePreview(chat) {
  return ''
 }

-function serializeChatAsMarkdown(chat) {
+function exportChatAsMarkdown(chat) {
  let md = `# ${chat.name}\n\n`
  md += `Model: ${chat.model || 'Unknown'}\n`
  md += `Date: ${new Date(chat.createdAt).toLocaleString()}\n\n---\n\n`
@@ -47,11 +47,7 @@ function serializeChatAsMarkdown(chat) {
      md += `<details><summary>Thinking</summary>\n\n${msg.content}\n\n</details>\n\n`
    }
  }
-  return md
-}
-
-function downloadChatAsMarkdown(chat) {
-  const blob = new Blob([serializeChatAsMarkdown(chat)], { type: 'text/markdown' })
+  const blob = new Blob([md], { type: 'text/markdown' })
  const url = URL.createObjectURL(blob)
  const a = document.createElement('a')
  a.href = url
@@ -298,7 +294,7 @@ export default function Chat() {
  const {
    chats, activeChat, activeChatId, isStreaming, streamingChatId, streamingContent,
    streamingReasoning, streamingToolCalls, tokensPerSecond, maxTokensPerSecond,
-    addChat, forkChat, switchChat, deleteChat, deleteAllChats, renameChat, updateChatSettings,
+    addChat, switchChat, deleteChat, deleteAllChats, renameChat, updateChatSettings,
    sendMessage, stopGeneration, clearHistory, getContextUsagePercent, addMessage,
  } = useChat(urlModel || '')

@@ -799,27 +795,34 @@ export default function Chat() {
    await sendMessage(msg, files, mcpOptions)
  }, [input, files, activeChat, sendMessage, addToast, getToolsForLLM, isClientTool, executeTool, hasAppUI, getAppResource, getToolDefinition])

-  const handleRegenerate = useCallback(async (targetIndex) => {
+  const handleRegenerate = useCallback(async () => {
    if (!activeChat || isStreaming) return
    const history = activeChat.history
-    const end = typeof targetIndex === 'number' ? targetIndex : history.length
-    // Nearest user message at or before the target answer.
-    let userIdx = -1
-    for (let i = Math.min(end, history.length) - 1; i >= 0; i--) {
-      if (history[i].role === 'user') { userIdx = i; break }
+    let lastUserMsg = null
+    let lastUserFiles = null
+    for (let i = history.length - 1; i >= 0; i--) {
+      if (history[i].role === 'user') {
+        lastUserMsg = typeof history[i].content === 'string' ? history[i].content : history[i].content?.[0]?.text || ''
+        lastUserFiles = history[i].files || []
+        break
+      }
    }
-    if (userIdx === -1) return
-    const userMsg = typeof history[userIdx].content === 'string'
-      ? history[userIdx].content
-      : history[userIdx].content?.[0]?.text || ''
-    const userFiles = history[userIdx].files || []
-    // Drop the user turn and everything after it; sendMessage re-appends it.
-    // Thread the truncated history through explicitly: updateChatSettings only
-    // schedules a state update, so sendMessage's closure would otherwise read
-    // the stale pre-truncation history for the outbound API payload.
-    const baseHistory = history.slice(0, userIdx)
-    updateChatSettings(activeChat.id, { history: baseHistory })
-    await sendMessage(userMsg, userFiles, { baseHistory })
+    if (!lastUserMsg) return
+
+    // Remove everything after and including the last user message
+    const newHistory = []
+    let foundLastUser = false
+    for (let i = history.length - 1; i >= 0; i--) {
+      if (!foundLastUser && history[i].role === 'user') {
+        foundLastUser = true
+        continue
+      }
+      if (foundLastUser) {
+        newHistory.unshift(history[i])
+      }
+    }
+    updateChatSettings(activeChat.id, { history: newHistory })
+    await sendMessage(lastUserMsg, lastUserFiles)
  }, [activeChat, isStreaming, sendMessage, updateChatSettings])

  const handleKeyDown = (e) => {
@@ -849,11 +852,6 @@ export default function Chat() {
    }
  }

-  const copyChatAsMarkdown = async (chat) => {
-    const ok = await copyToClipboard(serializeChatAsMarkdown(chat))
-    addToast(ok ? t('toasts.chatCopied') : t('toasts.copyFailed'), ok ? 'success' : 'error', ok ? 2000 : 3000)
-  }
-
  const contextPercent = getContextUsagePercent()

  // Recent chats for the empty state — exclude the current chat and any
@@ -894,9 +892,7 @@ export default function Chat() {
            onDelete={deleteChat}
            onDeleteAll={promptDeleteAll}
            onRename={renameChat}
-            onExport={(chat) => downloadChatAsMarkdown(chat)}
-            onCopyChat={(chat) => copyChatAsMarkdown(chat)}
-            onDuplicate={(chat) => { if (forkChat(chat.id)) addToast(t('toasts.forked'), 'success', 2000) }}
+            onExport={(chat) => exportChatAsMarkdown(chat)}
          />
          {activeChat.localaiAssistant && (
            <span
@@ -1188,19 +1184,11 @@ export default function Chat() {
                      <button onClick={() => copyMessage(msg.content)} title={t('actions.copy')}>
                        <i className="fas fa-copy" />
                      </button>
-                      {msg.role === 'assistant' && !isStreaming && (
-                        <button onClick={() => handleRegenerate(i)} title={t('actions.regenerate')}>
+                      {msg.role === 'assistant' && i === activeChat.history.length - 1 && !isStreaming && (
+                        <button onClick={handleRegenerate} title={t('actions.regenerate')}>
                          <i className="fas fa-rotate" />
                        </button>
                      )}
-                      {msg.role === 'assistant' && !isStreaming && (
-                        <button
-                          onClick={() => { forkChat(activeChat.id, i + 1); addToast(t('toasts.forked'), 'success', 2000) }}
-                          title={t('actions.branch')}
-                        >
-                          <i className="fas fa-code-branch" />
-                        </button>
-                      )}
                    </div>
                  </div>
                </div>
--- a/core/http/react-ui/src/pages/Manage.jsx
+++ b/core/http/react-ui/src/pages/Manage.jsx
@@ -146,7 +146,6 @@ export default function Manage() {
  const [distributedMode, setDistributedMode] = useState(false)
  const [togglingModels, setTogglingModels] = useState(new Set())
  const [pinningModels, setPinningModels] = useState(new Set())
-  const [loadingModels, setLoadingModels] = useState(new Set())
  // Expanded row state — keyed by `${tab}:${id}` so switching tabs doesn't
  // collide and a single row is open at a time per tab.
  const [expandedKey, setExpandedKey] = useState(null)
@@ -314,26 +313,6 @@ export default function Manage() {
    })
  }

-  // Pre-load a model (or all of a realtime pipeline's sub-models) into memory.
-  // The /backend/load call blocks until loading finishes, so the menu item shows
-  // a loading state while in flight and reports the outcome on completion.
-  const handleLoadModel = async (modelName) => {
-    setLoadingModels(prev => new Set(prev).add(modelName))
-    try {
-      await backendControlApi.load({ model: modelName })
-      addToast(`Loaded ${modelName}`, 'success')
-      setTimeout(fetchLoadedModels, 500)
-    } catch (err) {
-      addToast(`Failed to load: ${err.message}`, 'error')
-    } finally {
-      setLoadingModels(prev => {
-        const next = new Set(prev)
-        next.delete(modelName)
-        return next
-      })
-    }
-  }
-
  const handleDeleteModel = (modelName) => {
    setConfirmDialog({
      title: 'Delete Model',
@@ -708,11 +687,6 @@ export default function Manage() {
                              label: model.disabled ? 'Enable model' : 'Disable model',
                              onClick: () => handleToggleModel(model.id, model.disabled),
                              disabled: togglingModels.has(model.id) },
-                            { key: 'load', icon: 'fa-bolt',
-                              label: loadingModels.has(model.id) ? 'Loading…' : 'Load into memory',
-                              onClick: () => handleLoadModel(model.id),
-                              hidden: isRunning || !!model.disabled,
-                              disabled: loadingModels.has(model.id) },
                            { key: 'stop', icon: 'fa-stop', label: 'Stop model',
                              onClick: () => handleStopModel(model.id), hidden: !isRunning },
                            { key: 'pin', icon: 'fa-thumbtack',
--- a/core/http/react-ui/src/utils/api.js
+++ b/core/http/react-ui/src/utils/api.js
@@ -352,9 +352,6 @@ export const realtimeApi = {
 // Backend control
 export const backendControlApi = {
  shutdown: (body) => postJSON(API_CONFIG.endpoints.backendShutdown, body),
-  // Pre-load a model (or all of a realtime pipeline's sub-models) into memory.
-  // body: { model: "<name>" }. Inverse of shutdown.
-  load: (body) => postJSON(API_CONFIG.endpoints.backendLoad, body),
 }

 // System info
--- a/core/http/react-ui/src/utils/config.js
+++ b/core/http/react-ui/src/utils/config.js
@@ -106,7 +106,6 @@ export const API_CONFIG = {
    video: '/video',
    backendMonitor: '/backend/monitor',
    backendShutdown: '/backend/shutdown',
-    backendLoad: '/backend/load',
    modelsApply: '/models/apply',
    modelsDelete: (name) => `/models/delete/${name}`,
    modelsAvailable: '/models/available',
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -207,14 +207,9 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 	backendMonitorService := monitoring.NewBackendMonitorService(ml, cl, appConfig) // Split out for now
 	router.GET("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService), adminMiddleware)
 	router.POST("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService), adminMiddleware)
-	// /backend/load is the inverse of /backend/shutdown: pre-load a model (or all
-	// of a realtime pipeline's sub-models) into memory so clients can drive
-	// warm-up explicitly instead of paying the cold-start cost on first use.
-	router.POST("/backend/load", localai.LoadModelEndpoint(cl, ml, appConfig), adminMiddleware)
 	// The v1/* urls are exactly the same as above - makes local e2e testing easier if they are registered.
 	router.GET("/v1/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService), adminMiddleware)
 	router.POST("/v1/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService), adminMiddleware)
-	router.POST("/v1/backend/load", localai.LoadModelEndpoint(cl, ml, appConfig), adminMiddleware)

 	// Traces and backend logs (monitoring)
 	router.GET("/api/traces", localai.GetAPITracesEndpoint(), adminMiddleware)
@@ -250,7 +245,6 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 			"metrics":              "/metrics",
 			"backend_monitor":      "/backend/monitor",
 			"backend_shutdown":     "/backend/shutdown",
-			"backend_load":         "/backend/load",
 			"system":               "/system",
 			"version":              "/version",
 			"traces":               "/api/traces",
@@ -272,27 +266,25 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 			"version": internal.PrintableVersion(),
 			// Flat endpoint list for backwards compatibility
 			"endpoints": map[string]any{
-				"models":              "/v1/models",
-				"models_capabilities": "/v1/models/capabilities",
-				"chat_completions":    "/v1/chat/completions",
-				"completions":         "/v1/completions",
-				"embeddings":          "/v1/embeddings",
-				"config_metadata":     "/api/models/config-metadata",
-				"config_json":         "/api/models/config-json/:name",
-				"config_patch":        "/api/models/config-json/:name",
-				"autocomplete":        "/api/models/config-metadata/autocomplete/:provider",
-				"vram_estimate":       "/api/models/vram-estimate",
-				"tts":                 "/tts",
-				"transcription":       "/v1/audio/transcriptions",
-				"image_generation":    "/v1/images/generations",
-				"swagger":             "/swagger/index.html",
-				"instructions":        "/api/instructions",
+				"models":           "/v1/models",
+				"chat_completions": "/v1/chat/completions",
+				"completions":      "/v1/completions",
+				"embeddings":       "/v1/embeddings",
+				"config_metadata":  "/api/models/config-metadata",
+				"config_json":      "/api/models/config-json/:name",
+				"config_patch":     "/api/models/config-json/:name",
+				"autocomplete":     "/api/models/config-metadata/autocomplete/:provider",
+				"vram_estimate":    "/api/models/vram-estimate",
+				"tts":              "/tts",
+				"transcription":    "/v1/audio/transcriptions",
+				"image_generation": "/v1/images/generations",
+				"swagger":          "/swagger/index.html",
+				"instructions":     "/api/instructions",
 			},
 			// Categorized endpoint groups for structured discovery
 			"endpoint_groups": map[string]any{
 				"openai_compatible": map[string]string{
 					"models":               "/v1/models",
-					"models_capabilities":  "/v1/models/capabilities",
 					"chat_completions":     "/v1/chat/completions",
 					"completions":          "/v1/completions",
 					"embeddings":           "/v1/embeddings",
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -257,10 +257,4 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 	// List models
 	app.GET("/v1/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()))
 	app.GET("/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()))
-
-	// List models enriched with capabilities + input/output modalities
-	// (LocalAI-specific, additive superset of /v1/models).
-	capabilitiesHandler := openai.ListModelCapabilitiesEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB())
-	app.GET("/v1/models/capabilities", capabilitiesHandler)
-	app.GET("/models/capabilities", capabilitiesHandler)
 }
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -11,24 +11,6 @@ type BackendMonitorRequest struct {
 	BasicModelRequest
 }

-// ModelLoadRequest asks LocalAI to pre-load a model into memory by name, so the
-// first request that uses it pays no cold-start load cost. For a realtime
-// pipeline model, every configured sub-model (VAD, transcription, LLM, TTS,
-// sound_detection, voice_recognition) is loaded instead of the pipeline stub.
-// It is the inverse of the /backend/shutdown request.
-type ModelLoadRequest struct {
-	BasicModelRequest
-}
-
-// ModelLoadResponse reports the outcome of a /backend/load call.
-type ModelLoadResponse struct {
-	// Loaded lists the model names actually resident in memory after the call.
-	// For a pipeline model these are its sub-models, not the pipeline name.
-	Loaded []string `json:"loaded"`
-	// Message is a short human-readable status ("model loaded", or an error).
-	Message string `json:"message"`
-}
-
 type TokenMetricsRequest struct {
 	BasicModelRequest
 }
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -251,27 +251,3 @@ type ModelsDataResponse struct {
 	Object string        `json:"object"`
 	Data   []OpenAIModel `json:"data"`
 }
-
-// ModelCapabilities is a strict superset of OpenAIModel that additionally
-// describes what a model can do and which modalities it accepts/produces. It is
-// served by the LocalAI-specific /v1/models/capabilities endpoint so clients can
-// route attachments (image/audio/video) to a model only when it can handle them.
-type ModelCapabilities struct {
-	ID     string `json:"id"`
-	Object string `json:"object"`
-	// Capabilities are canonical usecase strings (e.g. chat, vision, transcript,
-	// tts, embeddings, image, video) plus the modifiers "tools" and "thinking".
-	Capabilities []string `json:"capabilities"`
-	// InputModalities is the subset of {text,image,audio,video} the model accepts.
-	InputModalities []string `json:"input_modalities"`
-	// OutputModalities is the subset of {text,image,audio,video} the model produces.
-	OutputModalities []string `json:"output_modalities"`
-}
-
-// ModelCapabilitiesResponse is the envelope returned by /v1/models/capabilities.
-// It mirrors ModelsDataResponse so a client can treat it as an enriched
-// drop-in for /v1/models.
-type ModelCapabilitiesResponse struct {
-	Object string              `json:"object"`
-	Data   []ModelCapabilities `json:"data"`
-}
--- a/core/services/routing/pii/metrics.go
+++ b/core/services/routing/pii/metrics.go
@@ -1,48 +0,0 @@
-package pii
-
-import (
-	"context"
-	"sync"
-
-	"go.opentelemetry.io/otel"
-	"go.opentelemetry.io/otel/attribute"
-	"go.opentelemetry.io/otel/metric"
-)
-
-// Prometheus counter for PII events. The EventStore ring buffer is
-// capacity-bound and meant for recent-audit browsing; operators also want
-// a monotonic, scrape-friendly signal ("how many detections/blocks per
-// hour, did the filter stop firing after a deploy"). Record() is the
-// single choke point every producer already goes through (request
-// middleware, response scrubbing, MITM proxy connects/intercepts), so one
-// counter here covers all paths without touching the producers.
-//
-// Initialised lazily on first Record so the package works no matter when
-// (or whether) the Prometheus-backed global MeterProvider is installed —
-// same pattern as core/services/routing/billing.
-var (
-	metricsOnce   sync.Once
-	eventsCounter metric.Int64Counter
-)
-
-func recordEventMetric(e PIIEvent) {
-	metricsOnce.Do(func() {
-		meter := otel.Meter("github.com/mudler/LocalAI")
-		c, err := meter.Int64Counter(
-			"localai_pii_events_total",
-			metric.WithDescription("PII/audit events recorded, labeled by kind, origin, action and direction"),
-		)
-		if err == nil {
-			eventsCounter = c
-		}
-	})
-	if eventsCounter == nil {
-		return
-	}
-	eventsCounter.Add(context.Background(), 1, metric.WithAttributes(
-		attribute.String("kind", string(e.Kind)),
-		attribute.String("origin", string(e.Origin)),
-		attribute.String("action", string(e.Action)),
-		attribute.String("direction", string(e.Direction)),
-	))
-}
--- a/core/services/routing/pii/store.go
+++ b/core/services/routing/pii/store.go
@@ -58,7 +58,6 @@ type memoryEventStore struct {
 }

 func (s *memoryEventStore) Record(_ context.Context, e PIIEvent) error {
-	recordEventMetric(e)
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	s.ring[s.cursor] = e
--- a/core/trace/audio_snippet.go
+++ b/core/trace/audio_snippet.go
@@ -14,16 +14,6 @@ import (
 // MaxSnippetSeconds is the maximum number of seconds of audio captured per trace.
 const MaxSnippetSeconds = 30

-// silenceFloorDBFS is the dBFS value reported for digital silence (RMS or peak
-// of zero). The true level is -∞ dBFS; reporting a finite floor keeps the
-// metric present and meaningful in the Traces UI (a scrubbed nil would read as
-// "missing" rather than "silent"). -120 dBFS sits well below 16-bit PCM's
-// ~-90 dBFS least-significant-bit floor, so it reads unambiguously as
-// "effectively silent". JSON-marshal safety for any non-finite float that does
-// reach a trace is owned centrally by RecordBackendTrace's sanitizer — this
-// floor is about presentation, not transport.
-const silenceFloorDBFS = -120.0
-
 // AudioSnippet captures the first MaxSnippetSeconds of a WAV file and computes
 // quality metrics. The result is a map suitable for merging into a BackendTrace
 // Data field. maxBytes caps the embedded base64 waveform so a single TTS or
@@ -73,7 +63,7 @@ func AudioSnippetFromPCM(pcm []byte, sampleRate, totalPCMBytes, maxBytes int) ma
 	snippetDuration := float64(len(samples)) / float64(sampleRate)

 	rms := sound.CalculateRMS16(samples)
-	rmsDBFS := silenceFloorDBFS
+	rmsDBFS := -math.Inf(1)
 	if rms > 0 {
 		rmsDBFS = 20 * math.Log10(rms/32768.0)
 	}
@@ -88,7 +78,7 @@ func AudioSnippetFromPCM(pcm []byte, sampleRate, totalPCMBytes, maxBytes int) ma
 		}
 		dcSum += int64(s)
 	}
-	peakDBFS := silenceFloorDBFS
+	peakDBFS := -math.Inf(1)
 	if peak > 0 {
 		peakDBFS = 20 * math.Log10(float64(peak)/32768.0)
 	}
--- a/core/trace/audio_snippet_test.go
+++ b/core/trace/audio_snippet_test.go
@@ -1,9 +1,6 @@
 package trace_test

 import (
-	"encoding/json"
-	"math"
-
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"

@@ -50,32 +47,3 @@ var _ = Describe("AudioSnippetFromPCM byte cap", func() {
 		Expect(out).To(HaveKey("audio_wav_base64"))
 	})
 })
-
-// Silent audio (RMS/peak of zero) has a true level of -∞ dBFS, but emitting
-// -Inf made the whole /api/backend-traces response fail to JSON-marshal and
-// blanked the Traces UI. The metrics must instead be finite and serializable.
-var _ = Describe("AudioSnippetFromPCM silent audio dBFS", func() {
-	pcm := makePCM(snippetSeconds, snippetSampleRate) // all zeros == digital silence
-	totalPCM := len(pcm)
-
-	It("reports finite dBFS for silence instead of -Inf", func() {
-		out := trace.AudioSnippetFromPCM(pcm, snippetSampleRate, totalPCM, 0)
-
-		rms, ok := out["audio_rms_dbfs"].(float64)
-		Expect(ok).To(BeTrue())
-		Expect(math.IsInf(rms, 0)).To(BeFalse(), "silent RMS must not be ±Inf")
-		Expect(math.IsNaN(rms)).To(BeFalse())
-
-		peak, ok := out["audio_peak_dbfs"].(float64)
-		Expect(ok).To(BeTrue())
-		Expect(math.IsInf(peak, 0)).To(BeFalse(), "silent peak must not be ±Inf")
-		Expect(math.IsNaN(peak)).To(BeFalse())
-	})
-
-	It("produces a snippet that round-trips through encoding/json", func() {
-		out := trace.AudioSnippetFromPCM(pcm, snippetSampleRate, totalPCM, 0)
-
-		_, err := json.Marshal(out)
-		Expect(err).ToNot(HaveOccurred(), "silent-audio metrics must be JSON-marshalable")
-	})
-})
--- a/core/trace/backend_trace.go
+++ b/core/trace/backend_trace.go
@@ -3,8 +3,6 @@ package trace
 import (
 	"encoding/json"
 	"fmt"
-	"maps"
-	"math"
 	"slices"
 	"sync"
 	"time"
@@ -118,13 +116,8 @@ func RecordBackendTrace(t BackendTrace) {
 	backendMu.Lock()
 	maxBody := backendMaxBodyBytes
 	backendMu.Unlock()
-	// Always walk Data, even with no body cap configured: besides capping
-	// oversized strings (maxBody > 0), the walk replaces non-finite floats
-	// (Inf/NaN) that encoding/json cannot marshal. A single such value — e.g. a
-	// -Inf dBFS audio metric from a silent clip — would otherwise fail the whole
-	// /api/backend-traces response and blank the Traces UI.
-	if t.Data != nil {
-		t.Data = sanitizeData(t.Data, maxBody)
+	if t.Data != nil && maxBody > 0 {
+		t.Data = capDataStrings(t.Data, maxBody)
 	}
 	select {
 	case backendLogChan <- &t:
@@ -133,90 +126,32 @@ func RecordBackendTrace(t BackendTrace) {
 	}
 }

-// sanitizeData walks a trace Data map (recursing into nested maps and slices)
-// and makes every value safe for the /api/backend-traces JSON response:
-//
-//   - When maxBytes > 0, any string longer than maxBytes is replaced with a
-//     fixed-size marker that names the original byte count. The replacement is
-//     intentionally short and not valid base64/JSON: it flags "this was dropped"
-//     cheaply rather than keeping a partial value the UI might try to render.
-//   - Non-finite floats (Inf/NaN) are replaced with nil regardless of maxBytes,
-//     because encoding/json refuses to marshal them and one bad value would fail
-//     the entire response.
-//
-// Other scalars (ints, bools, finite floats) pass through untouched so
-// structural fields like total_deltas or audio_sample_rate remain useful.
-//
-// The walk is copy-on-write: it runs on every RecordBackendTrace call, and in
-// the common case nothing needs rewriting, so containers are only re-allocated
-// on the paths that actually changed and untouched values keep their original
-// interface boxes instead of paying a per-value re-boxing allocation.
-func sanitizeData(data map[string]any, maxBytes int) map[string]any {
-	out, _ := sanitizeMap(data, maxBytes)
+// capDataStrings walks a trace Data map and replaces any string value (at any
+// depth) that exceeds maxBytes with a fixed-size marker that names the
+// original byte count. The replacement is intentionally short and not valid
+// base64/JSON: the goal is to flag "this was dropped" cheaply, not to keep a
+// partial value that the UI might try to render. Non-string scalars and
+// non-map containers pass through untouched so structural fields like
+// total_deltas or audio_sample_rate remain useful.
+func capDataStrings(data map[string]any, maxBytes int) map[string]any {
+	out := make(map[string]any, len(data))
+	for k, v := range data {
+		out[k] = capValue(v, maxBytes)
+	}
 	return out
 }

-func sanitizeMap(m map[string]any, maxBytes int) (map[string]any, bool) {
-	var out map[string]any
-	for k, v := range m {
-		nv, changed := sanitizeValue(v, maxBytes)
-		if changed && out == nil {
-			// First change: fork the map. Entries already visited were
-			// unchanged, so a full copy then overwriting as we go is exact.
-			out = make(map[string]any, len(m))
-			maps.Copy(out, m)
-		}
-		if out != nil {
-			out[k] = nv
-		}
-	}
-	if out == nil {
-		return m, false
-	}
-	return out, true
-}
-
-func sanitizeSlice(s []any, maxBytes int) ([]any, bool) {
-	var out []any
-	for i, v := range s {
-		nv, changed := sanitizeValue(v, maxBytes)
-		if changed && out == nil {
-			out = make([]any, len(s))
-			copy(out, s)
-		}
-		if out != nil {
-			out[i] = nv
-		}
-	}
-	if out == nil {
-		return s, false
-	}
-	return out, true
-}
-
-func sanitizeValue(v any, maxBytes int) (any, bool) {
+func capValue(v any, maxBytes int) any {
 	switch val := v.(type) {
 	case string:
-		if maxBytes > 0 && len(val) > maxBytes {
-			return fmt.Sprintf("<truncated: %d bytes>", len(val)), true
+		if len(val) > maxBytes {
+			return fmt.Sprintf("<truncated: %d bytes>", len(val))
 		}
-		return v, false
-	case float64:
-		if math.IsInf(val, 0) || math.IsNaN(val) {
-			return nil, true
-		}
-		return v, false
-	case float32:
-		if f := float64(val); math.IsInf(f, 0) || math.IsNaN(f) {
-			return nil, true
-		}
-		return v, false
+		return val
 	case map[string]any:
-		return sanitizeMap(val, maxBytes)
-	case []any:
-		return sanitizeSlice(val, maxBytes)
+		return capDataStrings(val, maxBytes)
 	default:
-		return v, false
+		return v
 	}
 }

--- a/core/trace/backend_trace_sanitize_test.go
+++ b/core/trace/backend_trace_sanitize_test.go
@@ -1,80 +0,0 @@
-package trace_test
-
-import (
-	"encoding/json"
-	"math"
-	"time"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/trace"
-)
-
-// encoding/json cannot marshal ±Inf or NaN. The /api/backend-traces endpoint
-// serializes the whole buffer with one json call, so a single non-finite float
-// in any trace's Data map (e.g. a -Inf dBFS audio metric from a silent clip)
-// would fail the entire response and blank the Traces UI. RecordBackendTrace
-// must scrub those values regardless of whether a body cap is configured.
-var _ = Describe("RecordBackendTrace non-finite float sanitization", func() {
-	BeforeEach(func() {
-		// maxBodyBytes 0 == no body cap: float sanitization must still run.
-		trace.InitBackendTracingIfEnabled(64, 0)
-		trace.ClearBackendTraces()
-	})
-
-	It("replaces ±Inf and NaN with nil so the response stays JSON-marshalable", func() {
-		trace.RecordBackendTrace(trace.BackendTrace{
-			Timestamp: time.Now(),
-			Type:      trace.BackendTraceTranscription,
-			ModelName: "m",
-			Data: map[string]any{
-				"audio_rms_dbfs":   math.Inf(-1),
-				"audio_peak_dbfs":  math.Inf(1),
-				"weird":            math.NaN(),
-				"audio_duration_s": 1.5, // finite siblings must survive
-			},
-		})
-
-		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
-		got := trace.GetBackendTraces()[0]
-
-		Expect(got.Data["audio_rms_dbfs"]).To(BeNil())
-		Expect(got.Data["audio_peak_dbfs"]).To(BeNil())
-		Expect(got.Data["weird"]).To(BeNil())
-		Expect(got.Data["audio_duration_s"]).To(Equal(1.5), "finite floats must pass through untouched")
-
-		_, err := json.Marshal(trace.GetBackendTraces())
-		Expect(err).ToNot(HaveOccurred(), "the whole trace buffer must marshal even with non-finite inputs")
-	})
-
-	It("scrubs non-finite floats nested in maps and slices", func() {
-		trace.RecordBackendTrace(trace.BackendTrace{
-			Timestamp: time.Now(),
-			Type:      trace.BackendTraceLLM,
-			ModelName: "m",
-			Data: map[string]any{
-				"nested": map[string]any{
-					"logprob": math.Inf(-1),
-					"ok":      0.25,
-				},
-				"scores": []any{1.0, math.Inf(1), math.NaN()},
-			},
-		})
-
-		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
-		got := trace.GetBackendTraces()[0]
-
-		nested := got.Data["nested"].(map[string]any)
-		Expect(nested["logprob"]).To(BeNil())
-		Expect(nested["ok"]).To(Equal(0.25))
-
-		scores := got.Data["scores"].([]any)
-		Expect(scores[0]).To(Equal(1.0))
-		Expect(scores[1]).To(BeNil())
-		Expect(scores[2]).To(BeNil())
-
-		_, err := json.Marshal(trace.GetBackendTraces())
-		Expect(err).ToNot(HaveOccurred())
-	})
-})
--- a/docs/content/advanced/vram-management.md
+++ b/docs/content/advanced/vram-management.md
@@ -381,8 +381,6 @@ curl -X POST http://localhost:8080/backend/shutdown \

 To stop all models, you'll need to call the endpoint for each loaded model individually, or use the web UI to stop all models at once.

-Conversely, you can pre-load a model into memory ahead of its first request with `POST /backend/load` (the inverse of shutdown) — see [Backend Monitor]({{%relref "features/backend-monitor" %}}).
-
 ### Best Practices

 1. **Monitor VRAM usage**: Use `nvidia-smi` (for NVIDIA GPUs) or similar tools to monitor actual VRAM usage
--- a/docs/content/features/api-discovery.md
+++ b/docs/content/features/api-discovery.md
@@ -36,7 +36,6 @@ Returns the instance version, all available endpoint URLs (flat and categorized)
  "endpoints": {
    "chat_completions": "/v1/chat/completions",
    "models": "/v1/models",
-    "models_capabilities": "/v1/models/capabilities",
    "config_metadata": "/api/models/config-metadata",
    "instructions": "/api/instructions",
    "swagger": "/swagger/index.html"
@@ -124,45 +123,6 @@ Add `?format=json` to get a raw **OpenAPI fragment** (filtered Swagger spec with
 curl http://localhost:8080/api/instructions/config-management?format=json
 ```

-## Model Capabilities
-
-`GET /v1/models/capabilities`
-
-An additive, LocalAI-specific superset of `/v1/models`. It returns the same set of models but enriches each entry with the **capabilities** the model supports and the **input/output modalities** it accepts and produces. Use it to decide, before sending a request, whether a given model can take an image, audio, or video attachment directly — or whether the input needs converting/transcribing first.
-
-Because it is purely additive, clients that only understand `/v1/models` keep working unchanged; they simply never call this route.
-
-```bash
-curl http://localhost:8080/v1/models/capabilities
-```
-
-```json
-{
-  "object": "list",
-  "data": [
-    {
-      "id": "qwen2.5-omni",
-      "object": "model",
-      "capabilities": ["chat", "vision", "tools"],
-      "input_modalities": ["text", "image", "audio"],
-      "output_modalities": ["text"]
-    },
-    {
-      "id": "parakeet",
-      "object": "model",
-      "capabilities": ["transcript"],
-      "input_modalities": ["audio"],
-      "output_modalities": ["text"]
-    }
-  ]
-}
-```
-
- **`capabilities`** — canonical usecase strings (e.g. `chat`, `vision`, `transcript`, `tts`, `embeddings`, `image`, `video`) plus the modifiers `tools` and `thinking`.
- **`input_modalities` / `output_modalities`** — the subsets of `{text, image, audio, video}` the model accepts and produces. Audio and video *input* are derived from the model's multimodal limits (e.g. vLLM `limit_mm_per_prompt`), which no single usecase flag expresses — which is why this endpoint exists alongside the plain listing.
-
-The same query parameters as `/v1/models` are honored (`filter`, `excludeConfigured`), and the same per-user model allowlist is applied when authentication is enabled.
-
 ## Configuration Management APIs

 These endpoints let agents discover model configuration fields, read current settings, modify them, and estimate VRAM usage.
--- a/docs/content/features/authentication.md
+++ b/docs/content/features/authentication.md
@@ -166,7 +166,7 @@ When authentication is enabled, the following endpoints require admin role:
 - `GET /api/backend-traces`, `POST /api/backend-traces/clear`
 - `GET /api/backend-logs/*`, `POST /api/backend-logs/*/clear`
 - `GET /api/resources`, `GET /api/settings`, `POST /api/settings`
- `GET /system`, `GET /backend/monitor`, `POST /backend/shutdown`, `POST /backend/load`
+- `GET /system`, `GET /backend/monitor`, `POST /backend/shutdown`

 **P2P:**
 - `GET /api/p2p/*`
--- a/docs/content/features/backend-monitor.md
+++ b/docs/content/features/backend-monitor.md
@@ -5,9 +5,7 @@ weight = 20
 url = "/features/backend-monitor/"
 +++

-LocalAI provides endpoints to monitor and manage running backends. The `/backend/monitor` endpoint reports the status and resource usage of loaded models, `/backend/load` pre-loads a model into memory, and `/backend/shutdown` allows stopping a model's backend process.
-
-All three are admin-only.
+LocalAI provides endpoints to monitor and manage running backends. The `/backend/monitor` endpoint reports the status and resource usage of loaded models, and `/backend/shutdown` allows stopping a model's backend process.

 ## Monitor API

@@ -64,42 +62,6 @@ curl "http://localhost:8080/backend/monitor?model=my-model"
 }
 ```

-## Load API
-
-Pre-loads a model into memory ahead of its first request, so that request pays no cold-start load cost. It is the inverse of the Shutdown API and works for any model, not just realtime pipelines.
-
- **Method:** `POST`
- **Endpoints:** `/backend/load`, `/v1/backend/load`
-
-### Request
-
-| Parameter | Type     | Required | Description                  |
-|-----------|----------|----------|------------------------------|
-| `model`   | `string` | Yes      | Name of the model to load    |
-
-### Behavior
-
- For a regular model, its own backend is loaded.
- For a **realtime pipeline** model (a config with a `pipeline:` block), every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded concurrently instead of the pipeline stub, which has no backend of its own.
-
-The call blocks until loading finishes and reports which model names became resident, so partial failures are visible.
-
-### Usage
-
-```bash
-curl -X POST http://localhost:8080/backend/load \
-  -H "Content-Type: application/json" \
-  -d '{"model": "my-model"}'
-```
-
-### Example response
-
-```json
-{ "loaded": ["my-model"], "message": "model loaded" }
-```
-
-On failure the call returns `500` with `loaded` listing whichever sub-models did load and `message` naming the failures.
-
 ## Shutdown API

 - **Method:** `POST`
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -56,39 +56,6 @@ pipeline:

 All streaming flags are off by default, so existing pipelines are unaffected.

-### Model warm-up (cold start)
-
-Without warm-up the pipeline's models are loaded into memory only on first use *within* a session: the VAD on the first audio chunk, transcription at the first end-of-speech, the LLM on the first reply, and TTS on the first spoken output. On a cold session this staggers a load delay across those first few interactions — and a model that fails to load (missing weights, wrong backend, out of memory) only fails part-way through the first turn.
-
-To avoid that, LocalAI **warms the pipeline by default**: it loads the VAD, transcription, LLM and TTS backends into memory *before* the session is announced, and the session start **blocks until they are all ready**. The loads run concurrently, so the wait is the slowest single model, not the sum. This means:
-
- The first turn pays no cold-start cost — every backend is already resident.
- **Model-load errors surface at session start.** If any stage fails to load, the session is not started and the client receives a `model_load_error` instead of `session.created`, so a broken pipeline fails fast and visibly rather than mid-call.
-
-Set `disable_warmup: true` to restore the lazy "load on first use" behavior — session start no longer waits on loading and load errors surface on the first turn instead. Useful if you want idle sessions to avoid holding model memory they may never use:
-
-```yaml
-name: gpt-realtime
-pipeline:
-  vad: silero-vad-ggml
-  transcription: whisper-large-turbo
-  llm: qwen3-4b
-  tts: tts-1
-  disable_warmup: true   # lazily load each model on first use instead of at session start
-```
-
-#### Pre-loading a pipeline on demand
-
-Warm-up only fires when a realtime session opens. To load a pipeline into memory ahead of time — e.g. to warm it right after boot, or when running with `disable_warmup: true` — POST the model name to the admin-only `/backend/load` endpoint. For a pipeline model it loads every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) concurrently:
-
-```bash
-curl -X POST http://localhost:8080/backend/load \
-  -H "Content-Type: application/json" \
-  -d '{"model": "gpt-realtime"}'
-```
-
-The endpoint is not realtime-specific — it pre-loads any model. See [Backend Monitor]({{%relref "features/backend-monitor" %}}) for the full request/response reference (it is the inverse of `/backend/shutdown`).
-
 ### Turn detection

 Turn detection decides when the user has finished speaking and the pipeline should respond. Two modes are supported, matching the OpenAI session schema:
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -507,7 +507,7 @@ The `llama.cpp` backend supports additional configuration options that can be sp
 | `fit_params_min_ctx` or `fit_ctx` | integer | Minimum context size that can be set by fit_params. Default: `4096`. | `fit_ctx:2048` |
 | `n_cache_reuse` or `cache_reuse` | integer | Minimum chunk size to attempt reusing from the cache via KV shifting. Default: `0` (disabled). | `cache_reuse:256` |
 | `slot_prompt_similarity` or `sps` | float | How much the prompt of a request must match the prompt of a slot to use that slot. Default: `0.1`. Set to `0` to disable. | `sps:0.5` |
-| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Default: `false`. | `swa_full:true` |
+| `swa_full` | boolean | Use full-size SWA (Sliding Window Attention) cache. Upstream default is `false` (a memory-light reduced cache), but that reduced cache cannot reuse a prompt prefix across requests, which defeats `cache_reuse` for SWA models (Gemma 2/3, Cohere2, Llama 4, ...). LocalAI therefore **auto-enables `swa_full:true` for GGUF models detected as SWA** so the cross-request prefix cache works; it is left off for dense models. The tradeoff is memory: the full SWA cache scales with `context_size`. Set `swa_full:false` explicitly to opt back out (e.g. to save memory at a large context). | `swa_full:true` |
 | `cont_batching` or `continuous_batching` | boolean | Enable continuous batching for handling multiple sequences. Default: `true`. | `cont_batching:true` |
 | `check_tensors` | boolean | Validate tensor data for invalid values during model loading. Default: `false`. | `check_tensors:true` |
 | `warmup` | boolean | Enable warmup run after model loading. Default: `true`. | `warmup:false` |
--- a/docs/content/whats-new.md
+++ b/docs/content/whats-new.md
@@ -17,7 +17,6 @@ You can see the release notes [here](https://github.com/mudler/LocalAI/releases)
 - **May 2026**: [Speaker diarization](/features/audio-diarization/) — new `/v1/audio/diarization` endpoint returning "who spoke when" segments. Backed by `sherpa-onnx` (pyannote-3.0 + speaker embeddings + clustering) for pure diarization, and `vibevoice-cpp` for diarization bundled with long-form ASR. Supports `json` / `verbose_json` / `rttm` response formats.
 - **June 2026**: [Sound classification](/features/audio-classification/) — new `/v1/audio/classification` endpoint for audio tagging / sound-event classification, returning scored [AudioSet](https://research.google.com/audioset/) labels (baby cry, glass breaking, alarms, ...). Backed by [ced.cpp](https://github.com/mudler/ced.cpp), a 527-class AudioSet tagger ported to ggml.
 - **June 2026**: [PII analyze / redact API](/features/middleware/#analyze--redact-api) — the PII detection pipeline (NER + restricted-regex pattern tiers) is now a standalone service: `POST /api/pii/analyze` returns detected entity spans and `POST /api/pii/redact` returns the sanitised text (or `400 pii_blocked`), without routing a chat request through the middleware. Events gain an `origin` (`middleware` / `proxy` / `pii_analyze` / `pii_redact`) so `/api/pii/events` can be filtered by source.
- **July 2026**: [Model capabilities endpoint](/features/api-discovery/#model-capabilities) — `GET /v1/models/capabilities`, an additive superset of `/v1/models` that reports each model's `capabilities` plus its `input_modalities` / `output_modalities` (`text` / `image` / `audio` / `video`). Lets clients route image/audio/video attachments to a model only when it can handle them; audio/video *input* is derived from the model's multimodal limits, which no single usecase flag expresses.
 - **June 2026**: Concurrent scoring and PII NER on llama.cpp — the `Score` (router classifier) and `TokenClassify` (PII NER) primitives now ride llama.cpp's server task queue instead of locking the context, so they run concurrently with chat/completion/embedding traffic and with each other. The `known_usecases` restriction that forced dedicated scorer/NER model configs on llama-cpp is lifted, repeated scoring calls reuse the prompt KV cache across candidates, and scoring inputs are no longer capped by the physical batch size.

 ## 2024 Highlights
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,56 +1,4 @@
 ---
- name: "qwopus3.6-35b-a3b-coder-mtp"
-  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
-  urls:
-    - https://huggingface.co/Jackrong/Qwopus3.6-35B-A3B-Coder-MTP-GGUF
-  description: |
-    # 🌟 Qwopus3.6-35B-A3B-v1
-
-    ## 💡 Base Model Overview
-
-    **Qwen3.6-35B-A3B** is an advanced hybrid sparse MoE (Mixture-of-Experts) model developed by Alibaba Cloud. It features 35B total parameters with only 3B active parameters per token, ensuring high inference efficiency. Architecturally, it combines Gated DeltaNet linear attention with standard gated attention layers, routing tokens across **256 experts**. It natively supports a massive **262k context window** and is specifically designed for high-performance agentic coding, deep reasoning, and multimodal tasks.
-
-    ## 🚀 Model Refinement & Logic Tuning （Qwopus3.6-35B-A3B-v1）
-
-    🪐**Qwopus3.6-35B-A3B-v1** is a reasoning-enhanced MoE (Mixture of Experts) model fine-tuned on top of **Qwen3.6-35B-A3B**.
-
-    ### 🛠 Training Strategy
-
-    The fine-tuning process for this model is structured into **three distinct stages of distributed SFT (Supervised Fine-Tuning)**, progressively scaling reasoning complexity and data diversity. This systematic approach ensures the model inherits the base MoE capabilities while sharpening its logic-handling depth.
-
-    ...
-  license: "apache-2.0"
-  tags:
-    - llm
-    - gguf
-    - vision
-    - multimodal
-  icon: https://cdn-uploads.huggingface.co/production/uploads/66309bd090589b7c65950665/ztbyGV_zGhzcLuTCSVyq3.png
-  overrides:
-    backend: llama-cpp
-    function:
-      automatic_tool_parsing_fallback: true
-      grammar:
-        disable: true
-    known_usecases:
-      - chat
-    mmproj: llama-cpp/mmproj/Qwopus3.6-35B-A3B-Coder-MTP-Q4_K_M/mmproj-F32.gguf
-    options:
-      - use_jinja:true
-      - spec_type:draft-mtp
-      - spec_n_max:6
-      - spec_p_min:0.75
-    parameters:
-      model: llama-cpp/models/Qwopus3.6-35B-A3B-Coder-MTP-Q4_K_M/Qwopus3.6-35B-A3B-Coder-MTP-Q4_K_M.gguf
-    template:
-      use_tokenizer_template: true
-  files:
-    - filename: llama-cpp/models/Qwopus3.6-35B-A3B-Coder-MTP-Q4_K_M/Qwopus3.6-35B-A3B-Coder-MTP-Q4_K_M.gguf
-      sha256: c283cd2321a3cb4c6e7faf9481ac7d946913e4f02e20172eb2872112f567d8d4
-      uri: https://huggingface.co/Jackrong/Qwopus3.6-35B-A3B-Coder-MTP-GGUF/resolve/main/Qwopus3.6-35B-A3B-Coder-MTP-Q4_K_M.gguf
-    - filename: llama-cpp/mmproj/Qwopus3.6-35B-A3B-Coder-MTP-Q4_K_M/mmproj-F32.gguf
-      sha256: 5c82c8095717b39f29c88ebfec3607a10307785b1e14a87744603d6c582cd497
-      uri: https://huggingface.co/Jackrong/Qwopus3.6-35B-A3B-Coder-MTP-GGUF/resolve/main/mmproj-F32.gguf
 - name: "ornith-1.0-9b-mtp"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
--- a/pkg/functions/peg/parser.go
+++ b/pkg/functions/peg/parser.go
@@ -461,7 +461,10 @@ func (p *RuleParser) parse(arena *Arena, ctx *ParseContext, start int) ParseResu
 	if result.Type != Fail {
 		text := ""
 		if result.Start < len(ctx.Input) {
-			end := min(result.End, len(ctx.Input))
+			end := result.End
+			if end > len(ctx.Input) {
+				end = len(ctx.Input)
+			}
 			text = ctx.Input[result.Start:end]
 		}

@@ -511,7 +514,10 @@ func (p *TagParser) parse(arena *Arena, ctx *ParseContext, start int) ParseResul
 	if result.Type != Fail {
 		text := ""
 		if result.Start < len(ctx.Input) {
-			end := min(result.End, len(ctx.Input))
+			end := result.End
+			if end > len(ctx.Input) {
+				end = len(ctx.Input)
+			}
 			text = ctx.Input[result.Start:end]
 		}

--- a/pkg/mcp/localaitools/client.go
+++ b/pkg/mcp/localaitools/client.go
@@ -36,10 +36,6 @@ type LocalAIClient interface {
 	DeleteModel(ctx context.Context, name string) error
 	EditModelConfig(ctx context.Context, name string, patch map[string]any) error
 	ReloadModels(ctx context.Context) error
-	// LoadModel pre-loads a model into memory by name (the inverse of shutting
-	// it down). For a realtime pipeline model every configured sub-model is
-	// loaded; it returns the model names that became resident.
-	LoadModel(ctx context.Context, model string) ([]string, error)
 	ImportModelURI(ctx context.Context, req ImportModelURIRequest) (*ImportModelURIResponse, error)

 	// ---- Model aliases ----
--- a/pkg/mcp/localaitools/coverage_test.go
+++ b/pkg/mcp/localaitools/coverage_test.go
@@ -49,7 +49,6 @@ var toolToHTTPRoute = map[string]string{
 	ToolDeleteModel:       "POST /models/delete/:name",
 	ToolEditModelConfig:   "PATCH /api/models/config-json/:name",
 	ToolReloadModels:      "POST /models/reload",
-	ToolLoadModel:         "POST /backend/load",
 	ToolInstallBackend:    "POST /backends/apply",
 	ToolUpgradeBackend:    "POST /backends/upgrade/:name",
 	ToolToggleModelState:  "PUT /models/toggle-state/:name/:action",
--- a/pkg/mcp/localaitools/fakes_test.go
+++ b/pkg/mcp/localaitools/fakes_test.go
@@ -35,7 +35,6 @@ type fakeClient struct {
 	setAlias            func(string, string) error
 	listAliases         func() ([]AliasInfo, error)
 	reloadModels        func() error
-	loadModel           func(string) ([]string, error)
 	listBackends        func() ([]Backend, error)
 	listKnownBackends   func() ([]schema.KnownBackend, error)
 	installBackend      func(InstallBackendRequest) (string, error)
@@ -170,14 +169,6 @@ func (f *fakeClient) ReloadModels(_ context.Context) error {
 	return nil
 }

-func (f *fakeClient) LoadModel(_ context.Context, model string) ([]string, error) {
-	f.record("LoadModel", model)
-	if f.loadModel != nil {
-		return f.loadModel(model)
-	}
-	return []string{model}, nil
-}
-
 func (f *fakeClient) ListBackends(_ context.Context) ([]Backend, error) {
 	f.record("ListBackends", nil)
 	if f.listBackends != nil {
--- a/pkg/mcp/localaitools/httpapi/client.go
+++ b/pkg/mcp/localaitools/httpapi/client.go
@@ -338,16 +338,6 @@ func (c *Client) ReloadModels(ctx context.Context) error {
 	return c.do(ctx, http.MethodPost, routeModelsReload, nil, nil)
 }

-func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
-	// On a load failure the endpoint returns a non-2xx whose body (carrying the
-	// per-sub-model failure detail) is folded into the HTTPError by c.do.
-	var resp schema.ModelLoadResponse
-	if err := c.do(ctx, http.MethodPost, routeBackendLoad, map[string]string{"model": model}, &resp); err != nil {
-		return nil, err
-	}
-	return resp.Loaded, nil
-}
-
 // ---- Model aliases ----

 // SetAlias is swap-first: it PATCHes the alias config (a deep-merge that
--- a/pkg/mcp/localaitools/httpapi/routes.go
+++ b/pkg/mcp/localaitools/httpapi/routes.go
@@ -19,7 +19,6 @@ const (
 	routeModelImport     = "/models/import"
 	routeAliases         = "/api/aliases"
 	routeModelsReload    = "/models/reload"
-	routeBackendLoad     = "/backend/load"
 	routeBackends        = "/backends"
 	routeBackendsKnown   = "/backends/known"
 	routeBackendsApply   = "/backends/apply"
--- a/pkg/mcp/localaitools/inproc/client.go
+++ b/pkg/mcp/localaitools/inproc/client.go
@@ -13,7 +13,6 @@ import (
 	"path/filepath"

 	"github.com/google/uuid"
-	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/gallery/importers"
@@ -303,16 +302,6 @@ func (c *Client) ReloadModels(_ context.Context) error {
 	return c.ConfigLoader.LoadModelConfigsFromPath(c.SystemState.Model.ModelsPath)
 }

-func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
-	if c.ConfigLoader == nil || c.ModelLoader == nil {
-		return nil, errors.New("model loader not available")
-	}
-	// Reuse the same preload path the REST /backend/load endpoint uses, so a
-	// pipeline model loads all its sub-models and the behaviour stays identical
-	// across the in-process and HTTP clients.
-	return backend.PreloadModelByName(ctx, c.ConfigLoader, c.ModelLoader, c.AppConfig, model)
-}
-
 // ---- Model aliases ----

 // SetAlias is swap-first to match the httpapi client: PatchConfig swaps an
--- a/pkg/mcp/localaitools/inproc/load_model_test.go
+++ b/pkg/mcp/localaitools/inproc/load_model_test.go
@@ -1,71 +0,0 @@
-package inproc
-
-import (
-	"context"
-	"os"
-	"path/filepath"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-)
-
-var _ = Describe("inproc.Client LoadModel", func() {
-	var (
-		ctx       context.Context
-		tempDir   string
-		cl        *config.ModelConfigLoader
-		ml        *model.ModelLoader
-		c         *Client
-		seedModel func(name, body string)
-	)
-
-	BeforeEach(func() {
-		ctx = context.Background()
-		tempDir = GinkgoT().TempDir()
-		systemState, err := system.GetSystemState(system.WithModelPath(tempDir))
-		Expect(err).ToNot(HaveOccurred())
-		appConfig := config.NewApplicationConfig(config.WithSystemState(systemState))
-		cl = config.NewModelConfigLoader(tempDir)
-		ml = model.NewModelLoader(systemState) // no backends installed
-		c = New(appConfig, systemState, cl, ml, nil)
-
-		seedModel = func(name, body string) {
-			Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
-			Expect(cl.LoadModelConfigsFromPath(tempDir)).To(Succeed())
-		}
-	})
-
-	It("errors when the model loader is unavailable", func() {
-		noLoader := New(c.AppConfig, c.SystemState, cl, nil, nil)
-		_, err := noLoader.LoadModel(ctx, "anything")
-		Expect(err).To(MatchError(ContainSubstring("model loader not available")))
-	})
-
-	It("loads a regular model through the model loader", func() {
-		seedModel("solo", "name: solo\n")
-		// No backend is installed in the test env, so the load itself fails — but
-		// the call must exercise the single-model path and surface that error
-		// rather than panicking or silently succeeding.
-		loaded, err := c.LoadModel(ctx, "solo")
-		Expect(err).To(HaveOccurred())
-		Expect(loaded).To(BeEmpty())
-	})
-
-	It("expands a pipeline model into its sub-models", func() {
-		seedModel("voicebot", "name: voicebot\npipeline:\n  vad: vad-m\n  llm: llm-m\n")
-		seedModel("vad-m", "name: vad-m\n")
-		seedModel("llm-m", "name: llm-m\n")
-
-		loaded, err := c.LoadModel(ctx, "voicebot")
-		// Sub-models can't load without backends, so the joined error names them
-		// — proving the pipeline stub was expanded rather than loaded directly.
-		Expect(err).To(HaveOccurred())
-		Expect(err.Error()).To(ContainSubstring("vad-m"))
-		Expect(err.Error()).ToNot(ContainSubstring("voicebot"))
-		Expect(loaded).To(BeEmpty())
-	})
-})
--- a/pkg/mcp/localaitools/prompts/10_safety.md
+++ b/pkg/mcp/localaitools/prompts/10_safety.md
@@ -2,7 +2,7 @@

 These rules are non-negotiable. The user trusts you to operate their server without unintended changes.

-1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `load_model`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.
+1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.

 2. **Disambiguate before mutating.** If the user's request is ambiguous (several gallery candidates match, the model name has multiple installed versions, the backend has variants), present the candidates as a numbered list and ask the user to pick before calling any mutating tool.

--- a/pkg/mcp/localaitools/prompts/20_tools.md
+++ b/pkg/mcp/localaitools/prompts/20_tools.md
@@ -24,6 +24,5 @@ The MCP `tools/list` endpoint also exposes the full input schema for each of the
 - `upgrade_backend` — Upgrade an installed backend by name.
 - `edit_model_config` — Patch (deep-merge) JSON into an installed model's config.
 - `reload_models` — Reload all model configs from disk.
- `load_model` — Pre-load a model into memory so the first request pays no cold-start cost. For a realtime pipeline model, every sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Inverse of stopping a model.
 - `toggle_model_state` — Enable or disable a model (`action`: `enable` or `disable`).
 - `toggle_model_pinned` — Pin or unpin a model (`action`: `pin` or `unpin`).
--- a/pkg/mcp/localaitools/server_test.go
+++ b/pkg/mcp/localaitools/server_test.go
@@ -92,7 +92,6 @@ var expectedFullCatalog = sortedStrings(
 	ToolListInstalledModels,
 	ToolListKnownBackends,
 	ToolListNodes,
-	ToolLoadModel,
 	ToolReloadModels,
 	ToolSetAlias,
 	ToolSetBranding,
@@ -167,7 +166,6 @@ var _ = Describe("Tool dispatch", func() {
 		{ToolUpgradeBackend, map[string]any{"name": "llama-cpp"}, "UpgradeBackend"},
 		{ToolEditModelConfig, map[string]any{"name": "foo", "patch": map[string]any{"context_size": 4096}}, "EditModelConfig"},
 		{ToolReloadModels, struct{}{}, "ReloadModels"},
-		{ToolLoadModel, map[string]any{"model": "test-model"}, "LoadModel"},
 		{ToolToggleModelState, map[string]any{"name": "foo", "action": "enable"}, "ToggleModelState"},
 		{ToolToggleModelPinned, map[string]any{"name": "foo", "action": "pin"}, "ToggleModelPinned"},
 		{ToolSetAlias, map[string]any{"name": "gpt-4", "target": "real"}, "SetAlias"},
--- a/pkg/mcp/localaitools/tools.go
+++ b/pkg/mcp/localaitools/tools.go
@@ -31,7 +31,6 @@ const (
 	ToolDeleteModel       = "delete_model"
 	ToolEditModelConfig   = "edit_model_config"
 	ToolReloadModels      = "reload_models"
-	ToolLoadModel         = "load_model"
 	ToolInstallBackend    = "install_backend"
 	ToolUpgradeBackend    = "upgrade_backend"
 	ToolToggleModelState  = "toggle_model_state"
--- a/pkg/mcp/localaitools/tools_models.go
+++ b/pkg/mcp/localaitools/tools_models.go
@@ -65,22 +65,6 @@ func registerModelTools(s *mcp.Server, client LocalAIClient, opts Options) {
 		return
 	}

-	mcp.AddTool(s, &mcp.Tool{
-		Name:        ToolLoadModel,
-		Description: "Pre-load a model into memory by name so the first request pays no cold-start cost (the inverse of shutting a model down). For a realtime pipeline model every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Returns the model names that became resident. Requires user confirmation per safety rule 1.",
-	}, func(ctx context.Context, _ *mcp.CallToolRequest, args struct {
-		Model string `json:"model" jsonschema:"The installed model name to load into memory."`
-	}) (*mcp.CallToolResult, any, error) {
-		if args.Model == "" {
-			return errorResultf("model is required"), nil, nil
-		}
-		loaded, err := client.LoadModel(ctx, args.Model)
-		if err != nil {
-			return errorResult(err), nil, nil
-		}
-		return jsonResult(map[string]any{"loaded": loaded}), nil, nil
-	})
-
 	mcp.AddTool(s, &mcp.Tool{
 		Name:        ToolInstallModel,
 		Description: "Install a model from a gallery. Requires explicit user confirmation per safety rule 1. Returns a job id; poll with get_job_status.",
--- a/scripts/build/package-gpu-libs-rocm-data_test.sh
+++ b/scripts/build/package-gpu-libs-rocm-data_test.sh
@@ -1,57 +0,0 @@
-#!/bin/bash
-# Regression test for scripts/build/package-gpu-libs.sh ROCm data bundling.
-#
-# Guards issue #10660: hipBLASLt (rocblaslt) resolves its TensileLibrary_lazy_gfx*.dat
-# kernel data relative to the bundled libhipblaslt.so. The packager copied the
-# rocblas/ data dir but not the hipblaslt/ data dir, so the bundled backend
-# fell back to slow generic kernels and logged
-#   rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory
-#
-# This test fabricates a fake ROCm tree containing both rocblas/ and hipblaslt/
-# tensile data, points the packager at it via ROCM_BASE_DIRS, and asserts BOTH
-# data directories are bundled into the target lib dir.
-set -euo pipefail
-
-CURDIR=$(dirname "$(realpath "$0")")
-SCRIPT="$CURDIR/package-gpu-libs.sh"
-
-WORK=$(mktemp -d)
-trap 'rm -rf "$WORK"' EXIT
-
-# Fabricate a fake ROCm install with both rocblas and hipblaslt tensile data.
-FAKE_ROCM="$WORK/opt/rocm"
-mkdir -p "$FAKE_ROCM/lib/rocblas/library"
-mkdir -p "$FAKE_ROCM/lib/hipblaslt/library"
-echo "fake rocblas tensile" > "$FAKE_ROCM/lib/rocblas/library/TensileLibrary_lazy_gfx1201.dat"
-echo "fake hipblaslt tensile" > "$FAKE_ROCM/lib/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat"
-
-TARGET="$WORK/target"
-mkdir -p "$TARGET"
-
-# shellcheck source=/dev/null
-source "$SCRIPT" "$TARGET"
-
-# Point the data-dir copy at the fabricated tree instead of the real /opt/rocm,
-# then run the actual ROCm packager. This asserts package_rocm_libs itself
-# bundles BOTH data dirs, not just that the helper works in isolation.
-export BUILD_TYPE=hipblas
-export ROCM_BASE_DIRS="$FAKE_ROCM"
-package_rocm_libs
-
-fail=false
-if [ ! -e "$TARGET/rocblas/library/TensileLibrary_lazy_gfx1201.dat" ]; then
-    echo "FAIL: rocblas tensile data was NOT bundled"
-    fail=true
-fi
-if [ ! -e "$TARGET/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat" ]; then
-    echo "FAIL: hipblaslt tensile data was NOT bundled (regression of #10660)"
-    fail=true
-fi
-
-if [ "$fail" = true ]; then
-    ls -R "$TARGET" || true
-    exit 1
-fi
-
-echo "PASS: rocblas and hipblaslt tensile data were both bundled"
-exit 0
--- a/scripts/build/package-gpu-libs.sh
+++ b/scripts/build/package-gpu-libs.sh
@@ -224,50 +224,6 @@ package_cuda_libs() {
    echo "CUDA libraries packaged successfully"
 }

-# Copy a ROCm library data subdirectory (e.g. rocblas, hipblaslt) into the
-# bundled lib/ dir. These directories hold the TensileLibrary_*.dat GPU kernel
-# tuning files, which rocBLAS/hipBLASLt load at runtime *relative to their own
-# .so*. Since backends ship their own copies of libhipblaslt.so/librocblas.so
-# under lib/, the matching data dir must travel with them or the libs fall back
-# to slow generic kernels (rocblaslt error: Cannot read TensileLibrary_lazy_gfx*.dat;
-# see issue #10660).
-#
-# The ROCm search roots default to /opt/rocm{,-*} but can be overridden via the
-# ROCM_BASE_DIRS env var (space-separated), which keeps the copy unit-testable
-# without a real ROCm install.
-# Args: $1 = data subdir name found under <rocm-root>/lib{,64}/
-copy_rocm_data_dir() {
-    local data_name="$1"
-    # Single-line `local x=$(...)` on purpose: `local` masks the command
-    # substitution's exit status, which is 1 when nullglob is unset and would
-    # otherwise trip the script's `set -e`.
-    local old_nullglob=$(shopt -p nullglob)
-    shopt -s nullglob
-    local rocm_dirs
-    if [ -n "${ROCM_BASE_DIRS:-}" ]; then
-        # shellcheck disable=SC2206  # intentional word-split of the override
-        rocm_dirs=(${ROCM_BASE_DIRS})
-    else
-        rocm_dirs=(/opt/rocm /opt/rocm-*)
-    fi
-    eval "$old_nullglob"
-    local found=false
-    local rocm_base lib_subdir
-    for rocm_base in "${rocm_dirs[@]}"; do
-        for lib_subdir in lib lib64; do
-            if [ -d "$rocm_base/$lib_subdir/$data_name" ]; then
-                echo "Found $data_name data at $rocm_base/$lib_subdir/$data_name"
-                mkdir -p "$TARGET_LIB_DIR/$data_name"
-                cp -arfL "$rocm_base/$lib_subdir/$data_name/"* "$TARGET_LIB_DIR/$data_name/" || echo "WARNING: Failed to copy $data_name data from $rocm_base/$lib_subdir/$data_name"
-                found=true
-            fi
-        done
-    done
-    if [ "$found" = false ]; then
-        echo "WARNING: No $data_name library data found in ${ROCM_BASE_DIRS:-/opt/rocm*}/lib{,64}/$data_name"
-    fi
-}
-
 # Package AMD ROCm/HIPBlas libraries
 package_rocm_libs() {
    echo "Packaging ROCm/HIPBlas libraries for BUILD_TYPE=${BUILD_TYPE}..."
@@ -311,16 +267,27 @@ package_rocm_libs() {
        fi
    done

-    # Copy rocBLAS and hipBLASLt kernel data (TensileLibrary_*.dat tuning files)
-    # so the bundled libs find their per-arch kernels at runtime instead of
-    # falling back to slow generic code (see copy_rocm_data_dir / issue #10660).
-    copy_rocm_data_dir rocblas
-    copy_rocm_data_dir hipblaslt
+    # Copy rocblas library data (tuning files, TensileLibrary, etc.). The single-line `local x=$(...)` is deliberate: `local` masks shopt -p's nonzero exit (nullglob unset) so it cannot trip `set -e`.
+    local old_nullglob=$(shopt -p nullglob)
+    shopt -s nullglob
+    local rocm_dirs=(/opt/rocm /opt/rocm-*)
+    eval "$old_nullglob"
+    local rocblas_found=false rocm_base lib_subdir
+    for rocm_base in "${rocm_dirs[@]}"; do
+        for lib_subdir in lib lib64; do
+            if [ -d "$rocm_base/$lib_subdir/rocblas" ]; then
+                echo "Found rocblas data at $rocm_base/$lib_subdir/rocblas"
+                mkdir -p "$TARGET_LIB_DIR/rocblas"
+                cp -arfL "$rocm_base/$lib_subdir/rocblas/"* "$TARGET_LIB_DIR/rocblas/" || echo "WARNING: Failed to copy rocblas data from $rocm_base/$lib_subdir/rocblas"
+                rocblas_found=true
+            fi
+        done
+    done
+    if [ "$rocblas_found" = false ]; then
+        echo "WARNING: No rocblas library data found in /opt/rocm*/lib{,64}/rocblas"
+    fi

    # Copy libomp from LLVM (required for ROCm)
-    # Single-line `local x=$(...)` on purpose: masks shopt -p's nonzero exit
-    # (nullglob unset) so it doesn't trip `set -e`.
-    local old_nullglob=$(shopt -p nullglob)
    shopt -s nullglob
    local omp_libs=(/opt/rocm*/lib/llvm/lib/libomp.so*)
    eval "$old_nullglob"
@@ -510,7 +477,6 @@ export -f copy_libs_glob
 export -f is_core_lib
 export -f copy_elf_deps
 export -f sweep_transitive_deps
-export -f copy_rocm_data_dir
 export -f package_cuda_libs
 export -f package_rocm_libs
 export -f package_intel_libs
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1443,52 +1443,6 @@ const docTemplate = `{
                "responses": {}
            }
        },
-        "/backend/load": {
-            "post": {
-                "description": "Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown.",
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "monitoring"
-                ],
-                "summary": "Pre-load a model into memory",
-                "parameters": [
-                    {
-                        "description": "Model to load",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelLoadRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "Model loaded",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelLoadResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Missing model name",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelLoadResponse"
-                        }
-                    },
-                    "500": {
-                        "description": "Load failed (Loaded lists any sub-models that did load)",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelLoadResponse"
-                        }
-                    }
-                }
-            }
-        },
        "/backend/monitor": {
            "get": {
                "tags": [
@@ -2728,22 +2682,6 @@ const docTemplate = `{
                }
            }
        },
-        "/v1/models/capabilities": {
-            "get": {
-                "tags": [
-                    "models"
-                ],
-                "summary": "List available models enriched with capabilities and input/output modalities.",
-                "responses": {
-                    "200": {
-                        "description": "Response",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelCapabilitiesResponse"
-                        }
-                    }
-                }
-            }
-        },
        "/v1/rerank": {
            "post": {
                "tags": [
@@ -5198,76 +5136,6 @@ const docTemplate = `{
                }
            }
        },
-        "schema.ModelCapabilities": {
-            "type": "object",
-            "properties": {
-                "capabilities": {
-                    "description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "id": {
-                    "type": "string"
-                },
-                "input_modalities": {
-                    "description": "InputModalities is the subset of {text,image,audio,video} the model accepts.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "object": {
-                    "type": "string"
-                },
-                "output_modalities": {
-                    "description": "OutputModalities is the subset of {text,image,audio,video} the model produces.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            }
-        },
-        "schema.ModelCapabilitiesResponse": {
-            "type": "object",
-            "properties": {
-                "data": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.ModelCapabilities"
-                    }
-                },
-                "object": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.ModelLoadRequest": {
-            "type": "object",
-            "properties": {
-                "model": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.ModelLoadResponse": {
-            "type": "object",
-            "properties": {
-                "loaded": {
-                    "description": "Loaded lists the model names actually resident in memory after the call.\nFor a pipeline model these are its sub-models, not the pipeline name.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "message": {
-                    "description": "Message is a short human-readable status (\"model loaded\", or an error).",
-                    "type": "string"
-                }
-            }
-        },
        "schema.ModelsDataResponse": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1440,52 +1440,6 @@
                "responses": {}
            }
        },
-        "/backend/load": {
-            "post": {
-                "description": "Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown.",
-                "consumes": [
-                    "application/json"
-                ],
-                "produces": [
-                    "application/json"
-                ],
-                "tags": [
-                    "monitoring"
-                ],
-                "summary": "Pre-load a model into memory",
-                "parameters": [
-                    {
-                        "description": "Model to load",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelLoadRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "Model loaded",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelLoadResponse"
-                        }
-                    },
-                    "400": {
-                        "description": "Missing model name",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelLoadResponse"
-                        }
-                    },
-                    "500": {
-                        "description": "Load failed (Loaded lists any sub-models that did load)",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelLoadResponse"
-                        }
-                    }
-                }
-            }
-        },
        "/backend/monitor": {
            "get": {
                "tags": [
@@ -2725,22 +2679,6 @@
                }
            }
        },
-        "/v1/models/capabilities": {
-            "get": {
-                "tags": [
-                    "models"
-                ],
-                "summary": "List available models enriched with capabilities and input/output modalities.",
-                "responses": {
-                    "200": {
-                        "description": "Response",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelCapabilitiesResponse"
-                        }
-                    }
-                }
-            }
-        },
        "/v1/rerank": {
            "post": {
                "tags": [
@@ -5195,76 +5133,6 @@
                }
            }
        },
-        "schema.ModelCapabilities": {
-            "type": "object",
-            "properties": {
-                "capabilities": {
-                    "description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "id": {
-                    "type": "string"
-                },
-                "input_modalities": {
-                    "description": "InputModalities is the subset of {text,image,audio,video} the model accepts.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "object": {
-                    "type": "string"
-                },
-                "output_modalities": {
-                    "description": "OutputModalities is the subset of {text,image,audio,video} the model produces.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            }
-        },
-        "schema.ModelCapabilitiesResponse": {
-            "type": "object",
-            "properties": {
-                "data": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.ModelCapabilities"
-                    }
-                },
-                "object": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.ModelLoadRequest": {
-            "type": "object",
-            "properties": {
-                "model": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.ModelLoadResponse": {
-            "type": "object",
-            "properties": {
-                "loaded": {
-                    "description": "Loaded lists the model names actually resident in memory after the call.\nFor a pipeline model these are its sub-models, not the pipeline name.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "message": {
-                    "description": "Message is a short human-readable status (\"model loaded\", or an error).",
-                    "type": "string"
-                }
-            }
-        },
        "schema.ModelsDataResponse": {
            "type": "object",
            "properties": {
--- a/Show More
+++ b/Show More