chore(uv): add --index-strategy=unsafe-first-match to l4t

This is because the main index might not contain all the dependencies for torch.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-02-04 03:32:40 -05:00 · 2026-01-08 21:47:18 +00:00
38 changed files with 184 additions and 3477 deletions
--- a/11
+++ b/11
@@ -16,7 +16,7 @@ RUN apt-get update && \

 # The requirements-drivers target is for BUILD_TYPE specific items.  If you need to install something specific to CUDA, or specific to ROCM, it goes here.
 FROM requirements AS requirements-drivers
-ARG VULKAN_FROM_SOURCE=false
+
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=12
 ARG CUDA_MINOR_VERSION=0
@@ -41,7 +41,7 @@ RUN <<EOT bash
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils mesa-vulkan-drivers
-        if [ "amd64" = "$TARGETARCH" ] && [ "${VULKAN_FROM_SOURCE}" = "true" ]; then
+        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
            rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
@@ -59,11 +59,6 @@ RUN <<EOT bash
            cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
            cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
            rm -rf /opt/vulkan-sdk
-        elif [ "amd64" = "${TARGETARCH}}" ]; then
-            wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc && \
-            wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
-            apt-get update && \
-            apt-get install -y vulkan-sdk
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
            mkdir vulkan && cd vulkan && \
@@ -111,7 +106,7 @@ RUN <<EOT bash
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
+        if [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -46,7 +46,7 @@ RUN <<EOT bash
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
-        if [ "amd64" = "$TARGETARCH" ] && [ "${VULKAN_FROM_SOURCE}" = "true" ]; then
+        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
            rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
@@ -64,11 +64,6 @@ RUN <<EOT bash
            cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
            cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
            rm -rf /opt/vulkan-sdk
-        elif [ "amd64" = "${TARGETARCH}}" ]; then
-            wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc && \
-            wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
-            apt-get update && \
-            apt-get install -y vulkan-sdk
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
            mkdir vulkan && cd vulkan && \
@@ -99,11 +94,7 @@ RUN <<EOT bash
            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
        fi
        dpkg -i cuda-keyring_1.1-1_all.deb && \
        rm -f cuda-keyring_1.1-1_all.deb && \
@@ -115,7 +106,7 @@ RUN <<EOT bash
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
+        if [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -103,7 +103,7 @@ RUN <<EOT bash
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
-        if [ "amd64" = "$TARGETARCH" ] && [ "${VULKAN_FROM_SOURCE}" = "true" ]; then
+        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
            rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
@@ -121,11 +121,6 @@ RUN <<EOT bash
            cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
            cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
            rm -rf /opt/vulkan-sdk
-        elif [ "amd64" = "${TARGETARCH}}" ]; then
-            wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc && \
-            wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
-            apt-get update && \
-            apt-get install -y vulkan-sdk
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
            mkdir vulkan && cd vulkan && \
@@ -153,14 +148,11 @@ RUN <<EOT bash
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils
        if [ "amd64" = "$TARGETARCH" ]; then
+            echo https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
        fi
        dpkg -i cuda-keyring_1.1-1_all.deb && \
        rm -f cuda-keyring_1.1-1_all.deb && \
@@ -172,7 +164,7 @@ RUN <<EOT bash
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
+        if [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -60,7 +60,7 @@ RUN <<EOT bash
            git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
            ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
            clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
-        if [ "amd64" = "$TARGETARCH" ] && [ "${VULKAN_FROM_SOURCE}" = "true" ]; then
+        if [ "amd64" = "$TARGETARCH" ]; then
            wget "https://sdk.lunarg.com/sdk/download/1.4.328.1/linux/vulkansdk-linux-x86_64-1.4.328.1.tar.xz" && \
            tar -xf vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
            rm vulkansdk-linux-x86_64-1.4.328.1.tar.xz && \
@@ -78,11 +78,6 @@ RUN <<EOT bash
            cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/include/* /usr/include/ && \
            cp -rfv /opt/vulkan-sdk/1.4.328.1/x86_64/share/* /usr/share/ && \
            rm -rf /opt/vulkan-sdk
-        elif [ "amd64" = "${TARGETARCH}}" ]; then
-            wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc && \
-            wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list http://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
-            apt-get update && \
-            apt-get install -y vulkan-sdk
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
            mkdir vulkan && cd vulkan && \
@@ -113,11 +108,7 @@ RUN <<EOT bash
            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
        fi
        if [ "arm64" = "$TARGETARCH" ]; then
-            if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
-            else
-                curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
-            fi
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
        fi
        dpkg -i cuda-keyring_1.1-1_all.deb && \
        rm -f cuda-keyring_1.1-1_all.deb && \
@@ -129,7 +120,7 @@ RUN <<EOT bash
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
-        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
+        if [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=b1377188784f9aea26b8abde56d4aee8c733eec7
+LLAMA_VERSION?=ae9f8df77882716b1702df2bed8919499e64cc28
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -23,7 +23,6 @@
 #include <grpcpp/health_check_service_interface.h>
 #include <regex>
 #include <atomic>
-#include <mutex>
 #include <signal.h>
 #include <thread>

@@ -391,9 +390,8 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
    // Initialize fit_params options (can be overridden by options)
    // fit_params: whether to auto-adjust params to fit device memory (default: true as in llama.cpp)
    params.fit_params = true;
-    // fit_params_target: target margin per device in bytes (default: 1GB per device)
-    // Initialize as vector with default value for all devices
-    params.fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024 * 1024);
+    // fit_params_target: target margin per device in bytes (default: 1GB)
+    params.fit_params_target = 1024 * 1024 * 1024;
    // fit_params_min_ctx: minimum context size for fit (default: 4096)
    params.fit_params_min_ctx = 4096;

@@ -470,28 +468,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
        } else if (!strcmp(optname, "fit_params_target") || !strcmp(optname, "fit_target")) {
            if (optval != NULL) {
                try {
-                    // Value is in MiB, can be comma-separated list for multiple devices
-                    // Single value is broadcast across all devices
-                    std::string arg_next = optval_str;
-                    const std::regex regex{ R"([,/]+)" };
-                    std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
-                    std::vector<std::string> split_arg{ it, {} };
-                    if (split_arg.size() >= llama_max_devices()) {
-                        // Too many values provided
-                        continue;
-                    }
-                    if (split_arg.size() == 1) {
-                        // Single value: broadcast to all devices
-                        size_t value_mib = std::stoul(split_arg[0]);
-                        std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), value_mib * 1024 * 1024);
-                    } else {
-                        // Multiple values: set per device
-                        for (size_t i = 0; i < split_arg.size() && i < params.fit_params_target.size(); i++) {
-                            params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024 * 1024;
-                        }
-                    }
+                    // Value is in MiB, convert to bytes
+                    params.fit_params_target = static_cast<size_t>(std::stoi(optval_str)) * 1024 * 1024;
                } catch (const std::exception& e) {
-                    // If conversion fails, keep default value (1GB per device)
+                    // If conversion fails, keep default value (1GB)
                }
            }
        } else if (!strcmp(optname, "fit_params_min_ctx") || !strcmp(optname, "fit_ctx")) {
@@ -706,13 +686,13 @@ private:
 public:
    BackendServiceImpl(server_context& ctx) : ctx_server(ctx) {}

-    grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) override {
+    grpc::Status Health(ServerContext* /*context*/, const backend::HealthMessage* /*request*/, backend::Reply* reply) {
        // Implement Health RPC
        reply->set_message("OK");
        return Status::OK;
    }

-    grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) override {
+    grpc::Status LoadModel(ServerContext* /*context*/, const backend::ModelOptions* request, backend::Result* result) {
        // Implement LoadModel RPC
        common_params params;
        params_parse(ctx_server, request, params);
@@ -729,72 +709,11 @@ public:
        LOG_INF("\n");
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
        LOG_INF("\n");
-        
-        // Capture error messages during model loading
-        struct error_capture {
-            std::string captured_error;
-            std::mutex error_mutex;
-            ggml_log_callback original_callback;
-            void* original_user_data;
-        } error_capture_data;
-        
-        // Get original log callback
-        llama_log_get(&error_capture_data.original_callback, &error_capture_data.original_user_data);
-        
-        // Set custom callback to capture errors
-        llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
-            auto* capture = static_cast<error_capture*>(user_data);
-            
-            // Capture error messages
-            if (level == GGML_LOG_LEVEL_ERROR) {
-                std::lock_guard<std::mutex> lock(capture->error_mutex);
-                // Append error message, removing trailing newlines
-                std::string msg(text);
-                while (!msg.empty() && (msg.back() == '\n' || msg.back() == '\r')) {
-                    msg.pop_back();
-                }
-                if (!msg.empty()) {
-                    if (!capture->captured_error.empty()) {
-                        capture->captured_error.append("; ");
-                    }
-                    capture->captured_error.append(msg);
-                }
-            }
-            
-            // Also call original callback to preserve logging
-            if (capture->original_callback) {
-                capture->original_callback(level, text, capture->original_user_data);
-            }
-        }, &error_capture_data);
-        
        // load the model
-        bool load_success = ctx_server.load_model(params);
-        
-        // Restore original log callback
-        llama_log_set(error_capture_data.original_callback, error_capture_data.original_user_data);
-        
-        if (!load_success) {
-            std::string error_msg = "Failed to load model: " + params.model.path;
-            if (!params.mmproj.path.empty()) {
-                error_msg += " (with mmproj: " + params.mmproj.path + ")";
-            }
-            if (params.has_speculative() && !params.speculative.model.path.empty()) {
-                error_msg += " (with draft model: " + params.speculative.model.path + ")";
-            }
-            
-            // Add captured error details if available
-            {
-                std::lock_guard<std::mutex> lock(error_capture_data.error_mutex);
-                if (!error_capture_data.captured_error.empty()) {
-                    error_msg += ". Error: " + error_capture_data.captured_error;
-                } else {
-                    error_msg += ". Model file may not exist or be invalid.";
-                }
-            }
-            
-            result->set_message(error_msg);
+        if (!ctx_server.load_model(params)) {
+            result->set_message("Failed loading model");
            result->set_success(false);
-            return grpc::Status(grpc::StatusCode::INTERNAL, error_msg);
+            return Status::CANCELLED;
        }

        // Process grammar triggers now that vocab is available
@@ -1573,7 +1492,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) override {
+    grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
         if (params_base.model.path.empty()) {
             return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
         }
@@ -2244,7 +2163,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) override {
+    grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
        if (params_base.model.path.empty()) {
            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
        }
@@ -2339,7 +2258,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) override {
+    grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
        if (!params_base.embedding || params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
            return grpc::Status(grpc::StatusCode::UNIMPLEMENTED, "This server does not support reranking. Start it with `--reranking` and without `--embedding`");
        }
@@ -2425,7 +2344,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
+    grpc::Status TokenizeString(ServerContext* /*context*/, const backend::PredictOptions* request, backend::TokenizationResponse* response) {
        if (params_base.model.path.empty()) {
            return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
        }
@@ -2448,7 +2367,7 @@ public:
        return grpc::Status::OK;
    }

-    grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) override {
+    grpc::Status GetMetrics(ServerContext* /*context*/, const backend::MetricsRequest* /*request*/, backend::MetricsResponse* response) {

 // request slots data using task queue
        auto rd = ctx_server.get_response_reader();
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=0e52afc6513cc2dea9a1a017afc4a008d5acf2b0
+STABLEDIFFUSION_GGML_VERSION?=9be0b91927dfa4007d053df72dea7302990226bb

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/python/chatterbox/install.sh
+++ b/backend/python/chatterbox/install.sh
@@ -15,11 +15,14 @@ fi
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
-EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"

-if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
-    USE_PIP=true
+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
 fi

+EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"

 installRequirements
--- a/backend/python/chatterbox/requirements-install.txt
+++ b/backend/python/chatterbox/requirements-install.txt
@@ -1,5 +0,0 @@
-# Build dependencies needed for packages installed from source (e.g., git dependencies)
-# When using --no-build-isolation, these must be installed in the venv first
-wheel
-setuptools
-packaging
--- a/backend/python/diffusers/install.sh
+++ b/backend/python/diffusers/install.sh
@@ -16,10 +16,6 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi

-if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
-    USE_PIP=true
-fi
-
 # Use python 3.12 for l4t
 if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
  PYTHON_VERSION="3.12"
@@ -27,4 +23,11 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
  PY_STANDALONE_TAG="20251120"
 fi

+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
+
 installRequirements
--- a/backend/python/kokoro/install.sh
+++ b/backend/python/kokoro/install.sh
@@ -16,8 +16,11 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi

-if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
-    USE_PIP=true
+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
 fi

 installRequirements
--- a/backend/python/neutts/install.sh
+++ b/backend/python/neutts/install.sh
@@ -16,6 +16,13 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi

+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
+fi
+
 if [ "x${BUILD_TYPE}" == "xcublas" ] || [ "x${BUILD_TYPE}" == "xl4t" ]; then
    export CMAKE_ARGS="-DGGML_CUDA=on"
 fi
@@ -26,12 +33,6 @@ fi

 EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"

-
-if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
-    USE_PIP=true
-fi
-
-
 git clone https://github.com/neuphonic/neutts-air neutts-air

 cp -rfv neutts-air/neuttsair ./
--- a/backend/python/vibevoice/install.sh
+++ b/backend/python/vibevoice/install.sh
@@ -23,8 +23,11 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
  PY_STANDALONE_TAG="20251120"
 fi

-if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
-    USE_PIP=true
+# This is here because the jetson-ai-lab.io PyPI mirror's root PyPI endpoint (pypi.jetson-ai-lab.io/root/pypi/)
+# returns 503 errors when uv tries to fall back to it for packages not found in the specific subdirectory.
+# We need uv to continue falling through to the official PyPI index when it encounters these errors.
+if [ "x${BUILD_PROFILE}" == "xl4t12" ] || [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-first-match"
 fi

 installRequirements
--- a/core/gallery/backend_types.go
+++ b/core/gallery/backend_types.go
@@ -63,25 +63,6 @@ func (m *GalleryBackend) IsMeta() bool {
 	return len(m.CapabilitiesMap) > 0 && m.URI == ""
 }

-// IsCompatibleWith checks if the backend is compatible with the current system capability.
-// For meta backends, it checks if any of the capabilities in the map match the system capability.
-// For concrete backends, it delegates to SystemState.IsBackendCompatible.
-func (m *GalleryBackend) IsCompatibleWith(systemState *system.SystemState) bool {
-	if systemState == nil {
-		return true
-	}
-
-	// Meta backends are compatible if the system capability matches one of the keys
-	if m.IsMeta() {
-		capability := systemState.Capability(m.CapabilitiesMap)
-		_, exists := m.CapabilitiesMap[capability]
-		return exists
-	}
-
-	// For concrete backends, delegate to the system package
-	return systemState.IsBackendCompatible(m.Name, m.URI)
-}
-
 func (m *GalleryBackend) SetInstalled(installed bool) {
 	m.Installed = installed
 }
--- a/core/gallery/backends_test.go
+++ b/core/gallery/backends_test.go
@@ -172,252 +172,6 @@ var _ = Describe("Gallery Backends", func() {
 			Expect(nilMetaBackend.IsMeta()).To(BeFalse())
 		})

-		It("should check IsCompatibleWith correctly for meta backends", func() {
-			metaBackend := &GalleryBackend{
-				Metadata: Metadata{
-					Name: "meta-backend",
-				},
-				CapabilitiesMap: map[string]string{
-					"nvidia":  "nvidia-backend",
-					"amd":     "amd-backend",
-					"default": "default-backend",
-				},
-			}
-
-			// Test with nil state - should be compatible
-			Expect(metaBackend.IsCompatibleWith(nil)).To(BeTrue())
-
-			// Test with NVIDIA system - should be compatible (has nvidia key)
-			nvidiaState := &system.SystemState{GPUVendor: "nvidia", VRAM: 8 * 1024 * 1024 * 1024}
-			Expect(metaBackend.IsCompatibleWith(nvidiaState)).To(BeTrue())
-
-			// Test with default (no GPU) - should be compatible (has default key)
-			defaultState := &system.SystemState{}
-			Expect(metaBackend.IsCompatibleWith(defaultState)).To(BeTrue())
-		})
-
-		Describe("IsCompatibleWith for concrete backends", func() {
-			Context("CPU backends", func() {
-				It("should be compatible on all systems", func() {
-					cpuBackend := &GalleryBackend{
-						Metadata: Metadata{
-							Name: "cpu-llama-cpp",
-						},
-						URI: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp",
-					}
-					Expect(cpuBackend.IsCompatibleWith(&system.SystemState{})).To(BeTrue())
-					Expect(cpuBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-					Expect(cpuBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.AMD, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-				})
-			})
-
-			Context("Darwin/Metal backends", func() {
-				When("running on darwin", func() {
-					BeforeEach(func() {
-						if runtime.GOOS != "darwin" {
-							Skip("Skipping darwin-specific tests on non-darwin system")
-						}
-					})
-
-					It("should be compatible for MLX backend", func() {
-						mlxBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "mlx",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx",
-						}
-						Expect(mlxBackend.IsCompatibleWith(&system.SystemState{})).To(BeTrue())
-					})
-
-					It("should be compatible for metal-llama-cpp backend", func() {
-						metalBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "metal-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp",
-						}
-						Expect(metalBackend.IsCompatibleWith(&system.SystemState{})).To(BeTrue())
-					})
-				})
-
-				When("running on non-darwin", func() {
-					BeforeEach(func() {
-						if runtime.GOOS == "darwin" {
-							Skip("Skipping non-darwin-specific tests on darwin system")
-						}
-					})
-
-					It("should NOT be compatible for MLX backend", func() {
-						mlxBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "mlx",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx",
-						}
-						Expect(mlxBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
-					})
-
-					It("should NOT be compatible for metal-llama-cpp backend", func() {
-						metalBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "metal-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp",
-						}
-						Expect(metalBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
-					})
-				})
-			})
-
-			Context("NVIDIA/CUDA backends", func() {
-				When("running on non-darwin", func() {
-					BeforeEach(func() {
-						if runtime.GOOS == "darwin" {
-							Skip("Skipping CUDA tests on darwin system")
-						}
-					})
-
-					It("should NOT be compatible without nvidia GPU", func() {
-						cudaBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "cuda12-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp",
-						}
-						Expect(cudaBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
-						Expect(cudaBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.AMD, VRAM: 8 * 1024 * 1024 * 1024})).To(BeFalse())
-					})
-
-					It("should be compatible with nvidia GPU", func() {
-						cudaBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "cuda12-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp",
-						}
-						Expect(cudaBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-					})
-
-					It("should be compatible with cuda13 backend on nvidia GPU", func() {
-						cuda13Backend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "cuda13-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-llama-cpp",
-						}
-						Expect(cuda13Backend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-					})
-				})
-			})
-
-			Context("AMD/ROCm backends", func() {
-				When("running on non-darwin", func() {
-					BeforeEach(func() {
-						if runtime.GOOS == "darwin" {
-							Skip("Skipping AMD/ROCm tests on darwin system")
-						}
-					})
-
-					It("should NOT be compatible without AMD GPU", func() {
-						rocmBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "rocm-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp",
-						}
-						Expect(rocmBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
-						Expect(rocmBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeFalse())
-					})
-
-					It("should be compatible with AMD GPU", func() {
-						rocmBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "rocm-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp",
-						}
-						Expect(rocmBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.AMD, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-					})
-
-					It("should be compatible with hipblas backend on AMD GPU", func() {
-						hipBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "hip-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-hip-llama-cpp",
-						}
-						Expect(hipBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.AMD, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-					})
-				})
-			})
-
-			Context("Intel/SYCL backends", func() {
-				When("running on non-darwin", func() {
-					BeforeEach(func() {
-						if runtime.GOOS == "darwin" {
-							Skip("Skipping Intel/SYCL tests on darwin system")
-						}
-					})
-
-					It("should NOT be compatible without Intel GPU", func() {
-						intelBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "intel-sycl-f16-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp",
-						}
-						Expect(intelBackend.IsCompatibleWith(&system.SystemState{})).To(BeFalse())
-						Expect(intelBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Nvidia, VRAM: 8 * 1024 * 1024 * 1024})).To(BeFalse())
-					})
-
-					It("should be compatible with Intel GPU", func() {
-						intelBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "intel-sycl-f16-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp",
-						}
-						Expect(intelBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Intel, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-					})
-
-					It("should be compatible with intel-sycl-f32 backend on Intel GPU", func() {
-						intelF32Backend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "intel-sycl-f32-llama-cpp",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp",
-						}
-						Expect(intelF32Backend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Intel, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-					})
-
-					It("should be compatible with intel-transformers backend on Intel GPU", func() {
-						intelTransformersBackend := &GalleryBackend{
-							Metadata: Metadata{
-								Name: "intel-transformers",
-							},
-							URI: "quay.io/go-skynet/local-ai-backends:latest-intel-transformers",
-						}
-						Expect(intelTransformersBackend.IsCompatibleWith(&system.SystemState{GPUVendor: system.Intel, VRAM: 8 * 1024 * 1024 * 1024})).To(BeTrue())
-					})
-				})
-			})
-
-			Context("Vulkan backends", func() {
-				It("should be compatible on CPU-only systems", func() {
-					// Vulkan backends don't have a specific GPU vendor requirement in the current logic
-					// They are compatible if no other GPU-specific pattern matches
-					vulkanBackend := &GalleryBackend{
-						Metadata: Metadata{
-							Name: "vulkan-llama-cpp",
-						},
-						URI: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp",
-					}
-					// Vulkan doesn't have vendor-specific filtering in current implementation
-					Expect(vulkanBackend.IsCompatibleWith(&system.SystemState{})).To(BeTrue())
-				})
-			})
-		})
-
 		It("should find best backend from meta based on system capabilities", func() {

 			metaBackend := &GalleryBackend{
--- a/core/gallery/gallery.go
+++ b/core/gallery/gallery.go
@@ -226,16 +226,6 @@ func AvailableGalleryModels(galleries []config.Gallery, systemState *system.Syst

 // List available backends
 func AvailableBackends(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) {
-	return availableBackendsWithFilter(galleries, systemState, true)
-}
-
-// AvailableBackendsUnfiltered returns all available backends without filtering by system capability.
-func AvailableBackendsUnfiltered(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) {
-	return availableBackendsWithFilter(galleries, systemState, false)
-}
-
-// availableBackendsWithFilter is a helper function that lists available backends with optional filtering.
-func availableBackendsWithFilter(galleries []config.Gallery, systemState *system.SystemState, filterByCapability bool) (GalleryElements[*GalleryBackend], error) {
 	var backends []*GalleryBackend

 	systemBackends, err := ListSystemBackends(systemState)
@@ -251,17 +241,7 @@ func availableBackendsWithFilter(galleries []config.Gallery, systemState *system
 		if err != nil {
 			return nil, err
 		}
-
-		// Filter backends by system capability if requested
-		if filterByCapability {
-			for _, backend := range galleryBackends {
-				if backend.IsCompatibleWith(systemState) {
-					backends = append(backends, backend)
-				}
-			}
-		} else {
-			backends = append(backends, galleryBackends...)
-		}
+		backends = append(backends, galleryBackends...)
 	}

 	return backends, nil
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -205,7 +205,6 @@ func API(application *application.Application) (*echo.Echo, error) {

 	routes.RegisterLocalAIRoutes(e, requestExtractor, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.GalleryService(), opcache, application.TemplatesEvaluator(), application)
 	routes.RegisterOpenAIRoutes(e, requestExtractor, application)
-	routes.RegisterAnthropicRoutes(e, requestExtractor, application)
 	if !application.ApplicationConfig().DisableWebUI {
 		routes.RegisterUIAPIRoutes(e, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.GalleryService(), opcache, application)
 		routes.RegisterUIRoutes(e, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.GalleryService())
--- a/core/http/endpoints/anthropic/messages.go
+++ b/core/http/endpoints/anthropic/messages.go
@@ -1,537 +0,0 @@
-package anthropic
-
-import (
-	"encoding/json"
-	"fmt"
-
-	"github.com/google/uuid"
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/backend"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/middleware"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/core/templates"
-	"github.com/mudler/LocalAI/pkg/functions"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/xlog"
-)
-
-// MessagesEndpoint is the Anthropic Messages API endpoint
-// https://docs.anthropic.com/claude/reference/messages_post
-// @Summary Generate a message response for the given messages and model.
-// @Param request body schema.AnthropicRequest true "query params"
-// @Success 200 {object} schema.AnthropicResponse "Response"
-// @Router /v1/messages [post]
-func MessagesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
-	return func(c echo.Context) error {
-		id := uuid.New().String()
-
-		input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.AnthropicRequest)
-		if !ok || input.Model == "" {
-			return sendAnthropicError(c, 400, "invalid_request_error", "model is required")
-		}
-
-		cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
-		if !ok || cfg == nil {
-			return sendAnthropicError(c, 400, "invalid_request_error", "model configuration not found")
-		}
-
-		if input.MaxTokens <= 0 {
-			return sendAnthropicError(c, 400, "invalid_request_error", "max_tokens is required and must be greater than 0")
-		}
-
-		xlog.Debug("Anthropic Messages endpoint configuration read", "config", cfg)
-
-		// Convert Anthropic messages to OpenAI format for internal processing
-		openAIMessages := convertAnthropicToOpenAIMessages(input)
-
-		// Convert Anthropic tools to internal Functions format
-		funcs, shouldUseFn := convertAnthropicTools(input, cfg)
-
-		// Create an OpenAI-compatible request for internal processing
-		openAIReq := &schema.OpenAIRequest{
-			PredictionOptions: schema.PredictionOptions{
-				BasicModelRequest: schema.BasicModelRequest{Model: input.Model},
-				Temperature:       input.Temperature,
-				TopK:              input.TopK,
-				TopP:              input.TopP,
-				Maxtokens:         &input.MaxTokens,
-			},
-			Messages: openAIMessages,
-			Stream:   input.Stream,
-			Context:  input.Context,
-			Cancel:   input.Cancel,
-		}
-
-		// Set stop sequences
-		if len(input.StopSequences) > 0 {
-			openAIReq.Stop = input.StopSequences
-		}
-
-		// Merge config settings
-		if input.Temperature != nil {
-			cfg.Temperature = input.Temperature
-		}
-		if input.TopK != nil {
-			cfg.TopK = input.TopK
-		}
-		if input.TopP != nil {
-			cfg.TopP = input.TopP
-		}
-		cfg.Maxtokens = &input.MaxTokens
-		if len(input.StopSequences) > 0 {
-			cfg.StopWords = append(cfg.StopWords, input.StopSequences...)
-		}
-
-		// Template the prompt with tools if available
-		predInput := evaluator.TemplateMessages(*openAIReq, openAIReq.Messages, cfg, funcs, shouldUseFn)
-		xlog.Debug("Anthropic Messages - Prompt (after templating)", "prompt", predInput)
-
-		if input.Stream {
-			return handleAnthropicStream(c, id, input, cfg, ml, predInput, openAIReq, funcs, shouldUseFn)
-		}
-
-		return handleAnthropicNonStream(c, id, input, cfg, ml, predInput, openAIReq, funcs, shouldUseFn)
-	}
-}
-
-func handleAnthropicNonStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool) error {
-	images := []string{}
-	for _, m := range openAIReq.Messages {
-		images = append(images, m.StringImages...)
-	}
-
-	predFunc, err := backend.ModelInference(
-		input.Context, predInput, openAIReq.Messages, images, nil, nil, ml, cfg, nil, nil, nil, "", "", nil, nil, nil)
-	if err != nil {
-		xlog.Error("Anthropic model inference failed", "error", err)
-		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
-	}
-
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("Anthropic prediction failed", "error", err)
-		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
-	}
-
-	result := backend.Finetune(*cfg, predInput, prediction.Response)
-	
-	// Check if the result contains tool calls
-	toolCalls := functions.ParseFunctionCall(result, cfg.FunctionsConfig)
-	
-	var contentBlocks []schema.AnthropicContentBlock
-	var stopReason string
-	
-	if shouldUseFn && len(toolCalls) > 0 {
-		// Model wants to use tools
-		stopReason = "tool_use"
-		for _, tc := range toolCalls {
-			// Parse arguments as JSON
-			var inputArgs map[string]interface{}
-			if err := json.Unmarshal([]byte(tc.Arguments), &inputArgs); err != nil {
-				xlog.Warn("Failed to parse tool call arguments as JSON", "error", err, "args", tc.Arguments)
-				inputArgs = map[string]interface{}{"raw": tc.Arguments}
-			}
-			
-			contentBlocks = append(contentBlocks, schema.AnthropicContentBlock{
-				Type:  "tool_use",
-				ID:    fmt.Sprintf("toolu_%s_%d", id, len(contentBlocks)),
-				Name:  tc.Name,
-				Input: inputArgs,
-			})
-		}
-		
-		// Add any text content before the tool calls
-		textContent := functions.ParseTextContent(result, cfg.FunctionsConfig)
-		if textContent != "" {
-			// Prepend text block
-			contentBlocks = append([]schema.AnthropicContentBlock{{Type: "text", Text: textContent}}, contentBlocks...)
-		}
-	} else {
-		// Normal text response
-		stopReason = "end_turn"
-		contentBlocks = []schema.AnthropicContentBlock{
-			{Type: "text", Text: result},
-		}
-	}
-
-	resp := &schema.AnthropicResponse{
-		ID:         fmt.Sprintf("msg_%s", id),
-		Type:       "message",
-		Role:       "assistant",
-		Model:      input.Model,
-		StopReason: &stopReason,
-		Content:    contentBlocks,
-		Usage: schema.AnthropicUsage{
-			InputTokens:  prediction.Usage.Prompt,
-			OutputTokens: prediction.Usage.Completion,
-		},
-	}
-
-	if respData, err := json.Marshal(resp); err == nil {
-		xlog.Debug("Anthropic Response", "response", string(respData))
-	}
-
-	return c.JSON(200, resp)
-}
-
-func handleAnthropicStream(c echo.Context, id string, input *schema.AnthropicRequest, cfg *config.ModelConfig, ml *model.ModelLoader, predInput string, openAIReq *schema.OpenAIRequest, funcs functions.Functions, shouldUseFn bool) error {
-	c.Response().Header().Set("Content-Type", "text/event-stream")
-	c.Response().Header().Set("Cache-Control", "no-cache")
-	c.Response().Header().Set("Connection", "keep-alive")
-
-	// Create OpenAI messages for inference
-	openAIMessages := openAIReq.Messages
-
-	images := []string{}
-	for _, m := range openAIMessages {
-		images = append(images, m.StringImages...)
-	}
-
-	// Send message_start event
-	messageStart := schema.AnthropicStreamEvent{
-		Type: "message_start",
-		Message: &schema.AnthropicStreamMessage{
-			ID:      fmt.Sprintf("msg_%s", id),
-			Type:    "message",
-			Role:    "assistant",
-			Content: []schema.AnthropicContentBlock{},
-			Model:   input.Model,
-			Usage:   schema.AnthropicUsage{InputTokens: 0, OutputTokens: 0},
-		},
-	}
-	sendAnthropicSSE(c, messageStart)
-
-	// Track accumulated content for tool call detection
-	accumulatedContent := ""
-	currentBlockIndex := 0
-	inToolCall := false
-	toolCallsEmitted := 0
-	
-	// Send initial content_block_start event
-	contentBlockStart := schema.AnthropicStreamEvent{
-		Type:         "content_block_start",
-		Index:        currentBlockIndex,
-		ContentBlock: &schema.AnthropicContentBlock{Type: "text", Text: ""},
-	}
-	sendAnthropicSSE(c, contentBlockStart)
-
-	// Stream content deltas
-	tokenCallback := func(token string, usage backend.TokenUsage) bool {
-		accumulatedContent += token
-		
-		// If we're using functions, try to detect tool calls incrementally
-		if shouldUseFn {
-			cleanedResult := functions.CleanupLLMResult(accumulatedContent, cfg.FunctionsConfig)
-			
-			// Try parsing for tool calls
-			toolCalls := functions.ParseFunctionCall(cleanedResult, cfg.FunctionsConfig)
-			
-			// If we detected new tool calls and haven't emitted them yet
-			if len(toolCalls) > toolCallsEmitted {
-				// Stop the current text block if we were in one
-				if !inToolCall && currentBlockIndex == 0 {
-					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-						Type:  "content_block_stop",
-						Index: currentBlockIndex,
-					})
-					currentBlockIndex++
-					inToolCall = true
-				}
-				
-				// Emit new tool calls
-				for i := toolCallsEmitted; i < len(toolCalls); i++ {
-					tc := toolCalls[i]
-					
-					// Send content_block_start for tool_use
-					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-						Type:  "content_block_start",
-						Index: currentBlockIndex,
-						ContentBlock: &schema.AnthropicContentBlock{
-							Type: "tool_use",
-							ID:   fmt.Sprintf("toolu_%s_%d", id, i),
-							Name: tc.Name,
-						},
-					})
-					
-					// Send input_json_delta with the arguments
-					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-						Type:  "content_block_delta",
-						Index: currentBlockIndex,
-						Delta: &schema.AnthropicStreamDelta{
-							Type:        "input_json_delta",
-							PartialJSON: tc.Arguments,
-						},
-					})
-					
-					// Send content_block_stop
-					sendAnthropicSSE(c, schema.AnthropicStreamEvent{
-						Type:  "content_block_stop",
-						Index: currentBlockIndex,
-					})
-					
-					currentBlockIndex++
-				}
-				toolCallsEmitted = len(toolCalls)
-				return true
-			}
-		}
-		
-		// Send regular text delta if not in tool call mode
-		if !inToolCall {
-			delta := schema.AnthropicStreamEvent{
-				Type:  "content_block_delta",
-				Index: 0,
-				Delta: &schema.AnthropicStreamDelta{
-					Type: "text_delta",
-					Text: token,
-				},
-			}
-			sendAnthropicSSE(c, delta)
-		}
-		return true
-	}
-
-	predFunc, err := backend.ModelInference(
-		input.Context, predInput, openAIMessages, images, nil, nil, ml, cfg, nil, nil, tokenCallback, "", "", nil, nil, nil)
-	if err != nil {
-		xlog.Error("Anthropic stream model inference failed", "error", err)
-		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
-	}
-
-	prediction, err := predFunc()
-	if err != nil {
-		xlog.Error("Anthropic stream prediction failed", "error", err)
-		return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
-	}
-
-	// Send content_block_stop event for last block if we didn't close it yet
-	if !inToolCall {
-		contentBlockStop := schema.AnthropicStreamEvent{
-			Type:  "content_block_stop",
-			Index: 0,
-		}
-		sendAnthropicSSE(c, contentBlockStop)
-	}
-
-	// Determine stop reason
-	stopReason := "end_turn"
-	if toolCallsEmitted > 0 {
-		stopReason = "tool_use"
-	}
-
-	// Send message_delta event with stop_reason
-	messageDelta := schema.AnthropicStreamEvent{
-		Type: "message_delta",
-		Delta: &schema.AnthropicStreamDelta{
-			StopReason: &stopReason,
-		},
-		Usage: &schema.AnthropicUsage{
-			OutputTokens: prediction.Usage.Completion,
-		},
-	}
-	sendAnthropicSSE(c, messageDelta)
-
-	// Send message_stop event
-	messageStop := schema.AnthropicStreamEvent{
-		Type: "message_stop",
-	}
-	sendAnthropicSSE(c, messageStop)
-
-	return nil
-}
-
-func sendAnthropicSSE(c echo.Context, event schema.AnthropicStreamEvent) {
-	data, err := json.Marshal(event)
-	if err != nil {
-		xlog.Error("Failed to marshal SSE event", "error", err)
-		return
-	}
-	fmt.Fprintf(c.Response().Writer, "event: %s\ndata: %s\n\n", event.Type, string(data))
-	c.Response().Flush()
-}
-
-func sendAnthropicError(c echo.Context, statusCode int, errorType, message string) error {
-	resp := schema.AnthropicErrorResponse{
-		Type: "error",
-		Error: schema.AnthropicError{
-			Type:    errorType,
-			Message: message,
-		},
-	}
-	return c.JSON(statusCode, resp)
-}
-
-func convertAnthropicToOpenAIMessages(input *schema.AnthropicRequest) []schema.Message {
-	var messages []schema.Message
-
-	// Add system message if present
-	if input.System != "" {
-		messages = append(messages, schema.Message{
-			Role:          "system",
-			StringContent: input.System,
-			Content:       input.System,
-		})
-	}
-
-	// Convert Anthropic messages to OpenAI format
-	for _, msg := range input.Messages {
-		openAIMsg := schema.Message{
-			Role: msg.Role,
-		}
-
-		// Handle content (can be string or array of content blocks)
-		switch content := msg.Content.(type) {
-		case string:
-			openAIMsg.StringContent = content
-			openAIMsg.Content = content
-		case []interface{}:
-			// Handle array of content blocks
-			var textContent string
-			var stringImages []string
-			var toolCalls []schema.ToolCall
-			toolCallIndex := 0
-
-			for _, block := range content {
-				if blockMap, ok := block.(map[string]interface{}); ok {
-					blockType, _ := blockMap["type"].(string)
-					switch blockType {
-					case "text":
-						if text, ok := blockMap["text"].(string); ok {
-							textContent += text
-						}
-					case "image":
-						// Handle image content
-						if source, ok := blockMap["source"].(map[string]interface{}); ok {
-							if sourceType, ok := source["type"].(string); ok && sourceType == "base64" {
-								if data, ok := source["data"].(string); ok {
-									mediaType, _ := source["media_type"].(string)
-									// Format as data URI
-									dataURI := fmt.Sprintf("data:%s;base64,%s", mediaType, data)
-									stringImages = append(stringImages, dataURI)
-								}
-							}
-						}
-					case "tool_use":
-						// Convert tool_use to ToolCall format
-						toolID, _ := blockMap["id"].(string)
-						toolName, _ := blockMap["name"].(string)
-						toolInput := blockMap["input"]
-						
-						// Serialize input to JSON string
-						inputJSON, err := json.Marshal(toolInput)
-						if err != nil {
-							xlog.Warn("Failed to marshal tool input", "error", err)
-							inputJSON = []byte("{}")
-						}
-						
-						toolCalls = append(toolCalls, schema.ToolCall{
-							Index: toolCallIndex,
-							ID:    toolID,
-							Type:  "function",
-							FunctionCall: schema.FunctionCall{
-								Name:      toolName,
-								Arguments: string(inputJSON),
-							},
-						})
-						toolCallIndex++
-					case "tool_result":
-						// Convert tool_result to a message with role "tool"
-						// This is handled by creating a separate message after this block
-						// For now, we'll add it as text content
-						toolUseID, _ := blockMap["tool_use_id"].(string)
-						isError := false
-						if isErrorPtr, ok := blockMap["is_error"].(*bool); ok && isErrorPtr != nil {
-							isError = *isErrorPtr
-						}
-						
-						var resultText string
-						if resultContent, ok := blockMap["content"]; ok {
-							switch rc := resultContent.(type) {
-							case string:
-								resultText = rc
-							case []interface{}:
-								// Array of content blocks
-								for _, cb := range rc {
-									if cbMap, ok := cb.(map[string]interface{}); ok {
-										if cbMap["type"] == "text" {
-											if text, ok := cbMap["text"].(string); ok {
-												resultText += text
-											}
-										}
-									}
-								}
-							}
-						}
-						
-						// Add tool result as a tool role message
-						// We need to handle this differently - create a new message
-						if msg.Role == "user" {
-							// Store tool result info for creating separate message
-							prefix := ""
-							if isError {
-								prefix = "Error: "
-							}
-							textContent += fmt.Sprintf("\n[Tool Result for %s]: %s%s", toolUseID, prefix, resultText)
-						}
-					}
-				}
-			}
-			openAIMsg.StringContent = textContent
-			openAIMsg.Content = textContent
-			openAIMsg.StringImages = stringImages
-			
-			// Add tool calls if present
-			if len(toolCalls) > 0 {
-				openAIMsg.ToolCalls = toolCalls
-			}
-		}
-
-		messages = append(messages, openAIMsg)
-	}
-
-	return messages
-}
-
-// convertAnthropicTools converts Anthropic tools to internal Functions format
-func convertAnthropicTools(input *schema.AnthropicRequest, cfg *config.ModelConfig) (functions.Functions, bool) {
-	if len(input.Tools) == 0 {
-		return nil, false
-	}
-	
-	var funcs functions.Functions
-	for _, tool := range input.Tools {
-		f := functions.Function{
-			Name:        tool.Name,
-			Description: tool.Description,
-			Parameters:  tool.InputSchema,
-		}
-		funcs = append(funcs, f)
-	}
-	
-	// Handle tool_choice
-	if input.ToolChoice != nil {
-		switch tc := input.ToolChoice.(type) {
-		case string:
-			// "auto", "any", or "none"
-			if tc == "any" {
-				// Force the model to use one of the tools
-				cfg.SetFunctionCallString("required")
-			} else if tc == "none" {
-				// Don't use tools
-				return nil, false
-			}
-			// "auto" is the default - let model decide
-		case map[string]interface{}:
-			// Specific tool selection: {"type": "tool", "name": "tool_name"}
-			if tcType, ok := tc["type"].(string); ok && tcType == "tool" {
-				if name, ok := tc["name"].(string); ok {
-					// Force specific tool
-					cfg.SetFunctionCallString(name)
-				}
-			}
-		}
-	}
-	
-	return funcs, len(funcs) > 0 && cfg.ShouldUseFunctions()
-}
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -3,7 +3,6 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
-	"strings"
 	"time"

 	"github.com/google/uuid"
@@ -35,54 +34,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			Created: created,
 			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
 			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant"}, Index: 0, FinishReason: nil}},
+			Object:  "chat.completion.chunk",
 		}
 		responses <- initialMessage

-		// Track accumulated content for reasoning extraction
-		accumulatedContent := ""
-		lastEmittedReasoning := ""
-		lastEmittedCleanedContent := ""
-
 		_, _, err := ComputeChoices(req, s, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
-			accumulatedContent += s
-			// Extract reasoning from accumulated content
-			currentReasoning, cleanedContent := functions.ExtractReasoning(accumulatedContent)
-
-			// Calculate new reasoning delta (what we haven't emitted yet)
-			var reasoningDelta *string
-			if currentReasoning != lastEmittedReasoning {
-				// Extract only the new part
-				if len(currentReasoning) > len(lastEmittedReasoning) && strings.HasPrefix(currentReasoning, lastEmittedReasoning) {
-					newReasoning := currentReasoning[len(lastEmittedReasoning):]
-					reasoningDelta = &newReasoning
-					lastEmittedReasoning = currentReasoning
-				} else if currentReasoning != "" {
-					// If reasoning changed in a non-append way, emit the full current reasoning
-					reasoningDelta = &currentReasoning
-					lastEmittedReasoning = currentReasoning
-				}
-			}
-
-			// Calculate content delta from cleaned content
-			var deltaContent string
-			if len(cleanedContent) > len(lastEmittedCleanedContent) && strings.HasPrefix(cleanedContent, lastEmittedCleanedContent) {
-				deltaContent = cleanedContent[len(lastEmittedCleanedContent):]
-				lastEmittedCleanedContent = cleanedContent
-			} else if cleanedContent != lastEmittedCleanedContent {
-				// If cleaned content changed but not in a simple append, extract delta from cleaned content
-				// This handles cases where thinking tags are removed mid-stream
-				if lastEmittedCleanedContent == "" {
-					deltaContent = cleanedContent
-					lastEmittedCleanedContent = cleanedContent
-				} else {
-					// Content changed in non-append way, use the new cleaned content
-					deltaContent = cleanedContent
-					lastEmittedCleanedContent = cleanedContent
-				}
-			}
-			// Only emit content if there's actual content (not just thinking tags)
-			// If deltaContent is empty, we still emit the response but with empty content
-
 			usage := schema.OpenAIUsage{
 				PromptTokens:     tokenUsage.Prompt,
 				CompletionTokens: tokenUsage.Completion,
@@ -93,20 +49,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 				usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 			}

-			delta := &schema.Message{}
-			// Only include content if there's actual content (not just thinking tags)
-			if deltaContent != "" {
-				delta.Content = &deltaContent
-			}
-			if reasoningDelta != nil && *reasoningDelta != "" {
-				delta.Reasoning = reasoningDelta
-			}
-
 			resp := schema.OpenAIResponse{
 				ID:      id,
 				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
+				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0, FinishReason: nil}},
 				Object:  "chat.completion.chunk",
 				Usage:   usage,
 			}
@@ -229,10 +176,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		if err != nil {
 			return err
 		}
-		// Extract reasoning before processing tool calls
-		reasoning, cleanedResult := functions.ExtractReasoning(result)
-		result = cleanedResult
-
 		textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
 		result = functions.CleanupLLMResult(result, config.FunctionsConfig)
 		functionResults := functions.ParseFunctionCall(result, config.FunctionsConfig)
@@ -265,20 +208,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 				usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 			}

-			var deltaReasoning *string
-			if reasoning != "" {
-				deltaReasoning = &reasoning
-			}
-			delta := &schema.Message{Content: &result}
-			if deltaReasoning != nil {
-				delta.Reasoning = deltaReasoning
-			}
-
 			resp := schema.OpenAIResponse{
 				ID:      id,
 				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []schema.Choice{{Delta: delta, Index: 0, FinishReason: nil}},
+				Choices: []schema.Choice{{Delta: &schema.Message{Content: &result}, Index: 0, FinishReason: nil}},
 				Object:  "chat.completion.chunk",
 				Usage:   usage,
 			}
@@ -619,18 +553,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		default:

 			tokenCallback := func(s string, c *[]schema.Choice) {
-				// Extract reasoning from the response
-				reasoning, cleanedS := functions.ExtractReasoning(s)
-				s = cleanedS
-
 				if !shouldUseFn {
 					// no function is called, just reply and use stop as finish reason
 					stopReason := FinishReasonStop
-					message := &schema.Message{Role: "assistant", Content: &s}
-					if reasoning != "" {
-						message.Reasoning = &reasoning
-					}
-					*c = append(*c, schema.Choice{FinishReason: &stopReason, Index: 0, Message: message})
+					*c = append(*c, schema.Choice{FinishReason: &stopReason, Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
 					return
 				}

@@ -649,13 +575,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					}

 					stopReason := FinishReasonStop
-					message := &schema.Message{Role: "assistant", Content: &result}
-					if reasoning != "" {
-						message.Reasoning = &reasoning
-					}
 					*c = append(*c, schema.Choice{
 						FinishReason: &stopReason,
-						Message:      message})
+						Message:      &schema.Message{Role: "assistant", Content: &result}})
 				default:
 					toolCallsReason := FinishReasonToolCalls
 					toolChoice := schema.Choice{
@@ -664,9 +586,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 							Role: "assistant",
 						},
 					}
-					if reasoning != "" {
-						toolChoice.Message.Reasoning = &reasoning
-					}

 					for _, ss := range results {
 						name, args := ss.Name, ss.Arguments
@@ -687,20 +606,16 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 						} else {
 							// otherwise we return more choices directly (deprecated)
 							functionCallReason := FinishReasonFunctionCall
-							message := &schema.Message{
-								Role:    "assistant",
-								Content: &textContentToReturn,
-								FunctionCall: map[string]interface{}{
-									"name":      name,
-									"arguments": args,
-								},
-							}
-							if reasoning != "" {
-								message.Reasoning = &reasoning
-							}
 							*c = append(*c, schema.Choice{
 								FinishReason: &functionCallReason,
-								Message:      message,
+								Message: &schema.Message{
+									Role:    "assistant",
+									Content: &textContentToReturn,
+									FunctionCall: map[string]interface{}{
+										"name":      name,
+										"arguments": args,
+									},
+								},
 							})
 						}
 					}
--- a/core/http/routes/anthropic.go
+++ b/core/http/routes/anthropic.go
@@ -1,108 +0,0 @@
-package routes
-
-import (
-	"context"
-	"fmt"
-	"net/http"
-
-	"github.com/google/uuid"
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/application"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/endpoints/anthropic"
-	"github.com/mudler/LocalAI/core/http/middleware"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/xlog"
-)
-
-func RegisterAnthropicRoutes(app *echo.Echo,
-	re *middleware.RequestExtractor,
-	application *application.Application) {
-
-	// Anthropic Messages API endpoint
-	messagesHandler := anthropic.MessagesEndpoint(
-		application.ModelConfigLoader(),
-		application.ModelLoader(),
-		application.TemplatesEvaluator(),
-		application.ApplicationConfig(),
-	)
-
-	messagesMiddleware := []echo.MiddlewareFunc{
-		middleware.TraceMiddleware(application),
-		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
-		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.AnthropicRequest) }),
-		setAnthropicRequestContext(application.ApplicationConfig()),
-	}
-
-	// Main Anthropic endpoint
-	app.POST("/v1/messages", messagesHandler, messagesMiddleware...)
-
-	// Also support without version prefix for compatibility
-	app.POST("/messages", messagesHandler, messagesMiddleware...)
-}
-
-// setAnthropicRequestContext sets up the context and cancel function for Anthropic requests
-func setAnthropicRequestContext(appConfig *config.ApplicationConfig) echo.MiddlewareFunc {
-	return func(next echo.HandlerFunc) echo.HandlerFunc {
-		return func(c echo.Context) error {
-			input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.AnthropicRequest)
-			if !ok || input.Model == "" {
-				return echo.NewHTTPError(http.StatusBadRequest, "model is required")
-			}
-
-			cfg, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
-			if !ok || cfg == nil {
-				return echo.NewHTTPError(http.StatusBadRequest, "model configuration not found")
-			}
-
-			// Extract or generate the correlation ID
-			// Anthropic uses x-request-id header
-			correlationID := c.Request().Header.Get("x-request-id")
-			if correlationID == "" {
-				correlationID = uuid.New().String()
-			}
-			c.Response().Header().Set("x-request-id", correlationID)
-
-			// Set up context with cancellation
-			reqCtx := c.Request().Context()
-			c1, cancel := context.WithCancel(appConfig.Context)
-
-			// Cancel when request context is cancelled (client disconnects)
-			go func() {
-				select {
-				case <-reqCtx.Done():
-					cancel()
-				case <-c1.Done():
-					// Already cancelled
-				}
-			}()
-
-			// Add the correlation ID to the new context
-			ctxWithCorrelationID := context.WithValue(c1, middleware.CorrelationIDKey, correlationID)
-
-			input.Context = ctxWithCorrelationID
-			input.Cancel = cancel
-
-			if cfg.Model == "" {
-				xlog.Debug("replacing empty cfg.Model with input value", "input.Model", input.Model)
-				cfg.Model = input.Model
-			}
-
-			c.Set(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST, input)
-			c.Set(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG, cfg)
-
-			// Log the Anthropic API version if provided
-			anthropicVersion := c.Request().Header.Get("anthropic-version")
-			if anthropicVersion != "" {
-				xlog.Debug("Anthropic API version", "version", anthropicVersion)
-			}
-
-			// Validate max_tokens is provided
-			if input.MaxTokens <= 0 {
-				return echo.NewHTTPError(http.StatusBadRequest, fmt.Sprintf("max_tokens is required and must be greater than 0"))
-			}
-
-			return next(c)
-		}
-	}
-}
--- a/core/http/routes/ui_api.go
+++ b/core/http/routes/ui_api.go
@@ -617,12 +617,6 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 			installedBackendsCount = len(installedBackends)
 		}

-		// Get the detected system capability
-		detectedCapability := ""
-		if appConfig.SystemState != nil {
-			detectedCapability = appConfig.SystemState.DetectedCapability()
-		}
-
 		return c.JSON(200, map[string]interface{}{
 			"backends":           backendsJSON,
 			"repositories":       appConfig.BackendGalleries,
@@ -635,7 +629,6 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
 			"totalPages":         totalPages,
 			"prevPage":           prevPage,
 			"nextPage":           nextPage,
-			"systemCapability":   detectedCapability,
 		})
 	})

--- a/core/http/static/chat.js
+++ b/core/http/static/chat.js
@@ -1368,7 +1368,6 @@ async function promptGPT(systemPrompt, input) {
    let lastAssistantMessageIndex = -1;
    let lastThinkingMessageIndex = -1;
    let lastThinkingScrollTime = 0;
-    let hasReasoningFromAPI = false; // Track if we're receiving reasoning from API (skip tag-based detection)
    const THINKING_SCROLL_THROTTLE = 200; // Throttle scrolling to every 200ms

    try {
@@ -1402,24 +1401,19 @@ async function promptGPT(systemPrompt, input) {
              // Handle different event types
              switch (eventData.type) {
                case "reasoning":
-                  hasReasoningFromAPI = true; // Mark that we're receiving reasoning from API
                  if (eventData.content) {
-                    const currentChat = chatStore.getChat(chatId);
-                    if (!currentChat) break; // Chat was deleted
-                    const isMCPMode = currentChat.mcpMode || false;
-                    const shouldExpand = !isMCPMode; // Expanded in non-MCP mode, collapsed in MCP mode
-                    // Insert thinking before assistant message if it exists (always use "thinking" role)
+                    // Insert reasoning before assistant message if it exists
                    if (lastAssistantMessageIndex >= 0 && targetHistory[lastAssistantMessageIndex]?.role === "assistant") {
                      targetHistory.splice(lastAssistantMessageIndex, 0, {
-                        role: "thinking",
+                        role: "reasoning",
                        content: eventData.content,
                        html: DOMPurify.sanitize(marked.parse(eventData.content)),
                        image: [],
                        audio: [],
-                        expanded: shouldExpand
+                        expanded: false // Reasoning is always collapsed
                      });
                      lastAssistantMessageIndex++; // Adjust index since we inserted
-                      // Scroll smoothly after adding thinking
+                      // Scroll smoothly after adding reasoning
                      setTimeout(() => {
                        const chatContainer = document.getElementById('chat');
                        if (chatContainer) {
@@ -1431,7 +1425,7 @@ async function promptGPT(systemPrompt, input) {
                      }, 100);
                    } else {
                      // No assistant message yet, just add normally
-                      chatStore.add("thinking", eventData.content, null, null, chatId);
+                      chatStore.add("reasoning", eventData.content, null, null, chatId);
                    }
                  }
                  break;
@@ -1497,17 +1491,14 @@ async function promptGPT(systemPrompt, input) {
                    // Only update display if this is the active chat (interval will handle it)
                    // Don't call updateTokensPerSecond here to avoid unnecessary updates
                    
-                    // Only check for thinking tags if we're NOT receiving reasoning from API
-                    // This prevents duplicate thinking/reasoning messages
-                    if (!hasReasoningFromAPI) {
-                      // Check for thinking tags in the chunk (incremental detection)
-                      if (contentChunk.includes("<thinking>") || contentChunk.includes("<think>")) {
-                        isThinking = true;
-                        thinkingContent = "";
-                        lastThinkingMessageIndex = -1;
-                      }
-                      
-                      if (contentChunk.includes("</thinking>") || contentChunk.includes("</think>")) {
+                    // Check for thinking tags in the chunk (incremental detection)
+                    if (contentChunk.includes("<thinking>") || contentChunk.includes("<think>")) {
+                      isThinking = true;
+                      thinkingContent = "";
+                      lastThinkingMessageIndex = -1;
+                    }
+                    
+                    if (contentChunk.includes("</thinking>") || contentChunk.includes("</think>")) {
                      isThinking = false;
                      // When closing tag is detected, process the accumulated thinking content
                      if (thinkingContent.trim()) {
@@ -1561,11 +1552,10 @@ async function promptGPT(systemPrompt, input) {
                        }
                        thinkingContent = "";
                      }
-                      }
                    }
                    
-                    // Handle content based on thinking state (only if not receiving reasoning from API)
-                    if (!hasReasoningFromAPI && isThinking) {
+                    // Handle content based on thinking state
+                    if (isThinking) {
                      thinkingContent += contentChunk;
                      const currentChat = chatStore.getChat(chatId);
                      if (!currentChat) break; // Chat was deleted
@@ -1647,10 +1637,7 @@ async function promptGPT(systemPrompt, input) {
          
          // Process any thinking tags that might be in the accumulated content
          // This handles cases where tags are split across chunks
-          // Only process if we're NOT receiving reasoning from API (to avoid duplicates)
-          const { regularContent: processedRegular, thinkingContent: processedThinking } = hasReasoningFromAPI
-            ? { regularContent: regularContent, thinkingContent: "" }
-            : processThinkingTags(regularContent);
+          const { regularContent: processedRegular, thinkingContent: processedThinking } = processThinkingTags(regularContent);
          
          // Update or create assistant message with processed regular content
          const currentChat = chatStore.getChat(chatId);
@@ -1658,10 +1645,10 @@ async function promptGPT(systemPrompt, input) {
          const request = activeRequests.get(chatId);
          const requestModel = request?.model || null;
          if (lastAssistantMessageIndex === -1) {
-            // Create assistant message if we have any content (even if empty string after processing)
-            // This ensures the message is created and can be updated with more content later
-            chatStore.add("assistant", processedRegular || "", null, null, chatId, requestModel);
-            lastAssistantMessageIndex = targetHistory.length - 1;
+            if (processedRegular && processedRegular.trim()) {
+              chatStore.add("assistant", processedRegular, null, null, chatId, requestModel);
+              lastAssistantMessageIndex = targetHistory.length - 1;
+            }
          } else {
            const lastMessage = targetHistory[lastAssistantMessageIndex];
            if (lastMessage && lastMessage.role === "assistant") {
@@ -1699,10 +1686,7 @@ async function promptGPT(systemPrompt, input) {
      if (assistantContentBuffer.length > 0) {
        const regularContent = assistantContentBuffer.join("");
        // Process any remaining thinking tags that might be in the buffer
-        // Only process if we're NOT receiving reasoning from API (to avoid duplicates)
-        const { regularContent: processedRegular, thinkingContent: processedThinking } = hasReasoningFromAPI
-          ? { regularContent: regularContent, thinkingContent: "" }
-          : processThinkingTags(regularContent);
+        const { regularContent: processedRegular, thinkingContent: processedThinking } = processThinkingTags(regularContent);
        
        const currentChat = chatStore.getChat(chatId);
        if (!currentChat) {
@@ -1735,26 +1719,23 @@ async function promptGPT(systemPrompt, input) {
        }
        
        // Then update or create assistant message
-        // Always create/update assistant message if we have any content
        if (lastAssistantMessageIndex !== -1) {
          const lastMessage = targetHistory[lastAssistantMessageIndex];
          if (lastMessage && lastMessage.role === "assistant") {
            lastMessage.content = (lastMessage.content || "") + (processedRegular || "");
            lastMessage.html = DOMPurify.sanitize(marked.parse(lastMessage.content));
          }
-        } else {
-          // Create assistant message (even if empty, so it can be updated with more content)
+        } else if (processedRegular && processedRegular.trim()) {
          const request = activeRequests.get(chatId);
          const requestModel = request?.model || null;
-          chatStore.add("assistant", processedRegular || "", null, null, chatId, requestModel);
+          chatStore.add("assistant", processedRegular, null, null, chatId, requestModel);
          lastAssistantMessageIndex = targetHistory.length - 1;
        }
      }
      
      // Final thinking content flush if any data remains (from incremental detection)
-      // Only process if we're NOT receiving reasoning from API (to avoid duplicates)
      const finalChat = chatStore.getChat(chatId);
-      if (finalChat && !hasReasoningFromAPI && thinkingContent.trim() && lastThinkingMessageIndex === -1) {
+      if (finalChat && thinkingContent.trim() && lastThinkingMessageIndex === -1) {
        const finalHistory = finalChat.history;
        // Extract thinking content if tags are present
        const thinkingMatch = thinkingContent.match(/<(?:thinking|redacted_reasoning)>(.*?)<\/(?:thinking|redacted_reasoning)>/s);
@@ -1910,13 +1891,9 @@ async function promptGPT(systemPrompt, input) {
    let buffer = "";
    let contentBuffer = [];
    let thinkingContent = "";
-    let reasoningContent = ""; // Track reasoning from API reasoning field
    let isThinking = false;
    let lastThinkingMessageIndex = -1;
-    let lastReasoningMessageIndex = -1; // Track reasoning message separately
-    let lastAssistantMessageIndex = -1; // Track assistant message for reasoning placement
    let lastThinkingScrollTime = 0;
-    let hasReasoningFromAPI = false; // Track if we're receiving reasoning from API (skip tag-based detection)
    const THINKING_SCROLL_THROTTLE = 200; // Throttle scrolling to every 200ms

    try {
@@ -1952,100 +1929,30 @@ async function promptGPT(systemPrompt, input) {
                chatStore.updateTokenUsage(jsonData.usage, chatId);
              }
              
-              const token = jsonData.choices?.[0]?.delta?.content;
-              const reasoningDelta = jsonData.choices?.[0]?.delta?.reasoning;
+              const token = jsonData.choices?.[0]?.delta?.content; // tolerate chunks lacking choices/delta instead of throwing into the catch below

-              // Handle reasoning from API reasoning field - always use "thinking" role
-              if (reasoningDelta && reasoningDelta.trim() !== "") {
-                hasReasoningFromAPI = true; // Mark that we're receiving reasoning from API
-                reasoningContent += reasoningDelta;
-                const currentChat = chatStore.getChat(chatId);
-                if (!currentChat) {
-                  // Chat was deleted, skip this line
+              if (token) {
+                // Check for thinking tags
+                if (token.includes("<thinking>") || token.includes("<think>")) {
+                  isThinking = true;
+                  thinkingContent = "";
+                  lastThinkingMessageIndex = -1;
                  return;
                }
-                const isMCPMode = currentChat.mcpMode || false;
-                const shouldExpand = !isMCPMode; // Expanded in non-MCP mode, collapsed in MCP mode
-                
-                // Only create/update thinking message if we have actual content
-                if (reasoningContent.trim() !== "") {
-                  // Update or create thinking message (always use "thinking" role, not "reasoning")
-                  if (lastReasoningMessageIndex === -1) {
-                    // Find the last assistant message index to insert thinking before it
-                    const targetHistory = currentChat.history;
-                    const assistantIndex = targetHistory.length - 1;
-                    if (assistantIndex >= 0 && targetHistory[assistantIndex]?.role === "assistant") {
-                      // Insert thinking before assistant message
-                      targetHistory.splice(assistantIndex, 0, {
-                        role: "thinking",
-                        content: reasoningContent,
-                        html: DOMPurify.sanitize(marked.parse(reasoningContent)),
-                        image: [],
-                        audio: [],
-                        expanded: shouldExpand
-                      });
-                      lastReasoningMessageIndex = assistantIndex;
-                      lastAssistantMessageIndex = assistantIndex + 1; // Adjust for inserted thinking
-                    } else {
-                      // No assistant message yet, just add normally
-                      chatStore.add("thinking", reasoningContent, null, null, chatId);
-                      lastReasoningMessageIndex = currentChat.history.length - 1;
-                    }
-                  } else {
-                    // Update existing thinking message
-                    const targetHistory = currentChat.history;
-                    if (lastReasoningMessageIndex >= 0 && lastReasoningMessageIndex < targetHistory.length) {
-                      const thinkingMessage = targetHistory[lastReasoningMessageIndex];
-                      if (thinkingMessage && thinkingMessage.role === "thinking") {
-                        thinkingMessage.content = reasoningContent;
-                        thinkingMessage.html = DOMPurify.sanitize(marked.parse(reasoningContent));
-                      }
+                if (token.includes("</thinking>") || token.includes("</think>")) {
+                  isThinking = false;
+                  if (thinkingContent.trim()) {
+                    // Only add the final thinking message if we don't already have one
+                    if (lastThinkingMessageIndex === -1) {
+                      chatStore.add("thinking", thinkingContent, null, null, chatId);
                    }
                  }
+                  return;
                }
-                
-                // Scroll when reasoning is updated (throttled)
-                const now = Date.now();
-                if (now - lastThinkingScrollTime > THINKING_SCROLL_THROTTLE) {
-                  lastThinkingScrollTime = now;
-                  setTimeout(() => {
-                    const chatContainer = document.getElementById('chat');
-                    if (chatContainer) {
-                      chatContainer.scrollTo({
-                        top: chatContainer.scrollHeight,
-                        behavior: 'smooth'
-                      });
-                    }
-                    scrollThinkingBoxToBottom();
-                  }, 100);
-                }
-              }

-              if (token && token.trim() !== "") {
-                // Only check for thinking tags if we're NOT receiving reasoning from API
-                // This prevents duplicate thinking/reasoning messages
-                if (!hasReasoningFromAPI) {
-                  // Check for thinking tags (legacy support - models that output tags directly)
-                  if (token.includes("<thinking>") || token.includes("<think>")) {
-                    isThinking = true;
-                    thinkingContent = "";
-                    lastThinkingMessageIndex = -1;
-                    return;
-                  }
-                  if (token.includes("</thinking>") || token.includes("</think>")) {
-                    isThinking = false;
-                    if (thinkingContent.trim()) {
-                      // Only add the final thinking message if we don't already have one
-                      if (lastThinkingMessageIndex === -1) {
-                        chatStore.add("thinking", thinkingContent, null, null, chatId);
-                      }
-                    }
-                    return;
-                  }
-
-                  // Handle content based on thinking state
-                  if (isThinking) {
-                    thinkingContent += token;
+                // Handle content based on thinking state
+                if (isThinking) {
+                  thinkingContent += token;
                  // Count tokens for rate calculation (per chat)
                  const request = activeRequests.get(chatId);
                  if (request) {
@@ -2088,42 +1995,7 @@ async function promptGPT(systemPrompt, input) {
                    }, 100);
                  }
                } else {
-                  // Not in thinking state, add to content buffer
                  contentBuffer.push(token);
-                  // Track assistant message index for reasoning placement
-                  if (lastAssistantMessageIndex === -1) {
-                    const currentChat = chatStore.getChat(chatId);
-                    if (currentChat) {
-                      const targetHistory = currentChat.history;
-                      // Find or create assistant message index
-                      for (let i = targetHistory.length - 1; i >= 0; i--) {
-                        if (targetHistory[i].role === "assistant") {
-                          lastAssistantMessageIndex = i;
-                          break;
-                        }
-                      }
-                      // If no assistant message yet, it will be created when we flush contentBuffer
-                    }
-                  }
-                }
-                } else {
-                  // Receiving reasoning from API, just add token to content buffer
-                  contentBuffer.push(token);
-                  // Track assistant message index for reasoning placement
-                  if (lastAssistantMessageIndex === -1) {
-                    const currentChat = chatStore.getChat(chatId);
-                    if (currentChat) {
-                      const targetHistory = currentChat.history;
-                      // Find or create assistant message index
-                      for (let i = targetHistory.length - 1; i >= 0; i--) {
-                        if (targetHistory[i].role === "assistant") {
-                          lastAssistantMessageIndex = i;
-                          break;
-                        }
-                      }
-                      // If no assistant message yet, it will be created when we flush contentBuffer
-                    }
-                  }
                }
              }
            } catch (error) {
@@ -2135,17 +2007,6 @@ async function promptGPT(systemPrompt, input) {
        // Efficiently update the chat in batch
        if (contentBuffer.length > 0) {
          addToChat(contentBuffer.join(""));
-          // Update assistant message index after adding content
-          const currentChat = chatStore.getChat(chatId);
-          if (currentChat) {
-            const targetHistory = currentChat.history;
-            for (let i = targetHistory.length - 1; i >= 0; i--) {
-              if (targetHistory[i].role === "assistant") {
-                lastAssistantMessageIndex = i;
-                break;
-              }
-            }
-          }
          contentBuffer = [];
          // Scroll when assistant content is updated (this will also show thinking messages above)
          setTimeout(() => {
@@ -2164,30 +2025,7 @@ async function promptGPT(systemPrompt, input) {
      if (contentBuffer.length > 0) {
        addToChat(contentBuffer.join(""));
      }
-      
-      // Final reasoning flush if any data remains - always use "thinking" role
      const finalChat = chatStore.getChat(chatId);
-      if (finalChat && reasoningContent.trim() && lastReasoningMessageIndex === -1) {
-        const isMCPMode = finalChat.mcpMode || false;
-        const shouldExpand = !isMCPMode;
-        const targetHistory = finalChat.history;
-        // Find assistant message to insert before
-        const assistantIndex = targetHistory.length - 1;
-        if (assistantIndex >= 0 && targetHistory[assistantIndex]?.role === "assistant") {
-          targetHistory.splice(assistantIndex, 0, {
-            role: "thinking",
-            content: reasoningContent,
-            html: DOMPurify.sanitize(marked.parse(reasoningContent)),
-            image: [],
-            audio: [],
-            expanded: shouldExpand
-          });
-        } else {
-          chatStore.add("thinking", reasoningContent, null, null, chatId);
-        }
-      }
-      
-      // Final thinking content flush (legacy tag-based thinking)
      if (finalChat && thinkingContent.trim() && lastThinkingMessageIndex === -1) {
        chatStore.add("thinking", thinkingContent, null, null, chatId);
      }
--- a/core/http/views/backends.html
+++ b/core/http/views/backends.html
@@ -54,11 +54,6 @@
                        <span class="font-semibold text-cyan-300" x-text="installedBackends"></span>
                        <span class="text-[#94A3B8] ml-1">installed</span>
                    </a>
-                    <div class="flex items-center bg-[#101827] rounded-lg px-4 py-2 border border-[#38BDF8]/30">
-                        <i class="fas fa-microchip text-[#38BDF8] mr-2"></i>
-                        <span class="text-[#94A3B8] mr-1">Capability:</span>
-                        <span class="font-semibold text-[#38BDF8]" x-text="systemCapability"></span>
-                    </div>
                    <a href="https://localai.io/backends/" target="_blank" class="btn-primary">
                        <i class="fas fa-info-circle mr-2"></i>
                        <span>Documentation</span>
@@ -593,7 +588,6 @@ function backendsGallery() {
        totalPages: 1,
        availableBackends: 0,
        installedBackends: 0,
-        systemCapability: '',
        selectedBackend: null,
        jobProgress: {},
        notifications: [],
@@ -689,7 +683,6 @@ function backendsGallery() {
                this.totalPages = data.totalPages || 1;
                this.availableBackends = data.availableBackends || 0;
                this.installedBackends = data.installedBackends || 0;
-                this.systemCapability = data.systemCapability || 'default';
            } catch (error) {
                console.error('Error fetching backends:', error);
            } finally {
--- a/core/http/views/chat.html
+++ b/core/http/views/chat.html
@@ -41,7 +41,7 @@ SOFTWARE.
    __chatContextSize = {{ .ContextSize }};
    {{ end }}

-    // Store gallery configs for header icon display and model info modal
+    // Store gallery configs for header icon display
    window.__galleryConfigs = {};
    {{ $allGalleryConfigs:=.GalleryConfig }}
    {{ range $modelName, $galleryConfig := $allGalleryConfigs }}
@@ -49,16 +49,6 @@ SOFTWARE.
    {{ if $galleryConfig.Icon }}
    window.__galleryConfigs["{{$modelName}}"].Icon = "{{$galleryConfig.Icon}}";
    {{ end }}
-    {{ if $galleryConfig.Description }}
-    window.__galleryConfigs["{{$modelName}}"].Description = {{ printf "%q" $galleryConfig.Description }};
-    {{ end }}
-    {{ if $galleryConfig.URLs }}
-    window.__galleryConfigs["{{$modelName}}"].URLs = [
-      {{ range $idx, $url := $galleryConfig.URLs }}
-      {{ if $idx }},{{ end }}{{ printf "%q" $url }}
-      {{ end }}
-    ];
-    {{ end }}
    {{ end }}

    // Function to initialize store
@@ -336,10 +326,10 @@ SOFTWARE.
                c += DOMPurify.sanitize(marked.parse(line));
              });
            }
-            // Set expanded state: thinking and reasoning are expanded by default in non-MCP mode, collapsed in MCP mode
-            // tool_call and tool_result are always collapsed by default
+            // Set expanded state: thinking is expanded by default in non-MCP mode, collapsed in MCP mode
+            // Reasoning, tool_call, and tool_result are always collapsed by default
            const isMCPMode = chat.mcpMode || false;
-            const shouldExpand = ((role === "thinking" || role === "reasoning") && !isMCPMode) || false;
+            const shouldExpand = (role === "thinking" && !isMCPMode) || false;
            chat.history.push({ role, content, html: c, image, audio, expanded: shouldExpand, model: messageModel });
            
            // Auto-name chat from first user message
@@ -507,11 +497,6 @@ SOFTWARE.
      activeChat.model = modelName;
      activeChat.updatedAt = Date.now();
      
-      // Update model info modal with new model
-      if (window.updateModelInfoModal) {
-        window.updateModelInfoModal(modelName);
-      }
-      
      // Get context size from data attribute
      let contextSize = null;
      if (selectedOption.dataset.contextSize) {
@@ -551,23 +536,18 @@ SOFTWARE.
      }
      
      // Update model selector to reflect the change (ensure it stays in sync)
-      // Note: We don't dispatch a change event here to avoid infinite loop
-      // The selector is already updated via user interaction or programmatic change
      const modelSelector = document.getElementById('modelSelector');
      if (modelSelector) {
        // Find and select the option matching the model
        const optionValue = 'chat/' + modelName;
        for (let i = 0; i < modelSelector.options.length; i++) {
          if (modelSelector.options[i].value === optionValue) {
-            // Only update if it's different to avoid unnecessary updates
-            if (modelSelector.selectedIndex !== i) {
-              modelSelector.selectedIndex = i;
-            }
+            modelSelector.selectedIndex = i;
            break;
          }
        }
-        // Don't dispatch change event here - it would cause infinite recursion
-        // The selector is already in sync with the model
+        // Trigger Alpine reactivity by dispatching change event
+        modelSelector.dispatchEvent(new Event('change', { bubbles: true }));
      }
      
      // Trigger MCP availability check in Alpine component
@@ -623,52 +603,27 @@ SOFTWARE.
            <div class="flex items-center justify-between gap-2">
              <label class="text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide flex-shrink-0">Model</label>
              <div class="flex items-center gap-1 flex-shrink-0">
-                <!-- Info button - reactive to active chat model -->
-                <template x-if="$store.chat.activeChat() && $store.chat.activeChat().model && window.__galleryConfigs && window.__galleryConfigs[$store.chat.activeChat().model]">
-                  <button 
-                    data-twe-ripple-init 
-                    data-twe-ripple-color="light" 
-                    class="text-[var(--color-text-secondary)] hover:text-[var(--color-primary)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]" 
-                    data-modal-target="model-info-modal" 
-                    data-modal-toggle="model-info-modal"
-                    :data-model-name="$store.chat.activeChat().model"
-                    @click="if (window.updateModelInfoModal) { window.updateModelInfoModal($store.chat.activeChat().model, true); }"
-                    title="Model Information">
-                    <i class="fas fa-info-circle"></i>
-                  </button>
-                </template>
-                <!-- Fallback info button for initial model from server -->
-                <template x-if="(!$store.chat.activeChat() || !$store.chat.activeChat().model) && window.__galleryConfigs && window.__galleryConfigs['{{$model}}']">
-                  <button 
-                    data-twe-ripple-init 
-                    data-twe-ripple-color="light" 
-                    class="text-[var(--color-text-secondary)] hover:text-[var(--color-primary)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]" 
-                    data-modal-target="model-info-modal" 
-                    data-modal-toggle="model-info-modal"
-                    data-model-name="{{$model}}"
-                    @click="if (window.updateModelInfoModal) { window.updateModelInfoModal('{{$model}}', true); }"
-                    title="Model Information">
-                    <i class="fas fa-info-circle"></i>
-                  </button>
-                </template>
-                <!-- Edit button - reactive to active chat model -->
-                <template x-if="$store.chat.activeChat() && $store.chat.activeChat().model">
-                  <a :href="'/models/edit/' + $store.chat.activeChat().model" 
-                     class="text-[var(--color-text-secondary)] hover:text-[var(--color-warning)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]" 
-                     title="Edit Model Configuration">
-                    <i class="fas fa-edit"></i>
-                  </a>
-                </template>
-                <!-- Fallback edit button for initial model from server -->
-                <template x-if="!$store.chat.activeChat() || !$store.chat.activeChat().model">
-                  {{ if $model }}
-                  <a href="/models/edit/{{$model}}" 
-                     class="text-[var(--color-text-secondary)] hover:text-[var(--color-warning)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]" 
-                     title="Edit Model Configuration">
-                    <i class="fas fa-edit"></i>
-                  </a>
-                  {{ end }}
-                </template>
+                {{ if $model }}
+                {{ $galleryConfig:= index $allGalleryConfigs $model}}
+                {{ if $galleryConfig }}
+                <button 
+                  data-twe-ripple-init 
+                  data-twe-ripple-color="light" 
+                  class="text-[var(--color-text-secondary)] hover:text-[var(--color-primary)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]" 
+                  data-modal-target="model-info-modal" 
+                  data-modal-toggle="model-info-modal"
+                  title="Model Information">
+                  <i class="fas fa-info-circle"></i>
+                </button>
+                {{ end }}
+                {{ end }}
+                {{ if $model }}
+                <a href="/models/edit/{{$model}}" 
+                   class="text-[var(--color-text-secondary)] hover:text-[var(--color-warning)] transition-colors text-xs p-1 rounded hover:bg-[var(--color-bg-primary)]" 
+                   title="Edit Model Configuration">
+                  <i class="fas fa-edit"></i>
+                </a>
+                {{ end }}
              </div>
            </div>
            <select
@@ -1533,14 +1488,17 @@ SOFTWARE.
      </div>
    </div>

-    <!-- Modal moved outside of sidebar to appear in center of page - Always available, content updated dynamically -->
-    <div id="model-info-modal" tabindex="-1" aria-hidden="true" class="hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 flex justify-center items-center w-full h-full md:inset-0 max-h-full" style="padding: 1rem;">
+    <!-- Modal moved outside of sidebar to appear in center of page -->
+    {{ if $model }}
+    {{ $galleryConfig:= index $allGalleryConfigs $model}}
+    {{ if $galleryConfig }}
+    <div id="model-info-modal" tabindex="-1" aria-hidden="true" class="hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 flex justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full">
      <div class="relative p-4 w-full max-w-2xl max-h-full">
        <div class="relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700">
          <!-- Header -->
          <div class="flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600">
-            <h3 id="model-info-modal-title" class="text-xl font-semibold text-gray-900 dark:text-white">{{ if $model }}{{ $model }}{{ end }}</h3>
-            <button class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white" data-modal-hide="model-info-modal" @click="if (window.closeModelInfoModal) { window.closeModelInfoModal(); }">
+            <h3 class="text-xl font-semibold text-gray-900 dark:text-white">{{ $model }}</h3>
+            <button class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white" data-modal-hide="model-info-modal">
              <svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 14 14">
                <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6"/>
              </svg>
@@ -1551,24 +1509,29 @@ SOFTWARE.
          <!-- Body -->
          <div class="p-4 md:p-5 space-y-4">
            <div class="flex justify-center items-center">
-              <img id="model-info-modal-icon" class="lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded" style="display: none;" loading="lazy"/>
+              {{ if $galleryConfig.Icon }}<img class="lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded" src="{{$galleryConfig.Icon}}" loading="lazy"/>{{end}}
            </div>
-            <div id="model-info-description" class="text-base leading-relaxed text-gray-500 dark:text-gray-400 break-words max-w-full"></div>
+            <div id="model-info-description" class="text-base leading-relaxed text-gray-500 dark:text-gray-400 break-words max-w-full">{{ $galleryConfig.Description }}</div>
            <hr>
            <p class="text-sm font-semibold text-gray-900 dark:text-white">Links</p>
-            <ul id="model-info-links">
+            <ul>
+              {{/* Gallery links open in a new tab; rel keeps the opened page from reaching window.opener. */}}{{range $galleryConfig.URLs}}
+              <li><a href="{{ . }}" target="_blank" rel="noopener noreferrer">{{ . }}</a></li>
+              {{end}}
             </ul>
          </div>

          <!-- Footer -->
          <div class="flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600">
-            <button data-modal-hide="model-info-modal" class="py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700" @click="if (window.closeModelInfoModal) { window.closeModelInfoModal(); }">
+            <button data-modal-hide="model-info-modal" class="py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700">
              Close
            </button>
          </div>
        </div>
      </div>
    </div>
+    {{ end }}
+    {{ end }}

    <!-- Alpine store initialization and utilities -->
    <script>
@@ -1779,20 +1742,10 @@ SOFTWARE.
        });

        // Also listen for click events on modal toggle buttons
-        // Use event delegation to handle dynamically created buttons
-        document.addEventListener('click', (e) => {
-          const button = e.target.closest('[data-modal-toggle="model-info-modal"]');
-          if (button) {
-            // Update modal with current model before showing
-            if (window.Alpine && window.Alpine.store("chat")) {
-              const activeChat = window.Alpine.store("chat").activeChat();
-              const modelName = activeChat ? activeChat.model : (button.dataset.modelName || (document.getElementById("chat-model") ? document.getElementById("chat-model").value : null));
-              if (modelName && window.updateModelInfoModal) {
-                window.updateModelInfoModal(modelName, true);
-              }
-            }
+        document.querySelectorAll('[data-modal-toggle="model-info-modal"]').forEach(button => {
+          button.addEventListener('click', () => {
            setTimeout(processMarkdown, 300);
-          }
+          });
        });

        // Process on initial load if libraries are ready
@@ -1833,176 +1786,12 @@ SOFTWARE.
        syncModelSelectorOnLoad();
      }

-      // Function to update model info modal with current model
-      // Set openModal to true to actually open the modal, false to just update content
-      window.updateModelInfoModal = function(modelName, openModal = false) {
-        if (!modelName) {
-          return;
-        }
-        if (!window.__galleryConfigs) {
-          return;
-        }
-        
-        const galleryConfig = window.__galleryConfigs[modelName];
-        // Check if galleryConfig exists and has at least one property
-        if (!galleryConfig || Object.keys(galleryConfig).length === 0) {
-          // Still update the modal title even if no config, so user can see which model they clicked
-          const titleEl = document.getElementById('model-info-modal-title');
-          if (titleEl) {
-            titleEl.textContent = modelName;
-          }
-          // Show message that no info is available
-          const descEl = document.getElementById('model-info-description');
-          if (descEl) {
-            descEl.textContent = 'No additional information available for this model.';
-          }
-          const linksEl = document.getElementById('model-info-links');
-          if (linksEl) {
-            linksEl.innerHTML = '';
-          }
-          const iconEl = document.getElementById('model-info-modal-icon');
-          if (iconEl) {
-            iconEl.style.display = 'none';
-          }
-          // Only open the modal if explicitly requested
-          if (openModal) {
-            const modalElement = document.getElementById('model-info-modal');
-            if (modalElement) {
-              modalElement.classList.remove('hidden');
-              modalElement.setAttribute('aria-hidden', 'false');
-              // Add backdrop
-              let backdrop = document.querySelector('.modal-backdrop');
-              if (!backdrop) {
-                backdrop = document.createElement('div');
-                backdrop.className = 'modal-backdrop fixed inset-0 bg-gray-900 bg-opacity-50 dark:bg-opacity-80 z-40';
-                document.body.appendChild(backdrop);
-                backdrop.addEventListener('click', () => {
-                  closeModelInfoModal();
-                });
-              }
-            }
-          }
-          return;
-        }
-        
-        // Update modal title
-        const titleEl = document.getElementById('model-info-modal-title');
-        if (titleEl) {
-          titleEl.textContent = modelName;
-        }
-        
-        // Update icon
-        const iconEl = document.getElementById('model-info-modal-icon');
-        if (iconEl) {
-          if (galleryConfig.Icon) {
-            iconEl.src = galleryConfig.Icon;
-            iconEl.style.display = 'block';
-          } else {
-            iconEl.style.display = 'none';
-          }
-        }
-        
-        // Update description
-        const descEl = document.getElementById('model-info-description');
-        if (descEl) {
-          descEl.textContent = galleryConfig.Description || 'No description available.';
-        }
-        
-        // Update links
-        const linksEl = document.getElementById('model-info-links');
-        if (linksEl && galleryConfig.URLs && Array.isArray(galleryConfig.URLs) && galleryConfig.URLs.length > 0) {
-          linksEl.innerHTML = '';
-          galleryConfig.URLs.forEach(url => {
-            const li = document.createElement('li');
-            const a = document.createElement('a');
-            a.href = url;
-            a.target = '_blank';
-            a.textContent = url;
-            li.appendChild(a);
-            linksEl.appendChild(li);
-          });
-        } else if (linksEl) {
-          linksEl.innerHTML = '<li>No links available</li>';
-        }
-        
-        // Only open the modal if explicitly requested
-        if (openModal) {
-          const modalElement = document.getElementById('model-info-modal');
-          if (modalElement) {
-            // Ensure positioning classes are present (they might have been removed)
-            if (!modalElement.classList.contains('flex')) {
-              modalElement.classList.add('flex');
-            }
-            if (!modalElement.classList.contains('justify-center')) {
-              modalElement.classList.add('justify-center');
-            }
-            if (!modalElement.classList.contains('items-center')) {
-              modalElement.classList.add('items-center');
-            }
-            // Ensure fixed positioning
-            if (!modalElement.classList.contains('fixed')) {
-              modalElement.classList.add('fixed');
-            }
-            // Ensure full width and height
-            if (!modalElement.classList.contains('w-full')) {
-              modalElement.classList.add('w-full');
-            }
-            if (!modalElement.classList.contains('h-full')) {
-              modalElement.classList.add('h-full');
-            }
-            // Ensure padding is set
-            if (!modalElement.style.padding) {
-              modalElement.style.padding = '1rem';
-            }
-            // Remove hidden class if present
-            modalElement.classList.remove('hidden');
-            // Set aria-hidden to false
-            modalElement.setAttribute('aria-hidden', 'false');
-            // Add backdrop if needed
-            let backdrop = document.querySelector('.modal-backdrop');
-            if (!backdrop) {
-              backdrop = document.createElement('div');
-              backdrop.className = 'modal-backdrop fixed inset-0 bg-gray-900 bg-opacity-50 dark:bg-opacity-80 z-40';
-              document.body.appendChild(backdrop);
-              backdrop.addEventListener('click', () => {
-                window.closeModelInfoModal();
-              });
-            }
-          }
-        }
-      };
-      
-      // Function to close the model info modal
-      window.closeModelInfoModal = function() {
-        const modalElement = document.getElementById('model-info-modal');
-        if (modalElement) {
-          modalElement.classList.add('hidden');
-          modalElement.setAttribute('aria-hidden', 'true');
-        }
-        const backdrop = document.querySelector('.modal-backdrop');
-        if (backdrop) {
-          backdrop.remove();
-        }
-      };
-
      // Also sync after Alpine initializes (in case it runs after DOMContentLoaded)
-      function initializeModelInfo() {
-        syncModelSelectorOnLoad();
-        // Initialize model info modal content with current model (but don't open it)
-        if (window.updateModelInfoModal && window.Alpine && window.Alpine.store("chat")) {
-          const activeChat = window.Alpine.store("chat").activeChat();
-          const modelName = activeChat ? activeChat.model : (document.getElementById("chat-model") ? document.getElementById("chat-model").value : null);
-          if (modelName) {
-            window.updateModelInfoModal(modelName, false); // false = don't open, just update content
-          }
-        }
-      }
-      
      if (window.Alpine) {
-        Alpine.nextTick(initializeModelInfo);
+        Alpine.nextTick(syncModelSelectorOnLoad);
      } else {
        document.addEventListener('alpine:init', () => {
-          Alpine.nextTick(initializeModelInfo);
+          Alpine.nextTick(syncModelSelectorOnLoad);
        });
      }
    </script>
--- a/core/schema/anthropic.go
+++ b/core/schema/anthropic.go
@@ -1,176 +0,0 @@
-package schema
-
-import (
-	"context"
-	"encoding/json"
-)
-
-// AnthropicRequest represents a request to the Anthropic Messages API
-// https://docs.anthropic.com/claude/reference/messages_post
-type AnthropicRequest struct {
-	Model         string             `json:"model"`
-	Messages      []AnthropicMessage `json:"messages"`
-	MaxTokens     int                `json:"max_tokens"`
-	Metadata      map[string]string  `json:"metadata,omitempty"`
-	StopSequences []string           `json:"stop_sequences,omitempty"`
-	Stream        bool               `json:"stream,omitempty"`
-	System        string             `json:"system,omitempty"`
-	Temperature   *float64           `json:"temperature,omitempty"`
-	TopK          *int               `json:"top_k,omitempty"`
-	TopP          *float64           `json:"top_p,omitempty"`
-	Tools         []AnthropicTool    `json:"tools,omitempty"`
-	ToolChoice    interface{}        `json:"tool_choice,omitempty"`
-
-	// Internal fields for request handling
-	Context context.Context    `json:"-"`
-	Cancel  context.CancelFunc `json:"-"`
-}
-
-// ModelName implements the LocalAIRequest interface
-func (ar *AnthropicRequest) ModelName(s *string) string {
-	if s != nil {
-		ar.Model = *s
-	}
-	return ar.Model
-}
-
-// AnthropicTool represents a tool definition in the Anthropic format
-type AnthropicTool struct {
-	Name        string                 `json:"name"`
-	Description string                 `json:"description,omitempty"`
-	InputSchema map[string]interface{} `json:"input_schema"`
-}
-
-// AnthropicMessage represents a message in the Anthropic format
-type AnthropicMessage struct {
-	Role    string      `json:"role"`
-	Content interface{} `json:"content"`
-}
-
-// AnthropicContentBlock represents a content block in an Anthropic message
-type AnthropicContentBlock struct {
-	Type       string                 `json:"type"`
-	Text       string                 `json:"text,omitempty"`
-	Source     *AnthropicImageSource  `json:"source,omitempty"`
-	ID         string                 `json:"id,omitempty"`
-	Name       string                 `json:"name,omitempty"`
-	Input      map[string]interface{} `json:"input,omitempty"`
-	ToolUseID  string                 `json:"tool_use_id,omitempty"`
-	Content    interface{}            `json:"content,omitempty"`
-	IsError    *bool                  `json:"is_error,omitempty"`
-}
-
-// AnthropicImageSource represents an image source in Anthropic format
-type AnthropicImageSource struct {
-	Type      string `json:"type"`
-	MediaType string `json:"media_type"`
-	Data      string `json:"data"`
-}
-
-// AnthropicResponse represents a response from the Anthropic Messages API
-type AnthropicResponse struct {
-	ID           string                  `json:"id"`
-	Type         string                  `json:"type"`
-	Role         string                  `json:"role"`
-	Content      []AnthropicContentBlock `json:"content"`
-	Model        string                  `json:"model"`
-	StopReason   *string                 `json:"stop_reason"`
-	StopSequence *string                 `json:"stop_sequence,omitempty"`
-	Usage        AnthropicUsage          `json:"usage"`
-}
-
-// AnthropicUsage represents token usage in Anthropic format
-type AnthropicUsage struct {
-	InputTokens  int `json:"input_tokens"`
-	OutputTokens int `json:"output_tokens"`
-}
-
-// AnthropicStreamEvent represents a streaming event from the Anthropic API
-type AnthropicStreamEvent struct {
-	Type         string                  `json:"type"`
-	Index        int                     `json:"index,omitempty"`
-	ContentBlock *AnthropicContentBlock  `json:"content_block,omitempty"`
-	Delta        *AnthropicStreamDelta   `json:"delta,omitempty"`
-	Message      *AnthropicStreamMessage `json:"message,omitempty"`
-	Usage        *AnthropicUsage         `json:"usage,omitempty"`
-}
-
-// AnthropicStreamDelta represents the delta in a streaming response
-type AnthropicStreamDelta struct {
-	Type         string  `json:"type,omitempty"`
-	Text         string  `json:"text,omitempty"`
-	PartialJSON  string  `json:"partial_json,omitempty"`
-	StopReason   *string `json:"stop_reason,omitempty"`
-	StopSequence *string `json:"stop_sequence,omitempty"`
-}
-
-// AnthropicStreamMessage represents the message object in streaming events
-type AnthropicStreamMessage struct {
-	ID           string                  `json:"id"`
-	Type         string                  `json:"type"`
-	Role         string                  `json:"role"`
-	Content      []AnthropicContentBlock `json:"content"`
-	Model        string                  `json:"model"`
-	StopReason   *string                 `json:"stop_reason"`
-	StopSequence *string                 `json:"stop_sequence,omitempty"`
-	Usage        AnthropicUsage          `json:"usage"`
-}
-
-// AnthropicErrorResponse represents an error response from the Anthropic API
-type AnthropicErrorResponse struct {
-	Type  string         `json:"type"`
-	Error AnthropicError `json:"error"`
-}
-
-// AnthropicError represents an error in the Anthropic format
-type AnthropicError struct {
-	Type    string `json:"type"`
-	Message string `json:"message"`
-}
-
-// GetStringContent extracts the string content from an AnthropicMessage
-// Content can be either a string or an array of content blocks
-func (m *AnthropicMessage) GetStringContent() string {
-	switch content := m.Content.(type) {
-	case string:
-		return content
-	case []interface{}:
-		var result string
-		for _, block := range content {
-			if blockMap, ok := block.(map[string]interface{}); ok {
-				if blockMap["type"] == "text" {
-					if text, ok := blockMap["text"].(string); ok {
-						result += text
-					}
-				}
-			}
-		}
-		return result
-	}
-	return ""
-}
-
-// GetContentBlocks extracts content blocks from an AnthropicMessage
-func (m *AnthropicMessage) GetContentBlocks() []AnthropicContentBlock {
-	switch content := m.Content.(type) {
-	case string:
-		return []AnthropicContentBlock{{Type: "text", Text: content}}
-	case []interface{}:
-		var blocks []AnthropicContentBlock
-		for _, block := range content {
-			if blockMap, ok := block.(map[string]interface{}); ok {
-				cb := AnthropicContentBlock{}
-				data, err := json.Marshal(blockMap)
-				if err != nil {
-					continue
-				}
-				if err := json.Unmarshal(data, &cb); err != nil {
-					continue
-				}
-				blocks = append(blocks, cb)
-			}
-		}
-		return blocks
-	}
-	return nil
-}
--- a/core/schema/anthropic_test.go
+++ b/core/schema/anthropic_test.go
@@ -1,216 +0,0 @@
-package schema_test
-
-import (
-	"encoding/json"
-
-	"github.com/mudler/LocalAI/core/schema"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("Anthropic Schema", func() {
-	Describe("AnthropicRequest", func() {
-		It("should unmarshal a valid request", func() {
-			jsonData := `{
-				"model": "claude-3-sonnet-20240229",
-				"max_tokens": 1024,
-				"messages": [
-					{"role": "user", "content": "Hello, world!"}
-				],
-				"system": "You are a helpful assistant.",
-				"temperature": 0.7
-			}`
-
-			var req schema.AnthropicRequest
-			err := json.Unmarshal([]byte(jsonData), &req)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(req.Model).To(Equal("claude-3-sonnet-20240229"))
-			Expect(req.MaxTokens).To(Equal(1024))
-			Expect(len(req.Messages)).To(Equal(1))
-			Expect(req.System).To(Equal("You are a helpful assistant."))
-			Expect(*req.Temperature).To(Equal(0.7))
-		})
-
-		It("should unmarshal a request with tools", func() {
-			jsonData := `{
-				"model": "claude-3-sonnet-20240229",
-				"max_tokens": 1024,
-				"messages": [
-					{"role": "user", "content": "What's the weather?"}
-				],
-				"tools": [
-					{
-						"name": "get_weather",
-						"description": "Get the current weather",
-						"input_schema": {
-							"type": "object",
-							"properties": {
-								"location": {"type": "string"}
-							}
-						}
-					}
-				],
-				"tool_choice": {"type": "tool", "name": "get_weather"}
-			}`
-
-			var req schema.AnthropicRequest
-			err := json.Unmarshal([]byte(jsonData), &req)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(req.Tools)).To(Equal(1))
-			Expect(req.Tools[0].Name).To(Equal("get_weather"))
-			Expect(req.Tools[0].Description).To(Equal("Get the current weather"))
-			Expect(req.ToolChoice).ToNot(BeNil())
-		})
-
-		It("should implement LocalAIRequest interface", func() {
-			req := &schema.AnthropicRequest{Model: "test-model"}
-			Expect(req.ModelName(nil)).To(Equal("test-model"))
-
-			newModel := "new-model"
-			Expect(req.ModelName(&newModel)).To(Equal("new-model"))
-			Expect(req.Model).To(Equal("new-model"))
-		})
-	})
-
-	Describe("AnthropicMessage", func() {
-		It("should get string content from string content", func() {
-			msg := schema.AnthropicMessage{
-				Role:    "user",
-				Content: "Hello, world!",
-			}
-			Expect(msg.GetStringContent()).To(Equal("Hello, world!"))
-		})
-
-		It("should get string content from array content", func() {
-			msg := schema.AnthropicMessage{
-				Role: "user",
-				Content: []interface{}{
-					map[string]interface{}{"type": "text", "text": "Hello, "},
-					map[string]interface{}{"type": "text", "text": "world!"},
-				},
-			}
-			Expect(msg.GetStringContent()).To(Equal("Hello, world!"))
-		})
-
-		It("should get content blocks from string content", func() {
-			msg := schema.AnthropicMessage{
-				Role:    "user",
-				Content: "Hello, world!",
-			}
-			blocks := msg.GetContentBlocks()
-			Expect(len(blocks)).To(Equal(1))
-			Expect(blocks[0].Type).To(Equal("text"))
-			Expect(blocks[0].Text).To(Equal("Hello, world!"))
-		})
-
-		It("should get content blocks from array content", func() {
-			msg := schema.AnthropicMessage{
-				Role: "user",
-				Content: []interface{}{
-					map[string]interface{}{"type": "text", "text": "Hello"},
-					map[string]interface{}{"type": "image", "source": map[string]interface{}{"type": "base64", "data": "abc123"}},
-				},
-			}
-			blocks := msg.GetContentBlocks()
-			Expect(len(blocks)).To(Equal(2))
-			Expect(blocks[0].Type).To(Equal("text"))
-			Expect(blocks[0].Text).To(Equal("Hello"))
-		})
-	})
-
-	Describe("AnthropicResponse", func() {
-		It("should marshal a valid response", func() {
-			stopReason := "end_turn"
-			resp := schema.AnthropicResponse{
-				ID:         "msg_123",
-				Type:       "message",
-				Role:       "assistant",
-				Model:      "claude-3-sonnet-20240229",
-				StopReason: &stopReason,
-				Content: []schema.AnthropicContentBlock{
-					{Type: "text", Text: "Hello!"},
-				},
-				Usage: schema.AnthropicUsage{
-					InputTokens:  10,
-					OutputTokens: 5,
-				},
-			}
-
-			data, err := json.Marshal(resp)
-			Expect(err).ToNot(HaveOccurred())
-
-			var result map[string]interface{}
-			err = json.Unmarshal(data, &result)
-			Expect(err).ToNot(HaveOccurred())
-
-			Expect(result["id"]).To(Equal("msg_123"))
-			Expect(result["type"]).To(Equal("message"))
-			Expect(result["role"]).To(Equal("assistant"))
-			Expect(result["stop_reason"]).To(Equal("end_turn"))
-		})
-
-		It("should marshal a response with tool use", func() {
-			stopReason := "tool_use"
-			resp := schema.AnthropicResponse{
-				ID:         "msg_123",
-				Type:       "message",
-				Role:       "assistant",
-				Model:      "claude-3-sonnet-20240229",
-				StopReason: &stopReason,
-				Content: []schema.AnthropicContentBlock{
-					{
-						Type: "tool_use",
-						ID:   "toolu_123",
-						Name: "get_weather",
-						Input: map[string]interface{}{
-							"location": "San Francisco",
-						},
-					},
-				},
-				Usage: schema.AnthropicUsage{
-					InputTokens:  10,
-					OutputTokens: 5,
-				},
-			}
-
-			data, err := json.Marshal(resp)
-			Expect(err).ToNot(HaveOccurred())
-
-			var result map[string]interface{}
-			err = json.Unmarshal(data, &result)
-			Expect(err).ToNot(HaveOccurred())
-
-			Expect(result["stop_reason"]).To(Equal("tool_use"))
-			content := result["content"].([]interface{})
-			Expect(len(content)).To(Equal(1))
-			toolUse := content[0].(map[string]interface{})
-			Expect(toolUse["type"]).To(Equal("tool_use"))
-			Expect(toolUse["id"]).To(Equal("toolu_123"))
-			Expect(toolUse["name"]).To(Equal("get_weather"))
-		})
-	})
-
-	Describe("AnthropicErrorResponse", func() {
-		It("should marshal an error response", func() {
-			resp := schema.AnthropicErrorResponse{
-				Type: "error",
-				Error: schema.AnthropicError{
-					Type:    "invalid_request_error",
-					Message: "max_tokens is required",
-				},
-			}
-
-			data, err := json.Marshal(resp)
-			Expect(err).ToNot(HaveOccurred())
-
-			var result map[string]interface{}
-			err = json.Unmarshal(data, &result)
-			Expect(err).ToNot(HaveOccurred())
-
-			Expect(result["type"]).To(Equal("error"))
-			errorObj := result["error"].(map[string]interface{})
-			Expect(errorObj["type"]).To(Equal("invalid_request_error"))
-			Expect(errorObj["message"]).To(Equal("max_tokens is required"))
-		})
-	})
-})
--- a/core/schema/message.go
+++ b/core/schema/message.go
@@ -27,9 +27,6 @@ type Message struct {
 	FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`

 	ToolCalls []ToolCall `json:"tool_calls,omitempty" yaml:"tool_call,omitempty"`
-
-	// Reasoning content extracted from <thinking>...</thinking> tags
-	Reasoning *string `json:"reasoning,omitempty" yaml:"reasoning,omitempty"`
 }

 type ToolCall struct {
@@ -81,8 +78,8 @@ func (messages Messages) ToProto() []*proto.Message {
 			}
 		}

-		// Note: tool_call_id is not in schema.Message yet
-		// Reasoning field is now available in schema.Message but not yet in proto.Message
+		// Note: tool_call_id and reasoning_content are not in schema.Message yet
+		// They may need to be added to schema.Message if needed in the future
 	}
 	return protoMessages
 }
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,56 +1,4 @@
 ---
- name: "qwen3-vl-reranker-8b"
-  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
-  urls:
-    - https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF
-  description: |
-    **Model Name:** Qwen3-VL-Reranker-8B
-    **Base Model:** Qwen/Qwen3-VL-Reranker-8B
-
-    **Description:**
-    A high-performance multimodal reranking model for state-of-the-art cross-modal search. It supports 30+ languages and handles text, images, screenshots, videos, and mixed modalities. With 8B parameters and a 32K context length, it refines retrieval results by combining embedding vectors with precise relevance scores. Optimized for efficiency, it supports quantized versions (e.g., Q8_0, Q4_K_M) and is ideal for applications requiring accurate multimodal content matching.
-
-    **Key Features:**
-      - **Multimodal**: Text, images, videos, and mixed content.
-      - **Language Support**: 30+ languages.
-      - **Quantization**: Available in Q8_0 (best quality), Q4_K_M (fast, recommended), and lower-precision options.
-      - **Performance**: Outperforms base models in retrieval tasks (e.g., JinaVDR, ViDoRe v3).
-      - **Use Case**: Enhances search pipelines by refining embeddings with precise relevance scores.
-
-    **Downloads:**
-      - [GGUF Files](https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF) (e.g., `Qwen3-VL-Reranker-8B.Q8_0.gguf`).
-
-    **Usage:**
-      - Requires `transformers`, `qwen-vl-utils`, and `torch`.
-      - Example: `from scripts.qwen3_vl_reranker import Qwen3VLReranker; model = Qwen3VLReranker(...)`
-
-    **Citation:**
-    @article{qwen3vlembedding, ...}
-
-    This description emphasizes its capabilities, efficiency, and versatility for multimodal search tasks.
-  overrides:
-    parameters:
-      model: llama-cpp/models/Qwen3-VL-Reranker-8B.Q4_K_M.gguf
-    name: Qwen3-VL-Reranker-8B-GGUF
-    backend: llama-cpp
-    template:
-      use_tokenizer_template: true
-    known_usecases:
-      - chat
-    function:
-      grammar:
-        disable: true
-    mmproj: llama-cpp/mmproj/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
-    description: Imported from https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF
-    options:
-      - use_jinja:true
-  files:
-    - filename: llama-cpp/models/Qwen3-VL-Reranker-8B.Q4_K_M.gguf
-      sha256: f73e62ea68abf741c3e713af823cfb4d2fd2ca35c8b68277b87b4b3d8570b66d
-      uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF/resolve/main/Qwen3-VL-Reranker-8B.Q4_K_M.gguf
-    - filename: llama-cpp/mmproj/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
-      sha256: 15cd9bd4882dae771344f0ac204fce07de91b47c1438ada3861dfc817403c31e
-      uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF/resolve/main/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
 - name: "liquidai.lfm2-2.6b-transcript"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  urls:
@@ -6163,7 +6111,6 @@
  tags:
    - embeddings
  overrides:
-    backend: llama-cpp
    embeddings: true
    parameters:
      model: granite-embedding-107m-multilingual-f16.gguf
--- a/go.mod
+++ b/go.mod
@@ -9,7 +9,6 @@ require (
 	fyne.io/fyne/v2 v2.7.1
 	github.com/Masterminds/sprig/v3 v3.3.0
 	github.com/alecthomas/kong v1.13.0
-	github.com/anthropics/anthropic-sdk-go v1.19.0
 	github.com/charmbracelet/glamour v0.10.0
 	github.com/containerd/containerd v1.7.30
 	github.com/ebitengine/purego v0.9.1
@@ -59,7 +58,6 @@ require (
 	go.opentelemetry.io/otel/metric v1.39.0
 	go.opentelemetry.io/otel/sdk/metric v1.39.0
 	google.golang.org/grpc v1.78.0
-	google.golang.org/protobuf v1.36.10
 	gopkg.in/yaml.v2 v2.4.0
 	gopkg.in/yaml.v3 v3.0.1
 	oras.land/oras-go/v2 v2.6.0
@@ -69,11 +67,8 @@ require (
 	github.com/ghodss/yaml v1.0.0 // indirect
 	github.com/labstack/gommon v0.4.2 // indirect
 	github.com/swaggo/files/v2 v2.0.2 // indirect
-	github.com/tidwall/gjson v1.18.0 // indirect
-	github.com/tidwall/match v1.1.1 // indirect
-	github.com/tidwall/pretty v1.2.1 // indirect
-	github.com/tidwall/sjson v1.2.5 // indirect
 	github.com/valyala/fasttemplate v1.2.2 // indirect
+	google.golang.org/protobuf v1.36.10 // indirect
 )

 require (
--- a/go.sum
+++ b/go.sum
@@ -44,8 +44,6 @@ github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu
 github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
 github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
 github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c=
-github.com/anthropics/anthropic-sdk-go v1.19.0 h1:mO6E+ffSzLRvR/YUH9KJC0uGw0uV8GjISIuzem//3KE=
-github.com/anthropics/anthropic-sdk-go v1.19.0/go.mod h1:WTz31rIUHUHqai2UslPpw5CwXrQP3geYBioRV4WOLvE=
 github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
 github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
 github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8=
@@ -764,12 +762,10 @@ github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4
 github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA=
 github.com/testcontainers/testcontainers-go v0.40.0 h1:pSdJYLOVgLE8YdUY2FHQ1Fxu+aMnb6JfVz1mxk7OeMU=
 github.com/testcontainers/testcontainers-go v0.40.0/go.mod h1:FSXV5KQtX2HAMlm7U3APNyLkkap35zNLxukw9oBi/MY=
-github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
 github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
 github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
 github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
 github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
-github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
 github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
 github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
 github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
--- a/pkg/functions/reasoning.go
+++ b/pkg/functions/reasoning.go
@@ -1,114 +0,0 @@
-package functions
-
-import (
-	"strings"
-)
-
-// ExtractReasoning extracts reasoning content from thinking tags and returns
-// both the extracted reasoning and the cleaned content (with tags removed).
-// It handles <thinking>...</thinking> and <think>...</think> tags.
-// Multiple reasoning blocks are concatenated with newlines.
-func ExtractReasoning(content string) (reasoning string, cleanedContent string) {
-	if content == "" {
-		return "", content
-	}
-
-	var reasoningParts []string
-	var cleanedParts []string
-	remaining := content
-
-	// Define tag pairs to look for
-	tagPairs := []struct {
-		start string
-		end   string
-	}{
-		{"<thinking>", "</thinking>"},
-		{"<think>", "</think>"},
-	}
-
-	// Track the last position we've processed
-	lastPos := 0
-
-	for {
-		// Find the earliest tag start
-		earliestStart := -1
-		earliestEnd := -1
-		isUnclosed := false
-		var matchedTag struct {
-			start string
-			end   string
-		}
-
-		for _, tagPair := range tagPairs {
-			startIdx := strings.Index(remaining[lastPos:], tagPair.start)
-			if startIdx == -1 {
-				continue
-			}
-			startIdx += lastPos
-
-			// Find the corresponding end tag
-			endIdx := strings.Index(remaining[startIdx+len(tagPair.start):], tagPair.end)
-			if endIdx == -1 {
-				// Unclosed tag - extract what we have
-				if earliestStart == -1 || startIdx < earliestStart {
-					earliestStart = startIdx
-					earliestEnd = len(remaining)
-					isUnclosed = true
-					matchedTag = tagPair
-				}
-				continue
-			}
-			endIdx += startIdx + len(tagPair.start)
-
-			// Found a complete tag pair
-			if earliestStart == -1 || startIdx < earliestStart {
-				earliestStart = startIdx
-				earliestEnd = endIdx + len(tagPair.end)
-				isUnclosed = false
-				matchedTag = tagPair
-			}
-		}
-
-		if earliestStart == -1 {
-			// No more tags found, add remaining content
-			if lastPos < len(remaining) {
-				cleanedParts = append(cleanedParts, remaining[lastPos:])
-			}
-			break
-		}
-
-		// Add content before the tag
-		if earliestStart > lastPos {
-			cleanedParts = append(cleanedParts, remaining[lastPos:earliestStart])
-		}
-
-		// Extract reasoning content
-		reasoningStart := earliestStart + len(matchedTag.start)
-		// For unclosed tags, earliestEnd is already at the end of the string
-		// For closed tags, earliestEnd points to after the closing tag, so we subtract the end tag length
-		var reasoningEnd int
-		if isUnclosed {
-			// Unclosed tag - extract everything to the end
-			reasoningEnd = len(remaining)
-		} else {
-			// Closed tag - exclude the end tag
-			reasoningEnd = earliestEnd - len(matchedTag.end)
-		}
-		if reasoningEnd > reasoningStart {
-			reasoningContent := strings.TrimSpace(remaining[reasoningStart:reasoningEnd])
-			if reasoningContent != "" {
-				reasoningParts = append(reasoningParts, reasoningContent)
-			}
-		}
-
-		// Move past this tag
-		lastPos = earliestEnd
-	}
-
-	// Combine reasoning parts
-	reasoning = strings.Join(reasoningParts, "\n\n")
-	// Combine cleaned content parts
-	cleanedContent = strings.Join(cleanedParts, "")
-
-	return reasoning, cleanedContent
-}
--- a/pkg/functions/reasoning_test.go
+++ b/pkg/functions/reasoning_test.go
@@ -1,261 +0,0 @@
-package functions_test
-
-import (
-	"strings"
-
-	. "github.com/mudler/LocalAI/pkg/functions"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("ExtractReasoning", func() {
-	Context("when content has no reasoning tags", func() {
-		It("should return empty reasoning and original content", func() {
-			content := "This is regular content without any tags."
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(BeEmpty())
-			Expect(cleaned).To(Equal(content))
-		})
-
-		It("should handle empty string", func() {
-			content := ""
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(BeEmpty())
-			Expect(cleaned).To(BeEmpty())
-		})
-
-		It("should handle content with only whitespace", func() {
-			content := "   \n\t  "
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(BeEmpty())
-			Expect(cleaned).To(Equal(content))
-		})
-	})
-
-	Context("when content has <thinking> tags", func() {
-		It("should extract reasoning from single thinking block", func() {
-			content := "Some text <thinking>This is my reasoning</thinking> More text"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("This is my reasoning"))
-			Expect(cleaned).To(Equal("Some text  More text"))
-		})
-
-		It("should extract reasoning and preserve surrounding content", func() {
-			content := "Before <thinking>Reasoning here</thinking> After"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Reasoning here"))
-			Expect(cleaned).To(Equal("Before  After"))
-		})
-
-		It("should handle thinking block at the start", func() {
-			content := "<thinking>Start reasoning</thinking> Regular content"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Start reasoning"))
-			Expect(cleaned).To(Equal(" Regular content"))
-		})
-
-		It("should handle thinking block at the end", func() {
-			content := "Regular content <thinking>End reasoning</thinking>"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("End reasoning"))
-			Expect(cleaned).To(Equal("Regular content "))
-		})
-
-		It("should handle only thinking block", func() {
-			content := "<thinking>Only reasoning</thinking>"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Only reasoning"))
-			Expect(cleaned).To(BeEmpty())
-		})
-
-		It("should trim whitespace from reasoning content", func() {
-			content := "Text <thinking>  \n  Reasoning with spaces  \n  </thinking> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Reasoning with spaces"))
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-	})
-
-	Context("when content has <think> tags", func() {
-		It("should extract reasoning from redacted_reasoning block", func() {
-			content := "Text <think>Redacted reasoning</think> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Redacted reasoning"))
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-
-		It("should handle redacted_reasoning with multiline content", func() {
-			content := "Before <think>Line 1\nLine 2\nLine 3</think> After"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Line 1\nLine 2\nLine 3"))
-			Expect(cleaned).To(Equal("Before  After"))
-		})
-
-		It("should handle redacted_reasoning with complex content", func() {
-			content := "Start <think>Complex reasoning\nwith\nmultiple\nlines</think> End"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Complex reasoning\nwith\nmultiple\nlines"))
-			Expect(cleaned).To(Equal("Start  End"))
-		})
-	})
-
-	Context("when content has multiple reasoning blocks", func() {
-		It("should concatenate multiple thinking blocks with newlines", func() {
-			content := "Text <thinking>First</thinking> Middle <thinking>Second</thinking> End"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("First\n\nSecond"))
-			Expect(cleaned).To(Equal("Text  Middle  End"))
-		})
-
-		It("should handle multiple different tag types", func() {
-			content := "A <thinking>One</thinking> B <think>Two</think> C <think>Three</think> D"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(ContainSubstring("One"))
-			Expect(reasoning).To(ContainSubstring("Two"))
-			Expect(reasoning).To(ContainSubstring("Three"))
-			Expect(cleaned).To(Equal("A  B  C  D"))
-		})
-
-		It("should handle nested tags correctly (extracts first match)", func() {
-			content := "Text <thinking>Outer <think>Inner</think></thinking> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			// Should extract the outer thinking block
-			Expect(reasoning).To(ContainSubstring("Outer"))
-			Expect(reasoning).To(ContainSubstring("Inner"))
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-	})
-
-	Context("when content has unclosed reasoning tags", func() {
-		It("should extract unclosed thinking block", func() {
-			content := "Text <thinking>Unclosed reasoning"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Unclosed reasoning"))
-			Expect(cleaned).To(Equal("Text "))
-		})
-
-		It("should extract unclosed think block", func() {
-			content := "Before <think>Incomplete"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Incomplete"))
-			Expect(cleaned).To(Equal("Before "))
-		})
-
-		It("should extract unclosed redacted_reasoning block", func() {
-			content := "Start <think>Partial reasoning content"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Partial reasoning content"))
-			Expect(cleaned).To(Equal("Start "))
-		})
-
-		It("should handle unclosed tag at the end", func() {
-			content := "Regular content <thinking>Unclosed at end"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Unclosed at end"))
-			Expect(cleaned).To(Equal("Regular content "))
-		})
-	})
-
-	Context("when content has empty reasoning blocks", func() {
-		It("should ignore empty thinking block", func() {
-			content := "Text <thinking></thinking> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(BeEmpty())
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-
-		It("should ignore thinking block with only whitespace", func() {
-			content := "Text <thinking>   \n\t  </thinking> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(BeEmpty())
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-	})
-
-	Context("when content has reasoning tags with special characters", func() {
-		It("should handle reasoning with newlines", func() {
-			content := "Before <thinking>Line 1\nLine 2\nLine 3</thinking> After"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Line 1\nLine 2\nLine 3"))
-			Expect(cleaned).To(Equal("Before  After"))
-		})
-
-		It("should handle reasoning with code blocks", func() {
-			content := "Text <thinking>Reasoning with ```code``` blocks</thinking> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Reasoning with ```code``` blocks"))
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-
-		It("should handle reasoning with JSON", func() {
-			content := "Before <think>{\"key\": \"value\"}</think> After"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("{\"key\": \"value\"}"))
-			Expect(cleaned).To(Equal("Before  After"))
-		})
-
-		It("should handle reasoning with HTML-like content", func() {
-			content := "Text <thinking>Reasoning with <tags> inside</thinking> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Reasoning with <tags> inside"))
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-	})
-
-	Context("when content has reasoning mixed with regular content", func() {
-		It("should preserve content order correctly", func() {
-			content := "Start <thinking>Reasoning</thinking> Middle <think>More reasoning</think> End"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(ContainSubstring("Reasoning"))
-			Expect(reasoning).To(ContainSubstring("More reasoning"))
-			Expect(cleaned).To(Equal("Start  Middle  End"))
-		})
-
-		It("should handle reasoning in the middle of a sentence", func() {
-			content := "This is a <thinking>reasoning</thinking> sentence."
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("reasoning"))
-			Expect(cleaned).To(Equal("This is a  sentence."))
-		})
-	})
-
-	Context("edge cases", func() {
-		It("should handle content with only opening tag", func() {
-			content := "<thinking>"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(BeEmpty())
-			Expect(cleaned).To(Equal(""))
-		})
-
-		It("should handle content with only closing tag", func() {
-			content := "</thinking>"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(BeEmpty())
-			Expect(cleaned).To(Equal("</thinking>"))
-		})
-
-		It("should handle mismatched tags", func() {
-			content := "<thinking>Content</think>"
-			reasoning, cleaned := ExtractReasoning(content)
-			// Should extract unclosed thinking block
-			Expect(reasoning).To(ContainSubstring("Content"))
-			Expect(cleaned).To(Equal(""))
-		})
-
-		It("should handle very long reasoning content", func() {
-			longReasoning := strings.Repeat("This is reasoning content. ", 100)
-			content := "Text <thinking>" + longReasoning + "</thinking> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			// TrimSpace is applied, so we need to account for that
-			Expect(reasoning).To(Equal(strings.TrimSpace(longReasoning)))
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-
-		It("should handle reasoning with unicode characters", func() {
-			content := "Text <thinking>Reasoning with 中文 and emoji 🧠</thinking> More"
-			reasoning, cleaned := ExtractReasoning(content)
-			Expect(reasoning).To(Equal("Reasoning with 中文 and emoji 🧠"))
-			Expect(cleaned).To(Equal("Text  More"))
-		})
-	})
-})
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@@ -24,6 +24,8 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 		return fmt.Errorf("model %s not found", s)
 	}

+	defer delete(ml.models, s)
+
 	retries := 1
 	for model.GRPC(false, ml.wd).IsBusy() {
 		xlog.Debug("Model busy. Waiting.", "model", s)
@@ -46,7 +48,6 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 	if process == nil {
 		xlog.Error("No process", "model", s)
 		// Nothing to do as there is no process
-		delete(ml.models, s)
 		return nil
 	}

@@ -55,10 +56,6 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 		xlog.Error("(deleteProcess) error while deleting process", "error", err, "model", s)
 	}

-	if err == nil {
-		delete(ml.models, s)
-	}
-
 	return err
 }

--- a/pkg/system/capabilities.go
+++ b/pkg/system/capabilities.go
@@ -12,17 +12,15 @@ import (
 )

 const (
-	// Public constants - used by tests and external packages
-	Nvidia = "nvidia"
-	AMD    = "amd"
-	Intel  = "intel"
-
-	// Private constants - only used within this package
 	defaultCapability = "default"
 	nvidiaL4T         = "nvidia-l4t"
 	darwinX86         = "darwin-x86"
 	metal             = "metal"
-	vulkan            = "vulkan"
+	nvidia            = "nvidia"
+
+	amd    = "amd"
+	intel  = "intel"
+	vulkan = "vulkan"

 	nvidiaCuda13    = "nvidia-cuda-13"
 	nvidiaCuda12    = "nvidia-cuda-12"
@@ -32,16 +30,6 @@ const (
 	capabilityEnv        = "LOCALAI_FORCE_META_BACKEND_CAPABILITY"
 	capabilityRunFileEnv = "LOCALAI_FORCE_META_BACKEND_CAPABILITY_RUN_FILE"
 	defaultRunFile       = "/run/localai/capability"
-
-	// Backend detection tokens (private)
-	backendTokenDarwin = "darwin"
-	backendTokenMLX    = "mlx"
-	backendTokenMetal  = "metal"
-	backendTokenL4T    = "l4t"
-	backendTokenCUDA   = "cuda"
-	backendTokenROCM   = "rocm"
-	backendTokenHIP    = "hip"
-	backendTokenSYCL   = "sycl"
 )

 var (
@@ -108,7 +96,7 @@ func (s *SystemState) getSystemCapabilities() string {

 	// If arm64 on linux and a nvidia gpu is detected, we will return nvidia-l4t
 	if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" {
-		if s.GPUVendor == Nvidia {
+		if s.GPUVendor == nvidia {
 			xlog.Info("Using nvidia-l4t capability (arm64 on linux)", "env", capabilityEnv)
 			if cuda13DirExists {
 				return nvidiaL4TCuda13
@@ -143,6 +131,7 @@ func (s *SystemState) getSystemCapabilities() string {
 	return s.GPUVendor
 }

+
 // BackendPreferenceTokens returns a list of substrings that represent the preferred
 // backend implementation order for the current system capability. Callers can use
 // these tokens to select the most appropriate concrete backend among multiple
@@ -150,76 +139,19 @@ func (s *SystemState) getSystemCapabilities() string {
 func (s *SystemState) BackendPreferenceTokens() []string {
 	capStr := strings.ToLower(s.getSystemCapabilities())
 	switch {
-	case strings.HasPrefix(capStr, Nvidia):
-		return []string{backendTokenCUDA, vulkan, "cpu"}
-	case strings.HasPrefix(capStr, AMD):
-		return []string{backendTokenROCM, backendTokenHIP, vulkan, "cpu"}
-	case strings.HasPrefix(capStr, Intel):
-		return []string{backendTokenSYCL, Intel, "cpu"}
+	case strings.HasPrefix(capStr, nvidia):
+		return []string{"cuda", "vulkan", "cpu"}
+	case strings.HasPrefix(capStr, amd):
+		return []string{"rocm", "hip", "vulkan", "cpu"}
+	case strings.HasPrefix(capStr, intel):
+		return []string{"sycl", intel, "cpu"}
 	case strings.HasPrefix(capStr, metal):
-		return []string{backendTokenMetal, "cpu"}
+		return []string{"metal", "cpu"}
 	case strings.HasPrefix(capStr, darwinX86):
 		return []string{"darwin-x86", "cpu"}
 	case strings.HasPrefix(capStr, vulkan):
-		return []string{vulkan, "cpu"}
+		return []string{"vulkan", "cpu"}
 	default:
 		return []string{"cpu"}
 	}
 }
-
-// DetectedCapability returns the detected system capability string.
-// This can be used by the UI to display what capability was detected.
-func (s *SystemState) DetectedCapability() string {
-	return s.getSystemCapabilities()
-}
-
-// IsBackendCompatible checks if a backend (identified by name and URI) is compatible
-// with the current system capability. This function uses getSystemCapabilities to ensure
-// consistency with capability detection (including VRAM checks, environment overrides, etc.).
-func (s *SystemState) IsBackendCompatible(name, uri string) bool {
-	combined := strings.ToLower(name + " " + uri)
-	capability := s.getSystemCapabilities()
-
-	// Check for darwin/macOS-specific backends (mlx, metal, darwin)
-	isDarwinBackend := strings.Contains(combined, backendTokenDarwin) ||
-		strings.Contains(combined, backendTokenMLX) ||
-		strings.Contains(combined, backendTokenMetal)
-	if isDarwinBackend {
-		// Darwin backends require the system to be running on darwin with metal or darwin-x86 capability
-		return capability == metal || capability == darwinX86
-	}
-
-	// Check for NVIDIA L4T-specific backends (arm64 Linux with NVIDIA GPU)
-	// This must be checked before the general NVIDIA check as L4T backends
-	// may also contain "cuda" or "nvidia" in their names
-	isL4TBackend := strings.Contains(combined, backendTokenL4T)
-	if isL4TBackend {
-		return strings.HasPrefix(capability, nvidiaL4T)
-	}
-
-	// Check for NVIDIA/CUDA-specific backends (non-L4T)
-	isNvidiaBackend := strings.Contains(combined, backendTokenCUDA) ||
-		strings.Contains(combined, Nvidia)
-	if isNvidiaBackend {
-		// NVIDIA backends are compatible with nvidia, nvidia-cuda-12, nvidia-cuda-13, and l4t capabilities
-		return strings.HasPrefix(capability, Nvidia)
-	}
-
-	// Check for AMD/ROCm-specific backends
-	isAMDBackend := strings.Contains(combined, backendTokenROCM) ||
-		strings.Contains(combined, backendTokenHIP) ||
-		strings.Contains(combined, AMD)
-	if isAMDBackend {
-		return capability == AMD
-	}
-
-	// Check for Intel/SYCL-specific backends
-	isIntelBackend := strings.Contains(combined, backendTokenSYCL) ||
-		strings.Contains(combined, Intel)
-	if isIntelBackend {
-		return capability == Intel
-	}
-
-	// CPU backends are always compatible
-	return true
-}
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1198,30 +1198,6 @@ const docTemplate = `{
                }
            }
        },
-        "/v1/messages": {
-            "post": {
-                "summary": "Generate a message response for the given messages and model.",
-                "parameters": [
-                    {
-                        "description": "query params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.AnthropicRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "Response",
-                        "schema": {
-                            "$ref": "#/definitions/schema.AnthropicResponse"
-                        }
-                    }
-                }
-            }
-        },
        "/v1/models": {
            "get": {
                "summary": "List and describe the various models available in the API.",
@@ -1763,169 +1739,6 @@ const docTemplate = `{
                }
            }
        },
-        "schema.AnthropicContentBlock": {
-            "type": "object",
-            "properties": {
-                "content": {},
-                "id": {
-                    "type": "string"
-                },
-                "input": {
-                    "type": "object",
-                    "additionalProperties": true
-                },
-                "is_error": {
-                    "type": "boolean"
-                },
-                "name": {
-                    "type": "string"
-                },
-                "source": {
-                    "$ref": "#/definitions/schema.AnthropicImageSource"
-                },
-                "text": {
-                    "type": "string"
-                },
-                "tool_use_id": {
-                    "type": "string"
-                },
-                "type": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.AnthropicImageSource": {
-            "type": "object",
-            "properties": {
-                "data": {
-                    "type": "string"
-                },
-                "media_type": {
-                    "type": "string"
-                },
-                "type": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.AnthropicMessage": {
-            "type": "object",
-            "properties": {
-                "content": {},
-                "role": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.AnthropicRequest": {
-            "type": "object",
-            "properties": {
-                "max_tokens": {
-                    "type": "integer"
-                },
-                "messages": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.AnthropicMessage"
-                    }
-                },
-                "metadata": {
-                    "type": "object",
-                    "additionalProperties": {
-                        "type": "string"
-                    }
-                },
-                "model": {
-                    "type": "string"
-                },
-                "stop_sequences": {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "stream": {
-                    "type": "boolean"
-                },
-                "system": {
-                    "type": "string"
-                },
-                "temperature": {
-                    "type": "number"
-                },
-                "tool_choice": {},
-                "tools": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.AnthropicTool"
-                    }
-                },
-                "top_k": {
-                    "type": "integer"
-                },
-                "top_p": {
-                    "type": "number"
-                }
-            }
-        },
-        "schema.AnthropicResponse": {
-            "type": "object",
-            "properties": {
-                "content": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.AnthropicContentBlock"
-                    }
-                },
-                "id": {
-                    "type": "string"
-                },
-                "model": {
-                    "type": "string"
-                },
-                "role": {
-                    "type": "string"
-                },
-                "stop_reason": {
-                    "type": "string"
-                },
-                "stop_sequence": {
-                    "type": "string"
-                },
-                "type": {
-                    "type": "string"
-                },
-                "usage": {
-                    "$ref": "#/definitions/schema.AnthropicUsage"
-                }
-            }
-        },
-        "schema.AnthropicTool": {
-            "type": "object",
-            "properties": {
-                "description": {
-                    "type": "string"
-                },
-                "input_schema": {
-                    "type": "object",
-                    "additionalProperties": true
-                },
-                "name": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.AnthropicUsage": {
-            "type": "object",
-            "properties": {
-                "input_tokens": {
-                    "type": "integer"
-                },
-                "output_tokens": {
-                    "type": "integer"
-                }
-            }
-        },
        "schema.BackendMonitorRequest": {
            "type": "object",
            "properties": {
@@ -2416,10 +2229,6 @@ const docTemplate = `{
                    "description": "The message name (used for tools calls)",
                    "type": "string"
                },
-                "reasoning": {
-                    "description": "Reasoning content extracted from \u003cthinking\u003e...\u003c/thinking\u003e tags",
-                    "type": "string"
-                },
                "role": {
                    "description": "The message role",
                    "type": "string"
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1191,30 +1191,6 @@
                }
            }
        },
-        "/v1/messages": {
-            "post": {
-                "summary": "Generate a message response for the given messages and model.",
-                "parameters": [
-                    {
-                        "description": "query params",
-                        "name": "request",
-                        "in": "body",
-                        "required": true,
-                        "schema": {
-                            "$ref": "#/definitions/schema.AnthropicRequest"
-                        }
-                    }
-                ],
-                "responses": {
-                    "200": {
-                        "description": "Response",
-                        "schema": {
-                            "$ref": "#/definitions/schema.AnthropicResponse"
-                        }
-                    }
-                }
-            }
-        },
        "/v1/models": {
            "get": {
                "summary": "List and describe the various models available in the API.",
@@ -1756,169 +1732,6 @@
                }
            }
        },
-        "schema.AnthropicContentBlock": {
-            "type": "object",
-            "properties": {
-                "content": {},
-                "id": {
-                    "type": "string"
-                },
-                "input": {
-                    "type": "object",
-                    "additionalProperties": true
-                },
-                "is_error": {
-                    "type": "boolean"
-                },
-                "name": {
-                    "type": "string"
-                },
-                "source": {
-                    "$ref": "#/definitions/schema.AnthropicImageSource"
-                },
-                "text": {
-                    "type": "string"
-                },
-                "tool_use_id": {
-                    "type": "string"
-                },
-                "type": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.AnthropicImageSource": {
-            "type": "object",
-            "properties": {
-                "data": {
-                    "type": "string"
-                },
-                "media_type": {
-                    "type": "string"
-                },
-                "type": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.AnthropicMessage": {
-            "type": "object",
-            "properties": {
-                "content": {},
-                "role": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.AnthropicRequest": {
-            "type": "object",
-            "properties": {
-                "max_tokens": {
-                    "type": "integer"
-                },
-                "messages": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.AnthropicMessage"
-                    }
-                },
-                "metadata": {
-                    "type": "object",
-                    "additionalProperties": {
-                        "type": "string"
-                    }
-                },
-                "model": {
-                    "type": "string"
-                },
-                "stop_sequences": {
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "stream": {
-                    "type": "boolean"
-                },
-                "system": {
-                    "type": "string"
-                },
-                "temperature": {
-                    "type": "number"
-                },
-                "tool_choice": {},
-                "tools": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.AnthropicTool"
-                    }
-                },
-                "top_k": {
-                    "type": "integer"
-                },
-                "top_p": {
-                    "type": "number"
-                }
-            }
-        },
-        "schema.AnthropicResponse": {
-            "type": "object",
-            "properties": {
-                "content": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.AnthropicContentBlock"
-                    }
-                },
-                "id": {
-                    "type": "string"
-                },
-                "model": {
-                    "type": "string"
-                },
-                "role": {
-                    "type": "string"
-                },
-                "stop_reason": {
-                    "type": "string"
-                },
-                "stop_sequence": {
-                    "type": "string"
-                },
-                "type": {
-                    "type": "string"
-                },
-                "usage": {
-                    "$ref": "#/definitions/schema.AnthropicUsage"
-                }
-            }
-        },
-        "schema.AnthropicTool": {
-            "type": "object",
-            "properties": {
-                "description": {
-                    "type": "string"
-                },
-                "input_schema": {
-                    "type": "object",
-                    "additionalProperties": true
-                },
-                "name": {
-                    "type": "string"
-                }
-            }
-        },
-        "schema.AnthropicUsage": {
-            "type": "object",
-            "properties": {
-                "input_tokens": {
-                    "type": "integer"
-                },
-                "output_tokens": {
-                    "type": "integer"
-                }
-            }
-        },
        "schema.BackendMonitorRequest": {
            "type": "object",
            "properties": {
@@ -2409,10 +2222,6 @@
                    "description": "The message name (used for tools calls)",
                    "type": "string"
                },
-                "reasoning": {
-                    "description": "Reasoning content extracted from \u003cthinking\u003e...\u003c/thinking\u003e tags",
-                    "type": "string"
-                },
                "role": {
                    "description": "The message role",
                    "type": "string"
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -239,114 +239,6 @@ definitions:
      start:
        type: number
    type: object
-  schema.AnthropicContentBlock:
-    properties:
-      content: {}
-      id:
-        type: string
-      input:
-        additionalProperties: true
-        type: object
-      is_error:
-        type: boolean
-      name:
-        type: string
-      source:
-        $ref: '#/definitions/schema.AnthropicImageSource'
-      text:
-        type: string
-      tool_use_id:
-        type: string
-      type:
-        type: string
-    type: object
-  schema.AnthropicImageSource:
-    properties:
-      data:
-        type: string
-      media_type:
-        type: string
-      type:
-        type: string
-    type: object
-  schema.AnthropicMessage:
-    properties:
-      content: {}
-      role:
-        type: string
-    type: object
-  schema.AnthropicRequest:
-    properties:
-      max_tokens:
-        type: integer
-      messages:
-        items:
-          $ref: '#/definitions/schema.AnthropicMessage'
-        type: array
-      metadata:
-        additionalProperties:
-          type: string
-        type: object
-      model:
-        type: string
-      stop_sequences:
-        items:
-          type: string
-        type: array
-      stream:
-        type: boolean
-      system:
-        type: string
-      temperature:
-        type: number
-      tool_choice: {}
-      tools:
-        items:
-          $ref: '#/definitions/schema.AnthropicTool'
-        type: array
-      top_k:
-        type: integer
-      top_p:
-        type: number
-    type: object
-  schema.AnthropicResponse:
-    properties:
-      content:
-        items:
-          $ref: '#/definitions/schema.AnthropicContentBlock'
-        type: array
-      id:
-        type: string
-      model:
-        type: string
-      role:
-        type: string
-      stop_reason:
-        type: string
-      stop_sequence:
-        type: string
-      type:
-        type: string
-      usage:
-        $ref: '#/definitions/schema.AnthropicUsage'
-    type: object
-  schema.AnthropicTool:
-    properties:
-      description:
-        type: string
-      input_schema:
-        additionalProperties: true
-        type: object
-      name:
-        type: string
-    type: object
-  schema.AnthropicUsage:
-    properties:
-      input_tokens:
-        type: integer
-      output_tokens:
-        type: integer
-    type: object
  schema.BackendMonitorRequest:
    properties:
      model:
@@ -681,9 +573,6 @@ definitions:
      name:
        description: The message name (used for tools calls)
        type: string
-      reasoning:
-        description: Reasoning content extracted from <thinking>...</thinking> tags
-        type: string
      role:
        description: The message role
        type: string
@@ -1924,21 +1813,6 @@ paths:
          schema:
            $ref: '#/definitions/schema.OpenAIResponse'
      summary: Stream MCP chat completions with reasoning, tool calls, and results
-  /v1/messages:
-    post:
-      parameters:
-      - description: query params
-        in: body
-        name: request
-        required: true
-        schema:
-          $ref: '#/definitions/schema.AnthropicRequest'
-      responses:
-        "200":
-          description: Response
-          schema:
-            $ref: '#/definitions/schema.AnthropicResponse'
-      summary: Generate a message response for the given messages and model.
  /v1/models:
    get:
      responses:
--- a/tests/e2e/e2e_anthropic_test.go
+++ b/tests/e2e/e2e_anthropic_test.go
@@ -1,375 +0,0 @@
-package e2e_test
-
-import (
-	"context"
-
-	"github.com/anthropics/anthropic-sdk-go"
-	"github.com/anthropics/anthropic-sdk-go/option"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("Anthropic API E2E test", func() {
-	var client anthropic.Client
-
-	Context("API with Anthropic SDK", func() {
-		BeforeEach(func() {
-			// Create Anthropic client pointing to LocalAI
-			client = anthropic.NewClient(
-				option.WithBaseURL(localAIURL),
-				option.WithAPIKey("test-api-key"), // LocalAI doesn't require a real API key
-			)
-
-			// Wait for API to be ready by attempting a simple request
-			Eventually(func() error {
-				_, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 10,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("Hi")),
-					},
-				})
-				return err
-			}, "2m").ShouldNot(HaveOccurred())
-		})
-
-		Context("Non-streaming responses", func() {
-			It("generates a response for a simple message", func() {
-				message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("How much is 2+2? Reply with just the number.")),
-					},
-				})
-				Expect(err).ToNot(HaveOccurred())
-				Expect(message.Content).ToNot(BeEmpty())
-				// Role is a constant type that defaults to "assistant"
-				Expect(string(message.Role)).To(Equal("assistant"))
-				Expect(message.StopReason).To(Equal(anthropic.MessageStopReasonEndTurn))
-				Expect(string(message.Type)).To(Equal("message"))
-
-				// Check that content contains text block with expected answer
-				Expect(len(message.Content)).To(BeNumerically(">=", 1))
-				textBlock := message.Content[0]
-				Expect(string(textBlock.Type)).To(Equal("text"))
-				Expect(textBlock.Text).To(Or(ContainSubstring("4"), ContainSubstring("four")))
-			})
-
-			It("handles system prompts", func() {
-				message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					System: []anthropic.TextBlockParam{
-						{Text: "You are a helpful assistant. Always respond in uppercase letters."},
-					},
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("Say hello")),
-					},
-				})
-				Expect(err).ToNot(HaveOccurred())
-				Expect(message.Content).ToNot(BeEmpty())
-				Expect(len(message.Content)).To(BeNumerically(">=", 1))
-			})
-
-			It("returns usage information", func() {
-				message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 100,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("Hello")),
-					},
-				})
-				Expect(err).ToNot(HaveOccurred())
-				Expect(message.Usage.InputTokens).To(BeNumerically(">", 0))
-				Expect(message.Usage.OutputTokens).To(BeNumerically(">", 0))
-			})
-		})
-
-		Context("Streaming responses", func() {
-			It("streams tokens for a simple message", func() {
-				stream := client.Messages.NewStreaming(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("Count from 1 to 5")),
-					},
-				})
-
-				message := anthropic.Message{}
-				eventCount := 0
-				hasContentDelta := false
-
-				for stream.Next() {
-					event := stream.Current()
-					err := message.Accumulate(event)
-					Expect(err).ToNot(HaveOccurred())
-					eventCount++
-
-					// Check for content block delta events
-					switch event.AsAny().(type) {
-					case anthropic.ContentBlockDeltaEvent:
-						hasContentDelta = true
-					}
-				}
-
-				Expect(stream.Err()).ToNot(HaveOccurred())
-				Expect(eventCount).To(BeNumerically(">", 0))
-				Expect(hasContentDelta).To(BeTrue())
-
-				// Check accumulated message
-				Expect(message.Content).ToNot(BeEmpty())
-				// Role is a constant type that defaults to "assistant"
-				Expect(string(message.Role)).To(Equal("assistant"))
-			})
-
-			It("streams with system prompt", func() {
-				stream := client.Messages.NewStreaming(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					System: []anthropic.TextBlockParam{
-						{Text: "You are a helpful assistant."},
-					},
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("Say hello")),
-					},
-				})
-
-				message := anthropic.Message{}
-				for stream.Next() {
-					event := stream.Current()
-					err := message.Accumulate(event)
-					Expect(err).ToNot(HaveOccurred())
-				}
-
-				Expect(stream.Err()).ToNot(HaveOccurred())
-				Expect(message.Content).ToNot(BeEmpty())
-			})
-		})
-
-		Context("Tool calling", func() {
-			It("handles tool calls in non-streaming mode", func() {
-				message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("What's the weather like in San Francisco?")),
-					},
-					Tools: []anthropic.ToolParam{
-						{
-							Name:        "get_weather",
-							Description: anthropic.F("Get the current weather in a given location"),
-							InputSchema: anthropic.F(map[string]interface{}{
-								"type": "object",
-								"properties": map[string]interface{}{
-									"location": map[string]interface{}{
-										"type":        "string",
-										"description": "The city and state, e.g. San Francisco, CA",
-									},
-								},
-								"required": []string{"location"},
-							}),
-						},
-					},
-				})
-
-				Expect(err).ToNot(HaveOccurred())
-				Expect(message.Content).ToNot(BeEmpty())
-
-				// The model must use tools - find the tool use in the response
-				hasToolUse := false
-				for _, block := range message.Content {
-					if block.Type == anthropic.ContentBlockTypeToolUse {
-						hasToolUse = true
-						Expect(block.Name).To(Equal("get_weather"))
-						Expect(block.ID).ToNot(BeEmpty())
-						// Verify that input contains location
-						inputMap, ok := block.Input.(map[string]interface{})
-						Expect(ok).To(BeTrue())
-						_, hasLocation := inputMap["location"]
-						Expect(hasLocation).To(BeTrue())
-					}
-				}
-
-				// Model must have called the tool
-				Expect(hasToolUse).To(BeTrue(), "Model should have called the get_weather tool")
-				Expect(message.StopReason).To(Equal(anthropic.MessageStopReasonToolUse))
-			})
-
-			It("handles tool_choice parameter", func() {
-				message, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("Tell me about the weather")),
-					},
-					Tools: []anthropic.ToolParam{
-						{
-							Name:        "get_weather",
-							Description: anthropic.F("Get the current weather"),
-							InputSchema: anthropic.F(map[string]interface{}{
-								"type": "object",
-								"properties": map[string]interface{}{
-									"location": map[string]interface{}{
-										"type": "string",
-									},
-								},
-							}),
-						},
-					},
-					ToolChoice: anthropic.F[anthropic.ToolChoiceUnionParam](
-						anthropic.ToolChoiceAutoParam{
-							Type: anthropic.F(anthropic.ToolChoiceAutoTypeAuto),
-						},
-					),
-				})
-
-				Expect(err).ToNot(HaveOccurred())
-				Expect(message.Content).ToNot(BeEmpty())
-			})
-
-			It("handles tool results in messages", func() {
-				// First, make a request that should trigger a tool call
-				firstMessage, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("What's the weather in SF?")),
-					},
-					Tools: []anthropic.ToolParam{
-						{
-							Name:        "get_weather",
-							Description: anthropic.F("Get weather"),
-							InputSchema: anthropic.F(map[string]interface{}{
-								"type": "object",
-								"properties": map[string]interface{}{
-									"location": map[string]interface{}{"type": "string"},
-								},
-							}),
-						},
-					},
-				})
-
-				Expect(err).ToNot(HaveOccurred())
-
-				// Find the tool use block - model must call the tool
-				var toolUseID string
-				var toolUseName string
-				for _, block := range firstMessage.Content {
-					if block.Type == anthropic.ContentBlockTypeToolUse {
-						toolUseID = block.ID
-						toolUseName = block.Name
-						break
-					}
-				}
-
-				// Model must have called the tool
-				Expect(toolUseID).ToNot(BeEmpty(), "Model should have called the get_weather tool")
-
-				// Send back a tool result and verify it's handled correctly
-				secondMessage, err := client.Messages.New(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("What's the weather in SF?")),
-						anthropic.NewAssistantMessage(firstMessage.Content...),
-						anthropic.NewUserMessage(
-							anthropic.NewToolResultBlock(toolUseID, "Sunny, 72°F", false),
-						),
-					},
-					Tools: []anthropic.ToolParam{
-						{
-							Name:        toolUseName,
-							Description: anthropic.F("Get weather"),
-							InputSchema: anthropic.F(map[string]interface{}{
-								"type": "object",
-								"properties": map[string]interface{}{
-									"location": map[string]interface{}{"type": "string"},
-								},
-							}),
-						},
-					},
-				})
-
-				Expect(err).ToNot(HaveOccurred())
-				Expect(secondMessage.Content).ToNot(BeEmpty())
-			})
-
-			It("handles tool calls in streaming mode", func() {
-				stream := client.Messages.NewStreaming(context.TODO(), anthropic.MessageNewParams{
-					Model:     "gpt-4",
-					MaxTokens: 1024,
-					Messages: []anthropic.MessageParam{
-						anthropic.NewUserMessage(anthropic.NewTextBlock("What's the weather like in San Francisco?")),
-					},
-					Tools: []anthropic.ToolParam{
-						{
-							Name:        "get_weather",
-							Description: anthropic.F("Get the current weather in a given location"),
-							InputSchema: anthropic.F(map[string]interface{}{
-								"type": "object",
-								"properties": map[string]interface{}{
-									"location": map[string]interface{}{
-										"type":        "string",
-										"description": "The city and state, e.g. San Francisco, CA",
-									},
-								},
-								"required": []string{"location"},
-							}),
-						},
-					},
-				})
-
-				message := anthropic.Message{}
-				eventCount := 0
-				hasToolUseBlock := false
-				hasContentBlockStart := false
-				hasContentBlockDelta := false
-				hasContentBlockStop := false
-
-				for stream.Next() {
-					event := stream.Current()
-					err := message.Accumulate(event)
-					Expect(err).ToNot(HaveOccurred())
-					eventCount++
-
-					// Check for different event types related to tool use
-					switch e := event.AsAny().(type) {
-					case anthropic.ContentBlockStartEvent:
-						hasContentBlockStart = true
-						if e.ContentBlock.Type == anthropic.ContentBlockTypeToolUse {
-							hasToolUseBlock = true
-						}
-					case anthropic.ContentBlockDeltaEvent:
-						hasContentBlockDelta = true
-					case anthropic.ContentBlockStopEvent:
-						hasContentBlockStop = true
-					}
-				}
-
-				Expect(stream.Err()).ToNot(HaveOccurred())
-				Expect(eventCount).To(BeNumerically(">", 0))
-
-				// Verify streaming events were emitted
-				Expect(hasContentBlockStart).To(BeTrue(), "Should have content_block_start event")
-				Expect(hasContentBlockDelta).To(BeTrue(), "Should have content_block_delta event")
-				Expect(hasContentBlockStop).To(BeTrue(), "Should have content_block_stop event")
-
-				// Check accumulated message has tool use
-				Expect(message.Content).ToNot(BeEmpty())
-				
-				// Model must have called the tool
-				foundToolUse := false
-				for _, block := range message.Content {
-					if block.Type == anthropic.ContentBlockTypeToolUse {
-						foundToolUse = true
-						Expect(block.Name).To(Equal("get_weather"))
-						Expect(block.ID).ToNot(BeEmpty())
-					}
-				}
-				Expect(foundToolUse).To(BeTrue(), "Model should have called the get_weather tool in streaming mode")
-				Expect(message.StopReason).To(Equal(anthropic.MessageStopReasonToolUse))
-			})
-		})
-	})
-})